#!/usr/bin/env python3
#
# linearize-data.py: Construct a linear, no-fork version of the chain.
#
# Copyright (c) 2013-2022 The Bitcoin Core developers
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
#

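# Usage: linearize-data.py CONFIG-FILE
#
# A minimal illustrative config. Every key below is parsed in __main__; all
# are optional except the output location ('output' or 'output_file').
# netmagic and genesis match the mainnet defaults applied below; the paths
# are examples only:
#
#   netmagic=f9beb4d9
#   genesis=000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f
#   input=/path/to/blocks
#   hashlist=hashlist.txt
#   output_file=bootstrap.dat
#   max_out_sz=1000000000
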
import struct
import re
import os
import os.path
import sys
import hashlib
import datetime
import time
import glob
from collections import namedtuple

settings = {}

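# Double-SHA256 the 80-byte block header and return the hash as a hex string
# in the byte-reversed (RPC/display) order used by the hash list. Applied to
# the mainnet genesis header, this yields the 'genesis' default set below.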
def calc_hash_str(blk_hdr):
    blk_hdr_hash = hashlib.sha256(hashlib.sha256(blk_hdr).digest()).digest()
    return blk_hdr_hash[::-1].hex()

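# Read nTime from bytes 68:72 of the header (after the 4-byte version and the
# two 32-byte prev-block and merkle-root hashes) and return the first-of-month
# datetime plus the raw timestamp; used for the optional monthly file split.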
def get_blk_dt(blk_hdr):
    members = struct.unpack("<I", blk_hdr[68:68+4])
    nTime = members[0]
    dt = datetime.datetime.fromtimestamp(nTime)
    dt_ym = datetime.datetime(dt.year, dt.month, 1)
    return (dt_ym, nTime)

# When getting the list of block hashes, undo any byte reversals.
def get_block_hashes(settings):
    blkindex = []
    with open(settings['hashlist'], "r", encoding="utf8") as f:
        for line in f:
            line = line.rstrip()
            if settings['rev_hash_bytes'] == 'true':
                line = bytes.fromhex(line)[::-1].hex()
            blkindex.append(line)

    print("Read " + str(len(blkindex)) + " hashes")

    return blkindex

# The block map shouldn't give or receive byte-reversed hashes.
def mkblockmap(blkindex):
    blkmap = {}
    for height, block_hash in enumerate(blkindex):
        blkmap[block_hash] = height
    return blkmap

# Get the first block file ID that exists in the input block file
# directory. For a pruned node this is not necessarily 0.
def getFirstBlockFileId(block_dir_path):
    # Block files are named 'blkNNNNN.dat'; build a glob pattern matching
    # that form and collect all matches.
    blkFilePattern = os.path.join(block_dir_path, "blk[0-9][0-9][0-9][0-9][0-9].dat")
    blkFnList = glob.glob(blkFilePattern)

    if len(blkFnList) == 0:
        print("blocks not pruned - starting at 0")
        return 0
    # The lexicographic minimum is the first block file name, since the
    # numeric field is zero-padded to a fixed width.
    firstBlkFilePath = min(blkFnList)
    firstBlkFn = os.path.basename(firstBlkFilePath)

    # The name has the form 'blkNNNNN.dat', so the ID is the five digits
    # at indices 3:8.
    blkId = int(firstBlkFn[3:8])
    return blkId

# Block header and extent on disk
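# fn/offset/size locate the block payload (the bytes after the 80-byte
# header) within input file 'fn'; inhdr is the 8-byte magic+length record
# and blkhdr the header itself, kept so the block can be re-emitted whole.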
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])

class BlockDataCopier:
    def __init__(self, settings, blkindex, blkmap):
        self.settings = settings
        self.blkindex = blkindex
        self.blkmap = blkmap

        # Get first occurring block file id - for pruned nodes this
        # will not necessarily be 0
        self.inFn = getFirstBlockFileId(self.settings['input'])
        self.inF = None
        self.outFn = 0
        self.outsz = 0
        self.outF = None
        self.outFname = None
        self.blkCountIn = 0
        self.blkCountOut = 0

        self.lastDate = datetime.datetime(2000, 1, 1)
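        # High-water block timestamp, seeded ten years (315360000 seconds)
        # before an arbitrary 2014 date; raised to the newest block time seen
        # and used below when stamping output files via os.utime().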
        self.highTS = 1408893517 - 315360000
        self.timestampSplit = False
        self.fileOutput = True
        self.setFileTime = False
        self.maxOutSz = settings['max_out_sz']
        if 'output' in settings:
            self.fileOutput = False
        if settings['file_timestamp'] != 0:
            self.setFileTime = True
        if settings['split_timestamp'] != 0:
            self.timestampSplit = True
        # Extents and cache for out-of-order blocks
        self.blockExtents = {}
        self.outOfOrderData = {}
        self.outOfOrderSize = 0  # running total size for items in outOfOrderData

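    # Append one block (8-byte magic+length record, 80-byte header, payload)
    # to the current output file, rotating to a new file when the size cap is
    # exceeded or, if split_timestamp is set, when a new month begins.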
    def writeBlock(self, inhdr, blk_hdr, rawblock):
        blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
        if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
            self.outF.close()
            if self.setFileTime:
                os.utime(self.outFname, (int(time.time()), self.highTS))
            self.outF = None
            self.outFname = None
            self.outFn = self.outFn + 1
            self.outsz = 0

        (blkDate, blkTS) = get_blk_dt(blk_hdr)
        if self.timestampSplit and (blkDate > self.lastDate):
            print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str)
            self.lastDate = blkDate
            if self.outF:
                self.outF.close()
                if self.setFileTime:
                    os.utime(self.outFname, (int(time.time()), self.highTS))
                self.outF = None
                self.outFname = None
                self.outFn = self.outFn + 1
                self.outsz = 0

        if not self.outF:
            if self.fileOutput:
                self.outFname = self.settings['output_file']
            else:
                self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn)
            print("Output file " + self.outFname)
            self.outF = open(self.outFname, "wb")

        self.outF.write(inhdr)
        self.outF.write(blk_hdr)
        self.outF.write(rawblock)
        self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)

        self.blkCountOut = self.blkCountOut + 1
        if blkTS > self.highTS:
            self.highTS = blkTS

        if (self.blkCountOut % 1000) == 0:
            print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
                    (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))

    def inFileName(self, fn):
        return os.path.join(self.settings['input'], "blk%05d.dat" % fn)

    def fetchBlock(self, extent):
        '''Fetch block contents from disk given extents'''
        with open(self.inFileName(extent.fn), "rb") as f:
            f.seek(extent.offset)
            return f.read(extent.size)

    def copyOneBlock(self):
        '''Find the next block to be written in the input, and copy it to the output.'''
        extent = self.blockExtents.pop(self.blkCountOut)
        if self.blkCountOut in self.outOfOrderData:
            # If the data is cached, use it from memory and remove from the cache
            rawblock = self.outOfOrderData.pop(self.blkCountOut)
            self.outOfOrderSize -= len(rawblock)
        else:  # Otherwise look up data on disk
            rawblock = self.fetchBlock(extent)

        self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

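    # Scan the input blk files in sequence. A block found at exactly the next
    # output height is copied immediately; anything else is recorded by extent
    # (and cached in memory while the cache has room) until the in-order
    # writer catches up to it.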
    def run(self):
        while self.blkCountOut < len(self.blkindex):
            if not self.inF:
                fname = self.inFileName(self.inFn)
                print("Input file " + fname)
                try:
                    self.inF = open(fname, "rb")
                except IOError:
                    print("Premature end of block data")
                    return

            inhdr = self.inF.read(8)
            # EOF, or zero padding at the end of the file: advance to the
            # next input file.
            if (not inhdr or (inhdr[0] == 0)):
                self.inF.close()
                self.inF = None
                self.inFn = self.inFn + 1
                continue

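            # Each on-disk record is 4 bytes of network magic followed by a
            # little-endian uint32 giving the total serialized block length,
            # including the 80-byte header.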
            inMagic = inhdr[:4]
            if (inMagic != self.settings['netmagic']):
                # Seek backwards 7 bytes (skipping the first byte in the previous search)
                # and continue searching from the new position if the magic bytes are not
                # found.
                self.inF.seek(-7, os.SEEK_CUR)
                continue
            inLenLE = inhdr[4:]
            su = struct.unpack("<I", inLenLE)
            inLen = su[0] - 80  # length without header
            blk_hdr = self.inF.read(80)
            inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

            self.hash_str = calc_hash_str(blk_hdr)
            if self.hash_str not in self.blkmap:
                # Because blocks can be written to files out-of-order as of 0.10, the script
                # may encounter blocks it doesn't know about. Treat as debug output.
                if self.settings['debug_output'] == 'true':
                    print("Skipping unknown block " + self.hash_str)
                self.inF.seek(inLen, os.SEEK_CUR)
                continue

            blkHeight = self.blkmap[self.hash_str]
            self.blkCountIn += 1

            if self.blkCountOut == blkHeight:
                # If in-order block, just copy
                rawblock = self.inF.read(inLen)
                self.writeBlock(inhdr, blk_hdr, rawblock)

                # See if we can catch up to prior out-of-order blocks
                while self.blkCountOut in self.blockExtents:
                    self.copyOneBlock()

            else:  # If out-of-order, skip over block data for now
                self.blockExtents[blkHeight] = inExtent
                if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
                    # If there is space in the cache, read the data
                    # Reading the data in file sequence instead of seeking and fetching it later is preferred,
                    # but we don't want to fill up memory
                    self.outOfOrderData[blkHeight] = self.inF.read(inLen)
                    self.outOfOrderSize += inLen
                else:  # If no space in cache, seek forward
                    self.inF.seek(inLen, os.SEEK_CUR)

        print("Done (%i blocks written)" % (self.blkCountOut))

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: linearize-data.py CONFIG-FILE")
        sys.exit(1)

    with open(sys.argv[1], encoding="utf8") as f:
        for line in f:
            # skip comment lines
            m = re.search(r'^\s*#', line)
            if m:
                continue

            # parse key=value lines
            m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
            if m is None:
                continue
            settings[m.group(1)] = m.group(2)

    # Force hash byte format setting to be lowercase to make comparisons easier.
    # Also place upfront in case any settings need to know about it.
    if 'rev_hash_bytes' not in settings:
        settings['rev_hash_bytes'] = 'false'
    settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()

    if 'netmagic' not in settings:
        settings['netmagic'] = 'f9beb4d9'
    if 'genesis' not in settings:
        settings['genesis'] = '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f'
    if 'input' not in settings:
        settings['input'] = 'input'
    if 'hashlist' not in settings:
        settings['hashlist'] = 'hashlist.txt'
    if 'file_timestamp' not in settings:
        settings['file_timestamp'] = 0
    if 'split_timestamp' not in settings:
        settings['split_timestamp'] = 0
    if 'max_out_sz' not in settings:
        settings['max_out_sz'] = 1000 * 1000 * 1000
    if 'out_of_order_cache_sz' not in settings:
        settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
    if 'debug_output' not in settings:
        settings['debug_output'] = 'false'

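    # Normalize parsed string settings to their working types; netmagic
    # becomes raw bytes so it can be compared directly against bytes read
    # from the input files.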
    settings['max_out_sz'] = int(settings['max_out_sz'])
    settings['split_timestamp'] = int(settings['split_timestamp'])
    settings['file_timestamp'] = int(settings['file_timestamp'])
    settings['netmagic'] = bytes.fromhex(settings['netmagic'])
    settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
    settings['debug_output'] = settings['debug_output'].lower()

    if 'output_file' not in settings and 'output' not in settings:
        print("Missing output file / directory")
        sys.exit(1)

    blkindex = get_block_hashes(settings)
    blkmap = mkblockmap(blkindex)

    # Block hash map won't be byte-reversed. Neither should the genesis hash.
    if settings['genesis'] not in blkmap:
        print("Genesis block not found in hashlist")
    else:
        BlockDataCopier(settings, blkindex, blkmap).run()