#!/usr/bin/env python3
#
# linearize-data.py: Construct a linear, no-fork version of the chain.
#
# Copyright (c) 2013-present The Bitcoin Core developers
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
#

import struct
import re
import os
import os.path
import sys
import hashlib
import datetime
import time
import glob
from collections import namedtuple

settings = {}

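# Double SHA-256 of the 80-byte block header, returned as a big-endian hex
# string (the conventional display order for block hashes).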
def calc_hash_str(blk_hdr):
    blk_hdr_hash = hashlib.sha256(hashlib.sha256(blk_hdr).digest()).digest()
    return blk_hdr_hash[::-1].hex()

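# Extract the block timestamp (nTime, a little-endian uint32 at offset 68 of
# the header) and return it together with the first-of-month datetime used
# for timestamp-based output file splitting.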
def get_blk_dt(blk_hdr):
    members = struct.unpack("<I", blk_hdr[68:68+4])
    nTime = members[0]
    dt = datetime.datetime.fromtimestamp(nTime)
    dt_ym = datetime.datetime(dt.year, dt.month, 1)
    return (dt_ym, nTime)

# When getting the list of block hashes, undo any byte reversals.
def get_block_hashes(settings):
    blkindex = []
    with open(settings['hashlist'], "r") as f:
        for line in f:
            line = line.rstrip()
            if settings['rev_hash_bytes'] == 'true':
                line = bytes.fromhex(line)[::-1].hex()
            blkindex.append(line)

    print("Read " + str(len(blkindex)) + " hashes")

    return blkindex

# The block map shouldn't give or receive byte-reversed hashes.
def mkblockmap(blkindex):
    blkmap = {}
    for height,hash in enumerate(blkindex):
        blkmap[hash] = height
    return blkmap

# This gets the first block file ID that exists from the input block
# file directory.
def getFirstBlockFileId(block_dir_path):
    # First, this sets up a pattern to search for block files, for
    # example 'blkNNNNN.dat'.
    blkFilePattern = os.path.join(block_dir_path, "blk[0-9][0-9][0-9][0-9][0-9].dat")

    # This search is done with glob
    blkFnList = glob.glob(blkFilePattern)

    if len(blkFnList) == 0:
        print("blocks not pruned - starting at 0")
        return 0
    # We then get the lexicographic minimum, which should be the first
    # block file name.
    firstBlkFilePath = min(blkFnList)
    firstBlkFn = os.path.basename(firstBlkFilePath)

    # now, the string should be ['b','l','k','N','N','N','N','N','.','d','a','t']
    # So get the ID by choosing:              3   4   5   6   7
    # The ID is not necessarily 0 if this is a pruned node.
    blkId = int(firstBlkFn[3:8])
    return blkId

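# Read the 8-byte XOR obfuscation key that v28+ nodes write to blocks/xor.dat.
# Older block directories have no such file; an all-zero key is used instead,
# which makes the XOR a no-op.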
def read_xor_key(blocks_path):
    NUM_XOR_BYTES = 8  # From InitBlocksdirXorKey::xor_key.size()
    try:
        xor_filename = os.path.join(blocks_path, "xor.dat")
        with open(xor_filename, "rb") as xor_file:
            return xor_file.read(NUM_XOR_BYTES)
    # support also blockdirs created with pre-v28 versions, where no xor key exists yet
    except FileNotFoundError:
        return bytes([0] * NUM_XOR_BYTES)

# Block header and extent on disk
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])
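# Fields: fn - input blk file id; offset - file offset of the block payload
# (just past the 80-byte header); inhdr - the raw 8-byte magic+length prefix;
# blkhdr - the 80-byte block header; size - payload length excluding the header.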

class BlockDataCopier:
    def __init__(self, settings, blkindex, blkmap):
        self.settings = settings
        self.blkindex = blkindex
        self.blkmap = blkmap

        # Get first occurring block file id - for pruned nodes this
        # will not necessarily be 0
        self.inFn = getFirstBlockFileId(self.settings['input'])
        self.inF = None
        self.outFn = 0
        self.outsz = 0
        self.outF = None
        self.outFname = None
        self.blkCountIn = 0
        self.blkCountOut = 0
        self.xor_key = read_xor_key(self.settings['input'])

        self.lastDate = datetime.datetime(2000, 1, 1)
        self.highTS = 1408893517 - 315360000
        self.timestampSplit = False
        self.fileOutput = True
        self.setFileTime = False
        self.maxOutSz = settings['max_out_sz']
        if 'output' in settings:
            self.fileOutput = False
        if settings['file_timestamp'] != 0:
            self.setFileTime = True
        if settings['split_timestamp'] != 0:
            self.timestampSplit = True
        # Extents and cache for out-of-order blocks
        self.blockExtents = {}
        self.outOfOrderData = {}
        self.outOfOrderSize = 0 # running total size for items in outOfOrderData

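    # Read `size` bytes from `f` and undo the blocksdir XOR obfuscation, using
    # the byte offset within the file to stay aligned with the rolling key.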
    def read_xored(self, f, size):
        offset = f.tell()
        data = bytearray(f.read(size))
        for i in range(len(data)):
            data[i] ^= self.xor_key[(i + offset) % len(self.xor_key)]
        return bytes(data)

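    # Append one block (8-byte disk header + 80-byte block header + payload) to
    # the current output file, starting a new file when max_out_sz would be
    # exceeded (directory output) or when split_timestamp is set and a new
    # month begins.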
    def writeBlock(self, inhdr, blk_hdr, rawblock):
        blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
        if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
            self.outF.close()
            if self.setFileTime:
                os.utime(self.outFname, (int(time.time()), self.highTS))
            self.outF = None
            self.outFname = None
            self.outFn = self.outFn + 1
            self.outsz = 0

        (blkDate, blkTS) = get_blk_dt(blk_hdr)
        if self.timestampSplit and (blkDate > self.lastDate):
            print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str)
            self.lastDate = blkDate
            if self.outF:
                self.outF.close()
                if self.setFileTime:
                    os.utime(self.outFname, (int(time.time()), self.highTS))
                self.outF = None
                self.outFname = None
                self.outFn = self.outFn + 1
                self.outsz = 0

        if not self.outF:
            if self.fileOutput:
                self.outFname = self.settings['output_file']
            else:
                self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn)
            print("Output file " + self.outFname)
            self.outF = open(self.outFname, "wb")

        self.outF.write(inhdr)
        self.outF.write(blk_hdr)
        self.outF.write(rawblock)
        self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)

        self.blkCountOut = self.blkCountOut + 1
        if blkTS > self.highTS:
            self.highTS = blkTS

        if (self.blkCountOut % 1000) == 0:
            print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
                    (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))

    def inFileName(self, fn):
        return os.path.join(self.settings['input'], "blk%05d.dat" % fn)

    def fetchBlock(self, extent):
        '''Fetch block contents from disk given extents'''
        with open(self.inFileName(extent.fn), "rb") as f:
            f.seek(extent.offset)
            return self.read_xored(f, extent.size)

    def copyOneBlock(self):
        '''Find the next block to be written in the input, and copy it to the output.'''
        extent = self.blockExtents.pop(self.blkCountOut)
        if self.blkCountOut in self.outOfOrderData:
            # If the data is cached, use it from memory and remove from the cache
            rawblock = self.outOfOrderData.pop(self.blkCountOut)
            self.outOfOrderSize -= len(rawblock)
        else: # Otherwise look up data on disk
            rawblock = self.fetchBlock(extent)

        self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

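    # Main loop: walk the input blk*.dat files in sequence, writing blocks in
    # height order; out-of-order blocks are indexed by extent and cached in
    # memory (up to out_of_order_cache_sz) until their height comes up.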
    def run(self):
        while self.blkCountOut < len(self.blkindex):
            if not self.inF:
                fname = self.inFileName(self.inFn)
                print("Input file " + fname)
                try:
                    self.inF = open(fname, "rb")
                except IOError:
                    print("Premature end of block data")
                    return

            inhdr = self.read_xored(self.inF, 8)
            if (not inhdr or (inhdr[0] == 0)):
                self.inF.close()
                self.inF = None
                self.inFn = self.inFn + 1
                continue

            inMagic = inhdr[:4]
            if (inMagic != self.settings['netmagic']):
                # Seek backwards 7 bytes (skipping the first byte in the previous search)
                # and continue searching from the new position if the magic bytes are not
                # found.
                self.inF.seek(-7, os.SEEK_CUR)
                continue
            inLenLE = inhdr[4:]
            su = struct.unpack("<I", inLenLE)
            inLen = su[0] - 80 # length without header
            blk_hdr = self.read_xored(self.inF, 80)
            inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

            self.hash_str = calc_hash_str(blk_hdr)
            if self.hash_str not in self.blkmap:
                # Because blocks can be written to files out-of-order as of 0.10, the script
                # may encounter blocks it doesn't know about. Treat as debug output.
                if self.settings['debug_output'] == 'true':
                    print("Skipping unknown block " + self.hash_str)
                self.inF.seek(inLen, os.SEEK_CUR)
                continue

            blkHeight = self.blkmap[self.hash_str]
            self.blkCountIn += 1

            if self.blkCountOut == blkHeight:
                # If in-order block, just copy
                rawblock = self.read_xored(self.inF, inLen)
                self.writeBlock(inhdr, blk_hdr, rawblock)

                # See if we can catch up to prior out-of-order blocks
                while self.blkCountOut in self.blockExtents:
                    self.copyOneBlock()

            else: # If out-of-order, skip over block data for now
                self.blockExtents[blkHeight] = inExtent
                if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
                    # If there is space in the cache, read the data
                    # Reading the data in file sequence instead of seeking and fetching it later is preferred,
                    # but we don't want to fill up memory
                    self.outOfOrderData[blkHeight] = self.read_xored(self.inF, inLen)
                    self.outOfOrderSize += inLen
                else: # If no space in cache, seek forward
                    self.inF.seek(inLen, os.SEEK_CUR)

        print("Done (%i blocks written)" % (self.blkCountOut))

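# The config file consists of simple key=value lines; lines starting with '#'
# are ignored. A minimal sketch (paths are placeholders):
#
#   netmagic=f9beb4d9
#   input=/home/example/.bitcoin/blocks
#   hashlist=hashlist.txt
#   output_file=/home/example/bootstrap.dat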
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: linearize-data.py CONFIG-FILE")
        sys.exit(1)

    with open(sys.argv[1]) as f:
        for line in f:
            # skip comment lines
            m = re.search(r'^\s*#', line)
            if m:
                continue

            # parse key=value lines
            m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
            if m is None:
                continue
            settings[m.group(1)] = m.group(2)

    # Force hash byte format setting to be lowercase to make comparisons easier.
    # Also place upfront in case any settings need to know about it.
    if 'rev_hash_bytes' not in settings:
        settings['rev_hash_bytes'] = 'false'
    settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()

    if 'netmagic' not in settings:
        settings['netmagic'] = 'f9beb4d9'
    if 'genesis' not in settings:
        settings['genesis'] = '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f'
    if 'input' not in settings:
        settings['input'] = 'input'
    if 'hashlist' not in settings:
        settings['hashlist'] = 'hashlist.txt'
    if 'file_timestamp' not in settings:
        settings['file_timestamp'] = 0
    if 'split_timestamp' not in settings:
        settings['split_timestamp'] = 0
    if 'max_out_sz' not in settings:
        settings['max_out_sz'] = 1000 * 1000 * 1000
    if 'out_of_order_cache_sz' not in settings:
        settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
    if 'debug_output' not in settings:
        settings['debug_output'] = 'false'

    settings['max_out_sz'] = int(settings['max_out_sz'])
    settings['split_timestamp'] = int(settings['split_timestamp'])
    settings['file_timestamp'] = int(settings['file_timestamp'])
    settings['netmagic'] = bytes.fromhex(settings['netmagic'])
    settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
    settings['debug_output'] = settings['debug_output'].lower()

    if 'output_file' not in settings and 'output' not in settings:
        print("Missing output file / directory")
        sys.exit(1)

    blkindex = get_block_hashes(settings)
    blkmap = mkblockmap(blkindex)

    # Block hash map won't be byte-reversed. Neither should the genesis hash.
    if settings['genesis'] not in blkmap:
        print("Genesis block not found in hashlist")
    else:
        BlockDataCopier(settings, blkindex, blkmap).run()