linearize-data.py
#!/usr/bin/env python3
#
# linearize-data.py: Construct a linear, no-fork version of the chain.
#
# Copyright (c) 2013-present The Bitcoin Core developers
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
#

import struct
import re
import os
import os.path
import sys
import hashlib
import datetime
import time
import glob
from collections import namedtuple

settings = {}

def calc_hash_str(blk_hdr):
    blk_hdr_hash = hashlib.sha256(hashlib.sha256(blk_hdr).digest()).digest()
    return blk_hdr_hash[::-1].hex()

def get_blk_dt(blk_hdr):
    members = struct.unpack("<I", blk_hdr[68:68+4])
    nTime = members[0]
    dt = datetime.datetime.fromtimestamp(nTime)
    dt_ym = datetime.datetime(dt.year, dt.month, 1)
    return (dt_ym, nTime)

# When getting the list of block hashes, undo any byte reversals.
def get_block_hashes(settings):
    blkindex = []
    with open(settings['hashlist'], "r") as f:
        for line in f:
            line = line.rstrip()
            if settings['rev_hash_bytes'] == 'true':
                line = bytes.fromhex(line)[::-1].hex()
            blkindex.append(line)

    print("Read " + str(len(blkindex)) + " hashes")

    return blkindex

# The block map shouldn't give or receive byte-reversed hashes.
def mkblockmap(blkindex):
    blkmap = {}
    for height, hash in enumerate(blkindex):
        blkmap[hash] = height
    return blkmap

# This gets the first block file ID that exists from the input block
# file directory.
def getFirstBlockFileId(block_dir_path):
    # First, this sets up a pattern to search for block files, for
    # example 'blkNNNNN.dat'.
    blkFilePattern = os.path.join(block_dir_path, "blk[0-9][0-9][0-9][0-9][0-9].dat")

    # This search is done with glob
    blkFnList = glob.glob(blkFilePattern)

    if len(blkFnList) == 0:
        print("blocks not pruned - starting at 0")
        return 0
    # We then get the lexicographic minimum, which should be the first
    # block file name.
    firstBlkFilePath = min(blkFnList)
    firstBlkFn = os.path.basename(firstBlkFilePath)

    # now, the string should be ['b','l','k','N','N','N','N','N','.','d','a','t']
    # So get the ID by choosing: 3 4 5 6 7
    # The ID is not necessarily 0 if this is a pruned node.
    blkId = int(firstBlkFn[3:8])
    return blkId

def read_xor_key(blocks_path):
    NUM_XOR_BYTES = 8  # From InitBlocksdirXorKey::xor_key.size()
    try:
        xor_filename = os.path.join(blocks_path, "xor.dat")
        with open(xor_filename, "rb") as xor_file:
            return xor_file.read(NUM_XOR_BYTES)
    # support also blockdirs created with pre-v28 versions, where no xor key exists yet
    except FileNotFoundError:
        return bytes([0] * NUM_XOR_BYTES)

# Block header and extent on disk
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])

class BlockDataCopier:
    def __init__(self, settings, blkindex, blkmap):
        self.settings = settings
        self.blkindex = blkindex
        self.blkmap = blkmap

        # Get first occurring block file id - for pruned nodes this
        # will not necessarily be 0
        self.inFn = getFirstBlockFileId(self.settings['input'])
        self.inF = None
        self.outFn = 0
        self.outsz = 0
        self.outF = None
        self.outFname = None
        self.blkCountIn = 0
        self.blkCountOut = 0
        self.xor_key = read_xor_key(self.settings['input'])

        self.lastDate = datetime.datetime(2000, 1, 1)
        self.highTS = 1408893517 - 315360000
        self.timestampSplit = False
        self.fileOutput = True
        self.setFileTime = False
        self.maxOutSz = settings['max_out_sz']
        if 'output' in settings:
            self.fileOutput = False
        if settings['file_timestamp'] != 0:
            self.setFileTime = True
        if settings['split_timestamp'] != 0:
            self.timestampSplit = True
        # Extents and cache for out-of-order blocks
        self.blockExtents = {}
        self.outOfOrderData = {}
        self.outOfOrderSize = 0  # running total size for items in outOfOrderData

    def read_xored(self, f, size):
        offset = f.tell()
        data = bytearray(f.read(size))
        for i in range(len(data)):
            data[i] ^= self.xor_key[(i + offset) % len(self.xor_key)]
        return bytes(data)

    def writeBlock(self, inhdr, blk_hdr, rawblock):
        blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
        if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
            self.outF.close()
            if self.setFileTime:
                os.utime(self.outFname, (int(time.time()), self.highTS))
            self.outF = None
            self.outFname = None
            self.outFn = self.outFn + 1
            self.outsz = 0

        (blkDate, blkTS) = get_blk_dt(blk_hdr)
        if self.timestampSplit and (blkDate > self.lastDate):
            print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str)
            self.lastDate = blkDate
            if self.outF:
                self.outF.close()
                if self.setFileTime:
                    os.utime(self.outFname, (int(time.time()), self.highTS))
                self.outF = None
                self.outFname = None
                self.outFn = self.outFn + 1
                self.outsz = 0

        if not self.outF:
            if self.fileOutput:
                self.outFname = self.settings['output_file']
            else:
                self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn)
            print("Output file " + self.outFname)
            self.outF = open(self.outFname, "wb")

        self.outF.write(inhdr)
        self.outF.write(blk_hdr)
        self.outF.write(rawblock)
        self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)

        self.blkCountOut = self.blkCountOut + 1
        if blkTS > self.highTS:
            self.highTS = blkTS

        if (self.blkCountOut % 1000) == 0:
            print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
                  (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))

    def inFileName(self, fn):
        return os.path.join(self.settings['input'], "blk%05d.dat" % fn)

    def fetchBlock(self, extent):
        '''Fetch block contents from disk given extents'''
        with open(self.inFileName(extent.fn), "rb") as f:
            f.seek(extent.offset)
            return self.read_xored(f, extent.size)

    def copyOneBlock(self):
        '''Find the next block to be written in the input, and copy it to the output.'''
        extent = self.blockExtents.pop(self.blkCountOut)
        if self.blkCountOut in self.outOfOrderData:
            # If the data is cached, use it from memory and remove from the cache
            rawblock = self.outOfOrderData.pop(self.blkCountOut)
            self.outOfOrderSize -= len(rawblock)
        else:  # Otherwise look up data on disk
            rawblock = self.fetchBlock(extent)

        self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

    def run(self):
        while self.blkCountOut < len(self.blkindex):
            if not self.inF:
                fname = self.inFileName(self.inFn)
                print("Input file " + fname)
                try:
                    self.inF = open(fname, "rb")
                except IOError:
                    print("Premature end of block data")
                    return

            inhdr = self.read_xored(self.inF, 8)
            # An empty read means end of file; a zero lead byte means zero padding at
            # the end of the block file. Compare against the integer 0, since indexing
            # bytes in Python 3 yields an int, not a one-character string.
            if (not inhdr or (inhdr[0] == 0)):
                self.inF.close()
                self.inF = None
                self.inFn = self.inFn + 1
                continue

            inMagic = inhdr[:4]
            if (inMagic != self.settings['netmagic']):
                # Seek backwards 7 bytes (skipping the first byte in the previous search)
                # and continue searching from the new position if the magic bytes are not
                # found.
                self.inF.seek(-7, os.SEEK_CUR)
                continue
            inLenLE = inhdr[4:]
            su = struct.unpack("<I", inLenLE)
            inLen = su[0] - 80  # length without header
            blk_hdr = self.read_xored(self.inF, 80)
            inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

            self.hash_str = calc_hash_str(blk_hdr)
            if self.hash_str not in self.blkmap:
                # Because blocks can be written to files out-of-order as of 0.10, the script
                # may encounter blocks it doesn't know about. Treat as debug output.
                if self.settings['debug_output'] == 'true':
                    print("Skipping unknown block " + self.hash_str)
                self.inF.seek(inLen, os.SEEK_CUR)
                continue

            blkHeight = self.blkmap[self.hash_str]
            self.blkCountIn += 1

            if self.blkCountOut == blkHeight:
                # If in-order block, just copy
                rawblock = self.read_xored(self.inF, inLen)
                self.writeBlock(inhdr, blk_hdr, rawblock)

                # See if we can catch up to prior out-of-order blocks
                while self.blkCountOut in self.blockExtents:
                    self.copyOneBlock()

            else:  # If out-of-order, skip over block data for now
                self.blockExtents[blkHeight] = inExtent
                if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
                    # If there is space in the cache, read the data.
                    # Reading the data in file sequence instead of seeking and fetching it later is preferred,
                    # but we don't want to fill up memory.
                    self.outOfOrderData[blkHeight] = self.read_xored(self.inF, inLen)
                    self.outOfOrderSize += inLen
                else:  # If no space in cache, seek forward
                    self.inF.seek(inLen, os.SEEK_CUR)

        print("Done (%i blocks written)" % (self.blkCountOut))

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: linearize-data.py CONFIG-FILE")
        sys.exit(1)

    with open(sys.argv[1]) as f:
        for line in f:
            # skip comment lines
            m = re.search(r'^\s*#', line)
            if m:
                continue

            # parse key=value lines
            m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
            if m is None:
                continue
            settings[m.group(1)] = m.group(2)

    # Force hash byte format setting to be lowercase to make comparisons easier.
    # Also place upfront in case any settings need to know about it.
    if 'rev_hash_bytes' not in settings:
        settings['rev_hash_bytes'] = 'false'
    settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()

    if 'netmagic' not in settings:
        settings['netmagic'] = 'f9beb4d9'
    if 'genesis' not in settings:
        settings['genesis'] = '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f'
    if 'input' not in settings:
        settings['input'] = 'input'
    if 'hashlist' not in settings:
        settings['hashlist'] = 'hashlist.txt'
    if 'file_timestamp' not in settings:
        settings['file_timestamp'] = 0
    if 'split_timestamp' not in settings:
        settings['split_timestamp'] = 0
    if 'max_out_sz' not in settings:
        settings['max_out_sz'] = 1000 * 1000 * 1000
    if 'out_of_order_cache_sz' not in settings:
        settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
    if 'debug_output' not in settings:
        settings['debug_output'] = 'false'

    settings['max_out_sz'] = int(settings['max_out_sz'])
    settings['split_timestamp'] = int(settings['split_timestamp'])
    settings['file_timestamp'] = int(settings['file_timestamp'])
    settings['netmagic'] = bytes.fromhex(settings['netmagic'])
    settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
    settings['debug_output'] = settings['debug_output'].lower()

    if 'output_file' not in settings and 'output' not in settings:
        print("Missing output file / directory")
        sys.exit(1)

    blkindex = get_block_hashes(settings)
    blkmap = mkblockmap(blkindex)

    # Block hash map won't be byte-reversed. Neither should the genesis hash.
    if settings['genesis'] not in blkmap:
        print("Genesis block not found in hashlist")
    else:
        BlockDataCopier(settings, blkindex, blkmap).run()
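For context, the script is driven entirely by the key=value config file named on its command line (see the "Usage: linearize-data.py CONFIG-FILE" check in __main__). Below is a minimal sketch of such a file, restricted to keys the script actually reads; the non-path values are the script's own defaults, while the file name (linearize.cfg), the blocks path, and the output name are placeholders to adapt to your node. The hash list is an ordered list of block hashes, one per line, such as the output of the companion linearize-hashes.py script.

# linearize.cfg (illustrative; paths are placeholders)
netmagic=f9beb4d9                      # mainnet network magic (script default)
genesis=000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f
input=/home/example/.bitcoin/blocks    # placeholder: directory holding the node's blk*.dat files
hashlist=hashlist.txt                  # ordered block hashes, one per line
output_file=bootstrap.dat              # single-file output; use output=<dir> instead for split blkNNNNN.dat files
max_out_sz=1000000000                  # per-file size cap, only used with output=<dir>
out_of_order_cache_sz=100000000        # memory budget for caching out-of-order blocks
rev_hash_bytes=false                   # set to true if the hashlist entries are byte-reversed
debug_output=false                     # set to true to log skipped unknown blocks
# split_timestamp=1 starts a new output file at each new month; file_timestamp=1 sets output file mtimes

With such a file in place, the script would be invoked as: ./linearize-data.py linearize.cfg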