# linearize-data.py
#!/usr/bin/env python3
#
# linearize-data.py: Construct a linear, no-fork version of the chain.
#
# Copyright (c) 2013-2022 The Bitcoin Core developers
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
#

import struct
import re
import os
import os.path
import sys
import hashlib
import datetime
import time
import glob
from collections import namedtuple

# Parsed key=value configuration, filled in by the __main__ block below.
settings = {}

def calc_hash_str(blk_hdr):
    """Return the block hash of an 80-byte header as a hex string.

    The hash is the double-SHA256 of the header; the digest is byte-reversed
    so the returned string is in the big-endian order used in hash lists.
    """
    blk_hdr_hash = hashlib.sha256(hashlib.sha256(blk_hdr).digest()).digest()
    return blk_hdr_hash[::-1].hex()

def get_blk_dt(blk_hdr):
    """Extract the timestamp from a raw 80-byte block header.

    Returns (dt_ym, nTime): nTime is the little-endian uint32 at header
    offset 68; dt_ym is that time (converted via local-time fromtimestamp)
    truncated to the first of its month, used for month-based file splits.
    """
    members = struct.unpack("<I", blk_hdr[68:68+4])
    nTime = members[0]
    dt = datetime.datetime.fromtimestamp(nTime)
    dt_ym = datetime.datetime(dt.year, dt.month, 1)
    return (dt_ym, nTime)

# When getting the list of block hashes, undo any byte reversals.
def get_block_hashes(settings):
    """Read settings['hashlist'] (one hex block hash per line) into a list.

    If settings['rev_hash_bytes'] is 'true', each hash is byte-reversed so
    the returned list is always in the internal (non-reversed) byte order.
    """
    blkindex = []
    with open(settings['hashlist'], "r", encoding="utf8") as f:
        for line in f:
            line = line.rstrip()
            if settings['rev_hash_bytes'] == 'true':
                line = bytes.fromhex(line)[::-1].hex()
            blkindex.append(line)

    print("Read " + str(len(blkindex)) + " hashes")

    return blkindex

# The block map shouldn't give or receive byte-reversed hashes.
def mkblockmap(blkindex):
    """Map each block hash to its height (its index within blkindex)."""
    blkmap = {}
    for height, hash in enumerate(blkindex):
        blkmap[hash] = height
    return blkmap

# This gets the first block file ID that exists from the input block
# file directory.
def getFirstBlockFileId(block_dir_path):
    """Return the numeric ID of the lowest-numbered blkNNNNN.dat file in
    block_dir_path, or 0 when no block files match (blocks not pruned)."""
    # First, this sets up a pattern to search for block files, for
    # example 'blkNNNNN.dat'.
    blkFilePattern = os.path.join(block_dir_path, "blk[0-9][0-9][0-9][0-9][0-9].dat")

    # This search is done with glob
    blkFnList = glob.glob(blkFilePattern)

    if len(blkFnList) == 0:
        print("blocks not pruned - starting at 0")
        return 0

    # We then get the lexicographic minimum, which should be the first
    # block file name.
    firstBlkFilePath = min(blkFnList)
    firstBlkFn = os.path.basename(firstBlkFilePath)

    # now, the string should be ['b','l','k','N','N','N','N','N','.','d','a','t']
    # So get the ID by choosing:          3   4   5   6   7
    # The ID is not necessarily 0 if this is a pruned node.
    blkId = int(firstBlkFn[3:8])
    return blkId

# Block header and extent on disk
BlockExtent = namedtuple('BlockExtent', ['fn', 'offset', 'inhdr', 'blkhdr', 'size'])

class BlockDataCopier:
    """Copies blocks from the input blk*.dat files to the output in linear
    (height) order, caching or re-reading blocks stored out of order."""

    def __init__(self, settings, blkindex, blkmap):
        self.settings = settings
        self.blkindex = blkindex  # list of hashes, index == height
        self.blkmap = blkmap      # hash -> height
        # Get first occurring block file id - for pruned nodes this
        # will not necessarily be 0
        self.inFn = getFirstBlockFileId(self.settings['input'])
        self.inF = None          # currently open input file
        self.outFn = 0           # current output file number
        self.outsz = 0           # bytes written to current output file
        self.outF = None
        self.outFname = None
        self.blkCountIn = 0
        self.blkCountOut = 0

        self.lastDate = datetime.datetime(2000, 1, 1)
        self.highTS = 1408893517 - 315360000
        self.timestampSplit = False
        self.fileOutput = True   # single output file vs. a directory of files
        self.setFileTime = False
        self.maxOutSz = settings['max_out_sz']
        if 'output' in settings:
            self.fileOutput = False
        if settings['file_timestamp'] != 0:
            self.setFileTime = True
        if settings['split_timestamp'] != 0:
            self.timestampSplit = True
        # Extents and cache for out-of-order blocks
        self.blockExtents = {}
        self.outOfOrderData = {}
        self.outOfOrderSize = 0  # running total size for items in outOfOrderData

    def writeBlock(self, inhdr, blk_hdr, rawblock):
        """Append one block (magic/length header, 80-byte header, payload) to
        the output, first rotating the output file if size or month limits
        are hit."""
        blockSizeOnDisk = len(inhdr) + len(blk_hdr) + len(rawblock)
        # Size-based rotation only applies to directory output; a single
        # named output file is never split.
        if not self.fileOutput and ((self.outsz + blockSizeOnDisk) > self.maxOutSz):
            self.outF.close()
            if self.setFileTime:
                os.utime(self.outFname, (int(time.time()), self.highTS))
            self.outF = None
            self.outFname = None
            self.outFn = self.outFn + 1
            self.outsz = 0

        (blkDate, blkTS) = get_blk_dt(blk_hdr)
        if self.timestampSplit and (blkDate > self.lastDate):
            print("New month " + blkDate.strftime("%Y-%m") + " @ " + self.hash_str)
            self.lastDate = blkDate
            if self.outF:
                self.outF.close()
                if self.setFileTime:
                    os.utime(self.outFname, (int(time.time()), self.highTS))
                self.outF = None
                self.outFname = None
                self.outFn = self.outFn + 1
                self.outsz = 0

        if not self.outF:
            if self.fileOutput:
                self.outFname = self.settings['output_file']
            else:
                self.outFname = os.path.join(self.settings['output'], "blk%05d.dat" % self.outFn)
            print("Output file " + self.outFname)
            self.outF = open(self.outFname, "wb")

        self.outF.write(inhdr)
        self.outF.write(blk_hdr)
        self.outF.write(rawblock)
        self.outsz = self.outsz + len(inhdr) + len(blk_hdr) + len(rawblock)

        self.blkCountOut = self.blkCountOut + 1
        # Track the highest timestamp seen; used for os.utime on close.
        if blkTS > self.highTS:
            self.highTS = blkTS

        if (self.blkCountOut % 1000) == 0:
            print('%i blocks scanned, %i blocks written (of %i, %.1f%% complete)' %
                    (self.blkCountIn, self.blkCountOut, len(self.blkindex), 100.0 * self.blkCountOut / len(self.blkindex)))

    def inFileName(self, fn):
        """Return the path of input block file number fn (blkNNNNN.dat)."""
        return os.path.join(self.settings['input'], "blk%05d.dat" % fn)

    def fetchBlock(self, extent):
        '''Fetch block contents from disk given extents'''
        with open(self.inFileName(extent.fn), "rb") as f:
            f.seek(extent.offset)
            return f.read(extent.size)

    def copyOneBlock(self):
        '''Find the next block to be written in the input, and copy it to the output.'''
        extent = self.blockExtents.pop(self.blkCountOut)
        if self.blkCountOut in self.outOfOrderData:
            # If the data is cached, use it from memory and remove from the cache
            rawblock = self.outOfOrderData.pop(self.blkCountOut)
            self.outOfOrderSize -= len(rawblock)
        else: # Otherwise look up data on disk
            rawblock = self.fetchBlock(extent)

        self.writeBlock(extent.inhdr, extent.blkhdr, rawblock)

    def run(self):
        """Scan the input block files and write all known blocks in height
        order, stopping once every hash in blkindex has been emitted."""
        while self.blkCountOut < len(self.blkindex):
            if not self.inF:
                fname = self.inFileName(self.inFn)
                print("Input file " + fname)
                try:
                    self.inF = open(fname, "rb")
                except IOError:
                    print("Premature end of block data")
                    return

            inhdr = self.inF.read(8)
            # BUGFIX: indexing bytes yields an int in Python 3, so the old
            # test (inhdr[0] == "\0") was always False and the zero-padded
            # tail of a block file was never detected. Compare against the
            # integer 0 to advance to the next input file at the padding.
            if not inhdr or inhdr[0] == 0:
                self.inF.close()
                self.inF = None
                self.inFn = self.inFn + 1
                continue

            inMagic = inhdr[:4]
            if (inMagic != self.settings['netmagic']):
                # Seek backwards 7 bytes (skipping the first byte in the previous search)
                # and continue searching from the new position if the magic bytes are not
                # found.
                self.inF.seek(-7, os.SEEK_CUR)
                continue
            inLenLE = inhdr[4:]
            su = struct.unpack("<I", inLenLE)
            inLen = su[0] - 80  # length without header
            blk_hdr = self.inF.read(80)
            inExtent = BlockExtent(self.inFn, self.inF.tell(), inhdr, blk_hdr, inLen)

            self.hash_str = calc_hash_str(blk_hdr)
            # BUGFIX: use the instance's map/settings rather than relying on
            # the identically-named module-level globals.
            if not self.hash_str in self.blkmap:
                # Because blocks can be written to files out-of-order as of 0.10, the script
                # may encounter blocks it doesn't know about. Treat as debug output.
                if self.settings['debug_output'] == 'true':
                    print("Skipping unknown block " + self.hash_str)
                self.inF.seek(inLen, os.SEEK_CUR)
                continue

            blkHeight = self.blkmap[self.hash_str]
            self.blkCountIn += 1

            if self.blkCountOut == blkHeight:
                # If in-order block, just copy
                rawblock = self.inF.read(inLen)
                self.writeBlock(inhdr, blk_hdr, rawblock)

                # See if we can catch up to prior out-of-order blocks
                while self.blkCountOut in self.blockExtents:
                    self.copyOneBlock()

            else: # If out-of-order, skip over block data for now
                self.blockExtents[blkHeight] = inExtent
                if self.outOfOrderSize < self.settings['out_of_order_cache_sz']:
                    # If there is space in the cache, read the data
                    # Reading the data in file sequence instead of seeking and fetching it later is preferred,
                    # but we don't want to fill up memory
                    self.outOfOrderData[blkHeight] = self.inF.read(inLen)
                    self.outOfOrderSize += inLen
                else: # If no space in cache, seek forward
                    self.inF.seek(inLen, os.SEEK_CUR)

        print("Done (%i blocks written)" % (self.blkCountOut))

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: linearize-data.py CONFIG-FILE")
        sys.exit(1)

    with open(sys.argv[1], encoding="utf8") as f:
        for line in f:
            # skip comment lines
            m = re.search(r'^\s*#', line)
            if m:
                continue

            # parse key=value lines
            m = re.search(r'^(\w+)\s*=\s*(\S.*)$', line)
            if m is None:
                continue
            settings[m.group(1)] = m.group(2)

    # Force hash byte format setting to be lowercase to make comparisons easier.
    # Also place upfront in case any settings need to know about it.
    if 'rev_hash_bytes' not in settings:
        settings['rev_hash_bytes'] = 'false'
    settings['rev_hash_bytes'] = settings['rev_hash_bytes'].lower()

    if 'netmagic' not in settings:
        settings['netmagic'] = 'f9beb4d9'
    if 'genesis' not in settings:
        settings['genesis'] = '000000000019d6689c085ae165831e934ff763ae46a2a6c172b3f1b60a8ce26f'
    if 'input' not in settings:
        settings['input'] = 'input'
    if 'hashlist' not in settings:
        settings['hashlist'] = 'hashlist.txt'
    if 'file_timestamp' not in settings:
        settings['file_timestamp'] = 0
    if 'split_timestamp' not in settings:
        settings['split_timestamp'] = 0
    if 'max_out_sz' not in settings:
        settings['max_out_sz'] = 1000 * 1000 * 1000
    if 'out_of_order_cache_sz' not in settings:
        settings['out_of_order_cache_sz'] = 100 * 1000 * 1000
    if 'debug_output' not in settings:
        settings['debug_output'] = 'false'

    settings['max_out_sz'] = int(settings['max_out_sz'])
    settings['split_timestamp'] = int(settings['split_timestamp'])
    settings['file_timestamp'] = int(settings['file_timestamp'])
    settings['netmagic'] = bytes.fromhex(settings['netmagic'])
    settings['out_of_order_cache_sz'] = int(settings['out_of_order_cache_sz'])
    settings['debug_output'] = settings['debug_output'].lower()

    if 'output_file' not in settings and 'output' not in settings:
        print("Missing output file / directory")
        sys.exit(1)

    blkindex = get_block_hashes(settings)
    blkmap = mkblockmap(blkindex)

    # Block hash map won't be byte-reversed. Neither should the genesis hash.
    if not settings['genesis'] in blkmap:
        print("Genesis block not found in hashlist")
    else:
        BlockDataCopier(settings, blkindex, blkmap).run()