utxo_to_sqlite.py
1 #!/usr/bin/env python3 2 # Copyright (c) 2024-present The Bitcoin Core developers 3 # Distributed under the MIT software license, see the accompanying 4 # file COPYING or http://www.opensource.org/licenses/mit-license.php. 5 """Tool to convert a compact-serialized UTXO set to a SQLite3 database. 6 7 The input UTXO set can be generated by Bitcoin Core with the `dumptxoutset` RPC: 8 $ bitcoin-cli dumptxoutset ~/utxos.dat latest 9 10 The created database contains a table `utxos` with the following schema: 11 (txid TEXT, vout INT, value INT, coinbase INT, height INT, scriptpubkey TEXT) 12 13 If --txid=raw or --txid=rawle is specified, txid will be BLOB instead; 14 if --spk=raw, then scriptpubkey will be BLOB instead. 15 """ 16 import argparse 17 import os 18 import sqlite3 19 import sys 20 import time 21 22 23 UTXO_DUMP_MAGIC = b'utxo\xff' 24 UTXO_DUMP_VERSION = 2 25 NET_MAGIC_BYTES = { 26 b"\xf9\xbe\xb4\xd9": "Mainnet", 27 b"\x0a\x03\xcf\x40": "Signet", 28 b"\x0b\x11\x09\x07": "Testnet3", 29 b"\x1c\x16\x3f\x28": "Testnet4", 30 b"\xfa\xbf\xb5\xda": "Regtest", 31 } 32 33 34 def read_varint(f): 35 """Equivalent of `ReadVarInt()` (see serialization module).""" 36 n = 0 37 while True: 38 dat = f.read(1)[0] 39 n = (n << 7) | (dat & 0x7f) 40 if (dat & 0x80) > 0: 41 n += 1 42 else: 43 return n 44 45 46 def read_compactsize(f): 47 """Equivalent of `ReadCompactSize()` (see serialization module).""" 48 n = f.read(1)[0] 49 if n == 253: 50 n = int.from_bytes(f.read(2), "little") 51 elif n == 254: 52 n = int.from_bytes(f.read(4), "little") 53 elif n == 255: 54 n = int.from_bytes(f.read(8), "little") 55 return n 56 57 58 def decompress_amount(x): 59 """Equivalent of `DecompressAmount()` (see compressor module).""" 60 if x == 0: 61 return 0 62 x -= 1 63 e = x % 10 64 x //= 10 65 n = 0 66 if e < 9: 67 d = (x % 9) + 1 68 x //= 9 69 n = x * 10 + d 70 else: 71 n = x + 1 72 while e > 0: 73 n *= 10 74 e -= 1 75 return n 76 77 78 def decompress_script(f): 79 """Equivalent of `DecompressScript()` (see compressor module).""" 80 size = read_varint(f) # sizes 0-5 encode compressed script types 81 if size == 0: # P2PKH 82 return bytes([0x76, 0xa9, 20]) + f.read(20) + bytes([0x88, 0xac]) 83 elif size == 1: # P2SH 84 return bytes([0xa9, 20]) + f.read(20) + bytes([0x87]) 85 elif size in (2, 3): # P2PK (compressed) 86 return bytes([33, size]) + f.read(32) + bytes([0xac]) 87 elif size in (4, 5): # P2PK (uncompressed) 88 compressed_pubkey = bytes([size - 2]) + f.read(32) 89 return bytes([65]) + decompress_pubkey(compressed_pubkey) + bytes([0xac]) 90 else: # others (bare multisig, segwit etc.) 91 size -= 6 92 assert size <= 10000, f"too long script with size {size}" 93 return f.read(size) 94 95 96 def decompress_pubkey(compressed_pubkey): 97 """Decompress pubkey by calculating y = sqrt(x^3 + 7) % p 98 (see functions `secp256k1_eckey_pubkey_parse` and `secp256k1_ge_set_xo_var`). 99 """ 100 P = 2**256 - 2**32 - 977 # secp256k1 field size 101 assert len(compressed_pubkey) == 33 and compressed_pubkey[0] in (2, 3) 102 x = int.from_bytes(compressed_pubkey[1:], 'big') 103 rhs = (x**3 + 7) % P 104 y = pow(rhs, (P + 1)//4, P) # get sqrt using Tonelli-Shanks algorithm (for p % 4 = 3) 105 assert pow(y, 2, P) == rhs, f"pubkey is not on curve ({compressed_pubkey.hex()})" 106 tag_is_odd = compressed_pubkey[0] == 3 107 y_is_odd = (y & 1) == 1 108 if tag_is_odd != y_is_odd: # fix parity (even/odd) if necessary 109 y = P - y 110 return bytes([4]) + x.to_bytes(32, 'big') + y.to_bytes(32, 'big') 111 112 113 def main(): 114 parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) 115 parser.add_argument('infile', help='filename of compact-serialized UTXO set (input)') 116 parser.add_argument('outfile', help='filename of created SQLite3 database (output)') 117 parser.add_argument('--verbose', action='store_true', help='show details about each UTXO') 118 parser.add_argument('--spk', choices=['hex', 'raw'], default='hex', help='encode scriptPubKey as hex or raw bytes') 119 parser.add_argument('--txid', choices=['hex', 'raw', 'rawle'], default='hex', help='encode txid as hex, raw bytes (sha256 byteorder), or reversed raw bytes (little endian)') 120 args = parser.parse_args() 121 122 if not os.path.exists(args.infile): 123 print(f"Error: provided input file '{args.infile}' doesn't exist.") 124 sys.exit(1) 125 126 if os.path.exists(args.outfile): 127 print(f"Error: provided output file '{args.outfile}' already exists.") 128 sys.exit(1) 129 130 spk_hex = (args.spk == 'hex') 131 txid_hex = (args.txid == 'hex') 132 txid_reverse = (args.txid != 'raw') 133 134 # create database table 135 txid_fmt = "TEXT" if txid_hex else "BLOB" 136 spk_fmt = "TEXT" if spk_hex else "BLOB" 137 con = sqlite3.connect(args.outfile) 138 con.execute(f"CREATE TABLE utxos(txid {txid_fmt}, vout INT, value INT, coinbase INT, height INT, scriptpubkey {spk_fmt})") 139 140 # read metadata (magic bytes, version, network magic, block hash, UTXO count) 141 f = open(args.infile, 'rb') 142 magic_bytes = f.read(5) 143 version = int.from_bytes(f.read(2), 'little') 144 network_magic = f.read(4) 145 block_hash = f.read(32) 146 num_utxos = int.from_bytes(f.read(8), 'little') 147 if magic_bytes != UTXO_DUMP_MAGIC: 148 print(f"Error: provided input file '{args.infile}' is not an UTXO dump.") 149 sys.exit(1) 150 if version != UTXO_DUMP_VERSION: 151 print(f"Error: provided input file '{args.infile}' has unknown UTXO dump version {version} " 152 f"(only version {UTXO_DUMP_VERSION} supported)") 153 sys.exit(1) 154 network_string = NET_MAGIC_BYTES.get(network_magic, f"unknown network ({network_magic.hex()})") 155 print(f"UTXO Snapshot for {network_string} at block hash " 156 f"{block_hash[::-1].hex()[:32]}..., contains {num_utxos} coins") 157 158 start_time = time.time() 159 write_batch = [] 160 coins_per_hash_left = 0 161 prevout_hash = None 162 max_height = 0 163 164 for coin_idx in range(1, num_utxos+1): 165 # read key (COutPoint) 166 if coins_per_hash_left == 0: # read next prevout hash 167 prevout_hash = f.read(32) 168 coins_per_hash_left = read_compactsize(f) 169 prevout_index = read_compactsize(f) 170 # read value (Coin) 171 code = read_varint(f) 172 height = code >> 1 173 is_coinbase = code & 1 174 amount = decompress_amount(read_varint(f)) 175 scriptpubkey = decompress_script(f) 176 177 scriptpubkey_write = scriptpubkey.hex() if spk_hex else scriptpubkey 178 txid_write = prevout_hash[::-1] if txid_reverse else prevout_hash 179 txid_write = txid_write.hex() if txid_hex else txid_write 180 write_batch.append((txid_write, prevout_index, amount, is_coinbase, height, scriptpubkey_write)) 181 if height > max_height: 182 max_height = height 183 coins_per_hash_left -= 1 184 185 if args.verbose: 186 print(f"Coin {coin_idx}/{num_utxos}:") 187 print(f" prevout = {prevout_hash[::-1].hex()}:{prevout_index}") 188 print(f" amount = {amount}, height = {height}, coinbase = {is_coinbase}") 189 print(f" scriptPubKey = {scriptpubkey.hex()}\n") 190 191 if coin_idx % (16*1024) == 0 or coin_idx == num_utxos: 192 # write utxo batch to database 193 con.executemany("INSERT INTO utxos VALUES(?, ?, ?, ?, ?, ?)", write_batch) 194 con.commit() 195 write_batch.clear() 196 197 if coin_idx % (1024*1024) == 0: 198 elapsed = time.time() - start_time 199 print(f"{coin_idx} coins converted [{coin_idx/num_utxos*100:.2f}%], " + 200 f"{elapsed:.3f}s passed since start") 201 con.close() 202 203 print(f"TOTAL: {num_utxos} coins written to {args.outfile}, snapshot height is {max_height}.") 204 if f.read(1) != b'': # EOF should be reached by now 205 print(f"WARNING: input file {args.infile} has not reached EOF yet!") 206 sys.exit(1) 207 208 209 if __name__ == '__main__': 210 main()