bdb.py
  1  #!/usr/bin/env python3
  2  # Copyright (c) 2020-2021 The Bitcoin Core developers
  3  # Distributed under the MIT software license, see the accompanying
  4  # file COPYING or http://www.opensource.org/licenses/mit-license.php.
  5  """
  6  Utilities for working directly with the wallet's BDB database file
  7  
  8  This is specific to the configuration of BDB used in this project:
  9      - pagesize: 4096 bytes
 10      - Outer database contains single subdatabase named 'main'
 11      - btree
 12      - btree leaf pages
 13  
Each key-value pair is stored as two consecutive entries in a btree leaf: the key comes first and
the entry that follows it is the value. Note that the entry data itself is not stored in the
correct order. Instead, the entry offsets are stored in the correct order, and those offsets are
needed to retrieve the data itself.
 18  
 19  Page format can be found in BDB source code dbinc/db_page.h
 20  This only implements the deserialization of btree metadata pages and normal btree pages. Overflow
 21  pages are not implemented but may be needed in the future if dealing with wallets with large
 22  transactions.
 23  
 24  `db_dump -da wallet.dat` is useful to see the data in a wallet.dat BDB file
 25  """
 26  
 27  import struct
 28  
 29  # Important constants
 30  PAGESIZE = 4096
 31  OUTER_META_PAGE = 0
 32  INNER_META_PAGE = 2
 33  
 34  # Page type values
 35  BTREE_INTERNAL = 3
 36  BTREE_LEAF = 5
 37  BTREE_META = 9
 38  
 39  # Some magic numbers for sanity checking
 40  BTREE_MAGIC = 0x053162
 41  DB_VERSION = 9
 42  
 43  # Deserializes a leaf page into a dict.
 44  # Btree internal pages have the same header, for those, return None.
 45  # For the btree leaf pages, deserialize them and put all the data into a dict
 46  def dump_leaf_page(data):
 47      page_info = {}
 48      page_header = data[0:26]
 49      _, pgno, prev_pgno, next_pgno, entries, hf_offset, level, pg_type = struct.unpack('QIIIHHBB', page_header)
 50      page_info['pgno'] = pgno
 51      page_info['prev_pgno'] = prev_pgno
 52      page_info['next_pgno'] = next_pgno
 53      page_info['hf_offset'] = hf_offset
 54      page_info['level'] = level
 55      page_info['pg_type'] = pg_type
 56      page_info['entry_offsets'] = struct.unpack('{}H'.format(entries), data[26:26 + entries * 2])
 57      page_info['entries'] = []
 58  
 59      if pg_type == BTREE_INTERNAL:
 60          # Skip internal pages. These are the internal nodes of the btree and don't contain anything relevant to us
 61          return None
 62  
 63      assert pg_type == BTREE_LEAF, 'A non-btree leaf page has been encountered while dumping leaves'
 64  
 65      for i in range(0, entries):
 66          offset = page_info['entry_offsets'][i]
 67          entry = {'offset': offset}
 68          page_data_header = data[offset:offset + 3]
 69          e_len, pg_type = struct.unpack('HB', page_data_header)
 70          entry['len'] = e_len
 71          entry['pg_type'] = pg_type
 72          entry['data'] = data[offset + 3:offset + 3 + e_len]
 73          page_info['entries'].append(entry)
 74  
 75      return page_info
 76  
 77  # Deserializes a btree metadata page into a dict.
 78  # Does a simple sanity check on the magic value, type, and version
 79  def dump_meta_page(page):
 80      # metadata page
 81      # general metadata
 82      metadata = {}
 83      meta_page = page[0:72]
 84      _, pgno, magic, version, pagesize, encrypt_alg, pg_type, metaflags, _, free, last_pgno, nparts, key_count, record_count, flags, uid = struct.unpack('QIIIIBBBBIIIIII20s', meta_page)
 85      metadata['pgno'] = pgno
 86      metadata['magic'] = magic
 87      metadata['version'] = version
 88      metadata['pagesize'] = pagesize
 89      metadata['encrypt_alg'] = encrypt_alg
 90      metadata['pg_type'] = pg_type
 91      metadata['metaflags'] = metaflags
 92      metadata['free'] = free
 93      metadata['last_pgno'] = last_pgno
 94      metadata['nparts'] = nparts
 95      metadata['key_count'] = key_count
 96      metadata['record_count'] = record_count
 97      metadata['flags'] = flags
 98      metadata['uid'] = uid.hex().encode()
 99  
100      assert magic == BTREE_MAGIC, 'bdb magic does not match bdb btree magic'
101      assert pg_type == BTREE_META, 'Metadata page is not a btree metadata page'
102      assert version == DB_VERSION, 'Database too new'
103  
104      # btree metadata
105      btree_meta_page = page[72:512]
106      _, minkey, re_len, re_pad, root, _, crypto_magic, _, iv, chksum = struct.unpack('IIIII368sI12s16s20s', btree_meta_page)
107      metadata['minkey'] = minkey
108      metadata['re_len'] = re_len
109      metadata['re_pad'] = re_pad
110      metadata['root'] = root
111      metadata['crypto_magic'] = crypto_magic
112      metadata['iv'] = iv.hex().encode()
113      metadata['chksum'] = chksum.hex().encode()
114  
115      return metadata
116  
# Given the dict from dump_leaf_page, build a dict that maps each key to its value.
# Entries come in pairs: the entry at an even position is a key and the entry right after it
# is the value for that key. A key that has no entry after it is mapped to b''.
def extract_kv_pairs(page_data):
    records = [item['data'] for item in page_data['entries']]
    pairs = {}
    for pos in range(0, len(records), 2):
        # Slicing yields an empty list when the final key has no value following it
        following = records[pos + 1:pos + 2]
        pairs[records[pos]] = following[0] if following else b''
    return pairs
129  
# Extract the key-value pairs of the BDB file given in filename.
# The file is read in PAGESIZE chunks, both metadata pages are sanity checked (dump_meta_page
# asserts on a mismatch), and the key-value pairs of every leaf page from index 3 onwards are
# merged into one dict, which is returned. If the same key shows up on more than one page, the
# value from the later page is kept.
def dump_bdb_kv(filename):
    # Read in the BDB file one page at a time; read() returns b'' once the end of the file is reached
    with open(filename, 'rb') as f:
        pages = list(iter(lambda: f.read(PAGESIZE), b''))

    # Sanity check the meta pages
    dump_meta_page(pages[OUTER_META_PAGE])
    dump_meta_page(pages[INNER_META_PAGE])

    # Fetch the kv pairs from the leaf pages. Pages 0 to 2 (which include both meta pages) are skipped
    kv = {}
    for page in pages[3:]:
        info = dump_leaf_page(page)
        if info is not None:
            # Merge in place instead of rebuilding the whole accumulated dict for every page
            kv.update(extract_kv_pairs(info))
    return kv
151      return kv