# feature_dbcrash.py
#!/usr/bin/env python3
# Copyright (c) 2017-2022 The Bitcoin Core developers
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
"""Test recovery from a crash during chainstate writing.

- 4 nodes
  * node0, node1, and node2 will have different dbcrash ratios, and different
    dbcache sizes
  * node3 will be a regular node, with no crashing.
  * The nodes will not connect to each other.

- use default test framework starting chain. initialize starting_tip_height to
  tip height.

- Main loop:
  * generate lots of transactions on node3, enough to fill up a block.
  * uniformly randomly pick a tip height from starting_tip_height to
    tip_height; with probability 1/(height_difference+4), invalidate this block.
  * mine enough blocks to overtake tip_height at start of loop.
  * for each node in [node0,node1,node2]:
     - for each mined block:
       * submit block to node
       * if node crashed on/after submitting:
         - restart until recovery succeeds
         - check that utxo matches node3 using gettxoutsetinfo"""

import errno
import http.client
import random
import time

from test_framework.blocktools import COINBASE_MATURITY
from test_framework.messages import (
    COIN,
)
from test_framework.test_framework import BitcoinTestFramework
from test_framework.util import (
    assert_equal,
)
from test_framework.wallet import (
    MiniWallet,
    getnewdestination,
)


class ChainstateWriteCrashTest(BitcoinTestFramework):
    def set_test_params(self):
        self.num_nodes = 4
        self.rpc_timeout = 480
        self.supports_cli = False

        # Set -maxmempool=0 to turn off mempool memory sharing with dbcache
        self.base_args = [
            "-limitdescendantsize=0",
            "-maxmempool=0",
            "-dbbatchsize=200000",
        ]

        # Set different crash ratios and cache sizes. Note that not all of
        # -dbcache goes to the in-memory coins cache.
        self.node0_args = ["-dbcrashratio=8", "-dbcache=4"] + self.base_args
        self.node1_args = ["-dbcrashratio=16", "-dbcache=8"] + self.base_args
        self.node2_args = ["-dbcrashratio=24", "-dbcache=16"] + self.base_args

        # Node3 is a normal node with default args, except will mine full blocks
        # and txs with "dust" outputs
        self.node3_args = ["-blockmaxweight=4000000", "-dustrelayfee=0"]
        self.extra_args = [self.node0_args, self.node1_args, self.node2_args, self.node3_args]

    def setup_network(self):
        self.add_nodes(self.num_nodes, extra_args=self.extra_args)
        self.start_nodes()
        # Leave them unconnected, we'll use submitblock directly in this test

    def restart_node(self, node_index, expected_tip):
        """Start up a given node id, wait for the tip to reach the given block hash, and calculate the utxo hash.

        Exceptions on startup should indicate node crash (due to -dbcrashratio), in which case we try again. Give up
        after 120 seconds. Returns the utxo hash of the given node."""

        time_start = time.time()
        while time.time() - time_start < 120:
            try:
                # Any of these RPC calls could throw due to node crash
                self.start_node(node_index)
                self.nodes[node_index].waitforblock(expected_tip)
                utxo_hash = self.nodes[node_index].gettxoutsetinfo()['hash_serialized_3']
                return utxo_hash
            except Exception:
                # An exception here should mean the node is about to crash.
                # If bitcoind exits, then try again. wait_for_node_exit()
                # should raise an exception if bitcoind doesn't exit.
                self.wait_for_node_exit(node_index, timeout=10)
                self.crashed_on_restart += 1
                time.sleep(1)

        # If we got here, bitcoind isn't coming back up on restart. Could be a
        # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio --
        # perhaps we generated a test case that blew up our cache?
        # TODO: If this happens a lot, we should try to restart without -dbcrashratio
        # and make sure that recovery happens.
        raise AssertionError(f"Unable to successfully restart node {node_index} in allotted time")

    def submit_block_catch_error(self, node_index, block):
        """Try submitting a block to the given node.

        Catch any exceptions that indicate the node has crashed.
        Returns true if the block was submitted successfully; false otherwise."""

        try:
            self.nodes[node_index].submitblock(block)
            return True
        except (http.client.CannotSendRequest, http.client.RemoteDisconnected) as e:
            self.log.debug(f"node {node_index} submitblock raised exception: {e}")
            return False
        except OSError as e:
            self.log.debug(f"node {node_index} submitblock raised OSError exception: errno={e.errno}")
            if e.errno in [errno.EPIPE, errno.ECONNREFUSED, errno.ECONNRESET]:
                # The node has likely crashed
                return False
            else:
                # Unexpected exception, raise
                raise

    def sync_node3blocks(self, block_hashes):
        """Use submitblock to sync node3's chain with the other nodes

        If submitblock fails, restart the node and get the new utxo hash.
        If any nodes crash while updating, we'll compare utxo hashes to
        ensure recovery was successful."""

        node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_3']

        # Retrieve all the blocks from node3
        blocks = []
        for block_hash in block_hashes:
            blocks.append([block_hash, self.nodes[3].getblock(block_hash, 0)])

        # Deliver each block to each other node
        for i in range(3):
            nodei_utxo_hash = None
            self.log.debug(f"Syncing blocks to node {i}")
            for (block_hash, block) in blocks:
                # Get the block from node3, and submit to node_i
                self.log.debug(f"submitting block {block_hash}")
                if not self.submit_block_catch_error(i, block):
                    # TODO: more carefully check that the crash is due to -dbcrashratio
                    # (change the exit code perhaps, and check that here?)
                    self.wait_for_node_exit(i, timeout=30)
                    self.log.debug(f"Restarting node {i} after block hash {block_hash}")
                    nodei_utxo_hash = self.restart_node(i, block_hash)
                    assert nodei_utxo_hash is not None
                    self.restart_counts[i] += 1
                else:
                    # Clear it out after successful submitblock calls -- the cached
                    # utxo hash will no longer be correct
                    nodei_utxo_hash = None

            # Check that the utxo hash matches node3's utxo set
            # NOTE: we only check the utxo set if we had to restart the node
            # after the last block submitted:
            # - checking the utxo hash causes a cache flush, which we don't
            # want to do every time; so
            # - we only update the utxo cache after a node restart, since flushing
            # the cache is a no-op at that point
            if nodei_utxo_hash is not None:
                self.log.debug(f"Checking txoutsetinfo matches for node {i}")
                assert_equal(nodei_utxo_hash, node3_utxo_hash)

    def verify_utxo_hash(self):
        """Verify that the utxo hash of each node matches node3.

        Restart any nodes that crash while querying."""
        node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_3']
        self.log.info("Verifying utxo hash matches for all nodes")

        for i in range(3):
            try:
                nodei_utxo_hash = self.nodes[i].gettxoutsetinfo()['hash_serialized_3']
            except OSError:
                # probably a crash on db flushing
                nodei_utxo_hash = self.restart_node(i, self.nodes[3].getbestblockhash())
            assert_equal(nodei_utxo_hash, node3_utxo_hash)

    def generate_small_transactions(self, node, count, utxo_list):
        """Create up to `count` small 2-in/3-out transactions on `node`,
        consuming entries from `utxo_list` (which is shuffled and mutated
        in place). Inputs too small to cover the fee are skipped."""
        FEE = 1000  # TODO: replace this with node relay fee based calculation
        num_transactions = 0
        random.shuffle(utxo_list)
        while len(utxo_list) >= 2 and num_transactions < count:
            utxos_to_spend = [utxo_list.pop() for _ in range(2)]
            input_amount = int(sum([utxo['value'] for utxo in utxos_to_spend]) * COIN)
            if input_amount < FEE:
                # Sanity check -- if we chose inputs that are too small, skip
                continue

            self.wallet.send_self_transfer_multi(
                from_node=node,
                utxos_to_spend=utxos_to_spend,
                num_outputs=3,
                fee_per_output=FEE // 3,
            )
            num_transactions += 1

    def run_test(self):
        self.wallet = MiniWallet(self.nodes[3])
        initial_height = self.nodes[3].getblockcount()
        self.generate(self.nodes[3], COINBASE_MATURITY, sync_fun=self.no_op)

        # Track test coverage statistics
        self.restart_counts = [0, 0, 0]  # Track the restarts for nodes 0-2
        self.crashed_on_restart = 0  # Track count of crashes during recovery

        # Start by creating a lot of utxos on node3
        utxo_list = []
        for _ in range(5):
            utxo_list.extend(self.wallet.send_self_transfer_multi(from_node=self.nodes[3], num_outputs=1000)['new_utxos'])
            self.generate(self.nodes[3], 1, sync_fun=self.no_op)
        assert_equal(len(self.nodes[3].getrawmempool()), 0)
        self.log.info(f"Prepped {len(utxo_list)} utxo entries")

        # Sync these blocks with the other nodes
        block_hashes_to_sync = []
        for height in range(initial_height + 1, self.nodes[3].getblockcount() + 1):
            block_hashes_to_sync.append(self.nodes[3].getblockhash(height))

        self.log.debug(f"Syncing {len(block_hashes_to_sync)} blocks with other nodes")
        # Syncing the blocks could cause nodes to crash, so the test begins here.
        self.sync_node3blocks(block_hashes_to_sync)

        starting_tip_height = self.nodes[3].getblockcount()

        # Main test loop:
        # each time through the loop, generate a bunch of transactions,
        # and then either mine a single new block on the tip, or some-sized reorg.
        for i in range(40):
            self.log.info(f"Iteration {i}, generating 2500 transactions {self.restart_counts}")
            # Generate a bunch of small-ish transactions
            self.generate_small_transactions(self.nodes[3], 2500, utxo_list)
            # Pick a random block between current tip, and starting tip
            current_height = self.nodes[3].getblockcount()
            random_height = random.randint(starting_tip_height, current_height)
            self.log.debug(f"At height {current_height}, considering height {random_height}")
            if random_height > starting_tip_height:
                # Randomly reorg from this point with some probability (1/4 for
                # tip, 1/5 for tip-1, ...)
                if random.random() < 1.0 / (current_height + 4 - random_height):
                    self.log.debug(f"Invalidating block at height {random_height}")
                    self.nodes[3].invalidateblock(self.nodes[3].getblockhash(random_height))

            # Now generate new blocks until we pass the old tip height
            self.log.debug("Mining longer tip")
            block_hashes = []
            while current_height + 1 > self.nodes[3].getblockcount():
                block_hashes.extend(self.generatetoaddress(
                    self.nodes[3],
                    nblocks=min(10, current_height + 1 - self.nodes[3].getblockcount()),
                    # new address to avoid mining a block that has just been invalidated
                    address=getnewdestination()[2],
                    sync_fun=self.no_op,
                ))
            self.log.debug(f"Syncing {len(block_hashes)} new blocks...")
            self.sync_node3blocks(block_hashes)
            self.wallet.rescan_utxos()
            utxo_list = self.wallet.get_utxos()
            self.log.debug(f"MiniWallet utxo count: {len(utxo_list)}")

        # Check that the utxo hashes agree with node3
        # Useful side effect: each utxo cache gets flushed here, so that we
        # won't get crashes on shutdown at the end of the test.
        self.verify_utxo_hash()

        # Check the test coverage
        self.log.info(f"Restarted nodes: {self.restart_counts}; crashes on restart: {self.crashed_on_restart}")

        # If no nodes were restarted, we didn't test anything.
        assert self.restart_counts != [0, 0, 0]

        # Make sure we tested the case of crash-during-recovery.
        assert self.crashed_on_restart > 0

        # Warn if any of the nodes escaped restart.
        for i in range(3):
            if self.restart_counts[i] == 0:
                self.log.warning(f"Node {i} never crashed during utxo flush!")


if __name__ == "__main__":
    ChainstateWriteCrashTest().main()