feature_dbcrash.py
#!/usr/bin/env python3
# Copyright (c) 2017-present The Bitcoin Core developers
# Distributed under the MIT software license, see the accompanying
# file COPYING or http://www.opensource.org/licenses/mit-license.php.
"""Test recovery from a crash during chainstate writing.

- 4 nodes
  * node0, node1, and node2 will have different dbcrash ratios, and different
    dbcache sizes
  * node3 will be a regular node, with no crashing.
  * The nodes will not connect to each other.

- use default test framework starting chain. initialize starting_tip_height to
  tip height.

- Main loop:
  * generate lots of transactions on node3, enough to fill up a block.
  * uniformly randomly pick a tip height from starting_tip_height to
    tip_height; with probability 1/(height_difference+4), invalidate this block.
  * mine enough blocks to overtake tip_height at start of loop.
  * for each node in [node0, node1, node2]:
     - for each mined block:
        * submit block to node
        * if node crashed on/after submitting:
           - restart until recovery succeeds
           - check that utxo matches node3 using gettxoutsetinfo"""

import errno
import http.client
import random
import time

from test_framework.blocktools import COINBASE_MATURITY
from test_framework.messages import (
    COIN,
)
from test_framework.test_framework import BitcoinTestFramework
from test_framework.util import (
    assert_not_equal,
    assert_equal,
)
from test_framework.wallet import (
    MiniWallet,
    getnewdestination,
)


class ChainstateWriteCrashTest(BitcoinTestFramework):
    def set_test_params(self):
        self.num_nodes = 4
        # Restarting crashed nodes and flushing caches can be slow; allow a
        # generous RPC timeout.
        self.rpc_timeout = 480
        self.supports_cli = False

        # Set -maxmempool=0 to turn off mempool memory sharing with dbcache
        self.base_args = [
            "-maxmempool=0",
            "-dbbatchsize=200000",
        ]

        # Set different crash ratios and cache sizes. Note that not all of
        # -dbcache goes to the in-memory coins cache.
        self.node0_args = ["-dbcrashratio=8", "-dbcache=4"] + self.base_args
        self.node1_args = ["-dbcrashratio=16", "-dbcache=8"] + self.base_args
        self.node2_args = ["-dbcrashratio=24", "-dbcache=16"] + self.base_args

        # Node3 is a normal node with default args, except will mine full blocks
        # and txs with "dust" outputs
        self.node3_args = ["-blockmaxweight=4000000", "-dustrelayfee=0"]
        self.extra_args = [self.node0_args, self.node1_args, self.node2_args, self.node3_args]

    def setup_network(self):
        self.add_nodes(self.num_nodes, extra_args=self.extra_args)
        self.start_nodes()
        # Leave them unconnected, we'll use submitblock directly in this test

    def restart_node(self, node_index, expected_tip):
        """Start up a given node id, wait for the tip to reach the given
        block hash, and calculate the utxo hash.

        Exceptions during startup or subsequent RPC calls should indicate a
        node crash (due to -dbcrashratio), in which case we try again. Give up
        after a timeout. Returns the utxo hash of the given node."""

        time_start = time.time()
        while time.time() - time_start < 120 * self.options.timeout_factor:
            try:
                # Any of these RPC calls could throw due to node crash
                self.start_node(node_index)
                self.nodes[node_index].waitforblock(expected_tip)
                utxo_hash = self.nodes[node_index].gettxoutsetinfo()['hash_serialized_3']
                return utxo_hash
            except Exception:
                # An exception here should mean the node is about to crash.
                # If bitcoind exits, then try again. wait_for_node_exit()
                # should raise an exception if bitcoind doesn't exit.
                self.wait_for_node_exit(node_index, timeout=10)
                self.crashed_on_restart += 1

        # If we got here, bitcoind isn't coming back up on restart. Could be a
        # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio --
        # perhaps we generated a test case that blew up our cache?
        # If this happens, the test should try to restart without -dbcrashratio
        # and make sure that recovery happens.
        raise AssertionError(f"Unable to successfully restart node {node_index} in allotted time")

    def submit_block_catch_error(self, node_index, block):
        """Try submitting a block to the given node.

        Catch any exceptions that indicate the node has crashed.
        Returns true if the block was submitted successfully; false otherwise."""

        try:
            self.nodes[node_index].submitblock(block)
            return True
        except (http.client.CannotSendRequest, http.client.RemoteDisconnected) as e:
            # The HTTP connection dropped mid-request -- consistent with the
            # node crashing while handling the submitblock call.
            self.log.debug(f"node {node_index} submitblock raised exception: {e}")
            return False
        except OSError as e:
            self.log.debug(f"node {node_index} submitblock raised OSError exception: errno={e.errno}")
            if e.errno in [errno.EPIPE, errno.ECONNREFUSED, errno.ECONNRESET]:
                # The node has likely crashed
                return False
            else:
                # Unexpected exception, raise
                raise

    def sync_node3blocks(self, block_hashes):
        """Use submitblock to sync node3's chain with the other nodes

        If submitblock fails, restart the node and get the new utxo hash.
        If any nodes crash while updating, we'll compare utxo hashes to
        ensure recovery was successful."""

        node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_3']

        # Retrieve all the blocks from node3
        blocks = []
        for block_hash in block_hashes:
            blocks.append([block_hash, self.nodes[3].getblock(block_hash, 0)])

        # Deliver each block to each other node
        for i in range(3):
            nodei_utxo_hash = None
            self.log.debug(f"Syncing blocks to node {i}")
            for (block_hash, block) in blocks:
                # Get the block from node3, and submit to node_i
                self.log.debug(f"submitting block {block_hash}")
                if not self.submit_block_catch_error(i, block):
                    # TODO: more carefully check that the crash is due to -dbcrashratio
                    # (change the exit code perhaps, and check that here?)
                    self.wait_for_node_exit(i, timeout=30)
                    self.log.debug(f"Restarting node {i} after block hash {block_hash}")
                    nodei_utxo_hash = self.restart_node(i, block_hash)
                    assert nodei_utxo_hash is not None
                    self.restart_counts[i] += 1
                else:
                    # Clear it out after successful submitblock calls -- the cached
                    # utxo hash will no longer be correct
                    nodei_utxo_hash = None

            # Check that the utxo hash matches node3's utxo set
            # NOTE: we only check the utxo set if we had to restart the node
            # after the last block submitted:
            # - checking the utxo hash causes a cache flush, which we don't
            # want to do every time; so
            # - we only update the utxo cache after a node restart, since flushing
            # the cache is a no-op at that point
            if nodei_utxo_hash is not None:
                self.log.debug(f"Checking txoutsetinfo matches for node {i}")
                assert_equal(nodei_utxo_hash, node3_utxo_hash)

    def verify_utxo_hash(self):
        """Verify that the utxo hash of each node matches node3.

        Restart any nodes that crash while querying."""
        node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_3']
        self.log.info("Verifying utxo hash matches for all nodes")

        for i in range(3):
            try:
                nodei_utxo_hash = self.nodes[i].gettxoutsetinfo()['hash_serialized_3']
            except OSError:
                # probably a crash on db flushing
                nodei_utxo_hash = self.restart_node(i, self.nodes[3].getbestblockhash())
            assert_equal(nodei_utxo_hash, node3_utxo_hash)

    def generate_small_transactions(self, node, count, utxo_list):
        """Create up to `count` transactions on `node`, each spending two
        utxos popped from `utxo_list` and creating three small outputs.

        Stops early if fewer than two utxos remain. Mutates `utxo_list`
        (shuffles it and pops spent entries)."""
        FEE = 1000
        num_transactions = 0
        random.shuffle(utxo_list)
        while len(utxo_list) >= 2 and num_transactions < count:
            utxos_to_spend = [utxo_list.pop() for _ in range(2)]
            # utxo 'value' is in BTC; convert to satoshis for the fee check
            input_amount = int(sum([utxo['value'] for utxo in utxos_to_spend]) * COIN)
            if input_amount < FEE:
                # Sanity check -- if we chose inputs that are too small, skip
                continue

            self.wallet.send_self_transfer_multi(
                from_node=node,
                utxos_to_spend=utxos_to_spend,
                num_outputs=3,
                fee_per_output=FEE // 3,
            )
            num_transactions += 1

    def run_test(self):
        self.wallet = MiniWallet(self.nodes[3])
        initial_height = self.nodes[3].getblockcount()
        # Mature the wallet's coinbases so they are spendable below.
        self.generate(self.nodes[3], COINBASE_MATURITY, sync_fun=self.no_op)

        # Track test coverage statistics
        self.restart_counts = [0, 0, 0]  # Track the restarts for nodes 0-2
        self.crashed_on_restart = 0  # Track count of crashes during recovery

        # Start by creating a lot of utxos on node3
        utxo_list = []
        for _ in range(5):
            utxo_list.extend(self.wallet.send_self_transfer_multi(from_node=self.nodes[3], num_outputs=1000)['new_utxos'])
            self.generate(self.nodes[3], 1, sync_fun=self.no_op)
        assert_equal(len(self.nodes[3].getrawmempool()), 0)
        self.log.info(f"Prepped {len(utxo_list)} utxo entries")

        # Sync these blocks with the other nodes
        block_hashes_to_sync = []
        for height in range(initial_height + 1, self.nodes[3].getblockcount() + 1):
            block_hashes_to_sync.append(self.nodes[3].getblockhash(height))

        self.log.debug(f"Syncing {len(block_hashes_to_sync)} blocks with other nodes")
        # Syncing the blocks could cause nodes to crash, so the test begins here.
        self.sync_node3blocks(block_hashes_to_sync)

        starting_tip_height = self.nodes[3].getblockcount()

        # Main test loop:
        # each time through the loop, generate a bunch of transactions,
        # and then either mine a single new block on the tip, or some-sized reorg.
        for i in range(40):
            self.log.info(f"Iteration {i}, generating 2500 transactions {self.restart_counts}")
            # Generate a bunch of small-ish transactions
            self.generate_small_transactions(self.nodes[3], 2500, utxo_list)
            # Pick a random block between current tip, and starting tip
            current_height = self.nodes[3].getblockcount()
            random_height = random.randint(starting_tip_height, current_height)
            self.log.debug(f"At height {current_height}, considering height {random_height}")
            if random_height > starting_tip_height:
                # Randomly reorg from this point with some probability (1/4 for
                # tip, 1/5 for tip-1, ...)
                if random.random() < 1.0 / (current_height + 4 - random_height):
                    self.log.debug(f"Invalidating block at height {random_height}")
                    self.nodes[3].invalidateblock(self.nodes[3].getblockhash(random_height))

            # Now generate new blocks until we pass the old tip height
            self.log.debug("Mining longer tip")
            block_hashes = []
            while current_height + 1 > self.nodes[3].getblockcount():
                block_hashes.extend(self.generatetoaddress(
                    self.nodes[3],
                    nblocks=min(10, current_height + 1 - self.nodes[3].getblockcount()),
                    # new address to avoid mining a block that has just been invalidated
                    address=getnewdestination()[2],
                    sync_fun=self.no_op,
                ))
            self.log.debug(f"Syncing {len(block_hashes)} new blocks...")
            self.sync_node3blocks(block_hashes)
            # Refresh the wallet's view of spendable utxos after the (possible)
            # reorg and newly mined blocks.
            self.wallet.rescan_utxos()
            utxo_list = self.wallet.get_utxos()
            self.log.debug(f"MiniWallet utxo count: {len(utxo_list)}")

        # Check that the utxo hashes agree with node3
        # Useful side effect: each utxo cache gets flushed here, so that we
        # won't get crashes on shutdown at the end of the test.
        self.verify_utxo_hash()

        # Check the test coverage
        self.log.info(f"Restarted nodes: {self.restart_counts}; crashes on restart: {self.crashed_on_restart}")

        # If no nodes were restarted, we didn't test anything.
        assert_not_equal(self.restart_counts, [0, 0, 0])

        # Make sure we tested the case of crash-during-recovery.
        assert self.crashed_on_restart > 0

        # Warn if any of the nodes escaped restart.
        for i in range(3):
            if self.restart_counts[i] == 0:
                self.log.warning(f"Node {i} never crashed during utxo flush!")


if __name__ == "__main__":
    ChainstateWriteCrashTest(__file__).main()