test/functional/feature_dbcrash.py
  1  #!/usr/bin/env python3
  2  # Copyright (c) 2017-present The Bitcoin Core developers
  3  # Distributed under the MIT software license, see the accompanying
  4  # file COPYING or http://www.opensource.org/licenses/mit-license.php.
  5  """Test recovery from a crash during chainstate writing.
  6  
  7  - 4 nodes
  8    * node0, node1, and node2 will have different dbcrash ratios, and different
  9      dbcache sizes
 10    * node3 will be a regular node, with no crashing.
 11    * The nodes will not connect to each other.
 12  
 13  - use default test framework starting chain. initialize starting_tip_height to
 14    tip height.
 15  
 16  - Main loop:
 17    * generate lots of transactions on node3, enough to fill up a block.
 18    * uniformly randomly pick a tip height from starting_tip_height to
 19      tip_height; with probability 1/(height_difference+4), invalidate this block.
 20    * mine enough blocks to overtake tip_height at start of loop.
 21    * for each node in [node0,node1,node2]:
 22       - for each mined block:
 23         * submit block to node
 24         * if node crashed on/after submitting:
 25           - restart until recovery succeeds
 26           - check that utxo matches node3 using gettxoutsetinfo"""
 27  
 28  import errno
 29  import http.client
 30  import random
 31  import time
 32  
 33  from test_framework.blocktools import COINBASE_MATURITY
 34  from test_framework.messages import (
 35      COIN,
 36  )
 37  from test_framework.test_framework import BitcoinTestFramework
 38  from test_framework.util import (
 39      assert_not_equal,
 40      assert_equal,
 41  )
 42  from test_framework.wallet import (
 43      MiniWallet,
 44      getnewdestination,
 45  )
 46  
 47  
 48  class ChainstateWriteCrashTest(BitcoinTestFramework):
 49      def set_test_params(self):
 50          self.num_nodes = 4
 51          self.rpc_timeout = 480
 52          self.supports_cli = False
 53  
 54          # Set -maxmempool=0 to turn off mempool memory sharing with dbcache
 55          self.base_args = [
 56              "-maxmempool=0",
 57              "-dbbatchsize=200000",
 58          ]
 59  
 60          # Set different crash ratios and cache sizes.  Note that not all of
 61          # -dbcache goes to the in-memory coins cache.
 62          self.node0_args = ["-dbcrashratio=8", "-dbcache=4"] + self.base_args
 63          self.node1_args = ["-dbcrashratio=16", "-dbcache=8"] + self.base_args
 64          self.node2_args = ["-dbcrashratio=24", "-dbcache=16"] + self.base_args
 65  
 66          # Node3 is a normal node with default args, except will mine full blocks
 67          # and txs with "dust" outputs
 68          self.node3_args = ["-blockmaxweight=4000000", "-dustrelayfee=0"]
 69          self.extra_args = [self.node0_args, self.node1_args, self.node2_args, self.node3_args]
 70  
 71      def setup_network(self):
 72          self.add_nodes(self.num_nodes, extra_args=self.extra_args)
 73          self.start_nodes()
 74          # Leave them unconnected, we'll use submitblock directly in this test
 75  
 76      def restart_node(self, node_index, expected_tip):
 77          """Start up a given node id, wait for the tip to reach the given block hash, and calculate the utxo hash.
 78  
 79          Exceptions during startup or subsequent RPC calls should indicate a node crash (due to -dbcrashratio), in which case we try again. Give up
 80          after a timeout. Returns the utxo hash of the given node."""
 81  
 82          time_start = time.time()
 83          while time.time() - time_start < 120 * self.options.timeout_factor:
 84              try:
 85                  # Any of these RPC calls could throw due to node crash
 86                  self.start_node(node_index)
 87                  self.nodes[node_index].waitforblock(expected_tip)
 88                  utxo_hash = self.nodes[node_index].gettxoutsetinfo()['hash_serialized_3']
 89                  return utxo_hash
 90              except Exception:
 91                  # An exception here should mean the node is about to crash.
 92                  # If bitcoind exits, then try again.  wait_for_node_exit()
 93                  # should raise an exception if bitcoind doesn't exit.
 94                  self.wait_for_node_exit(node_index, timeout=10)
 95              self.crashed_on_restart += 1
 96  
 97          # If we got here, bitcoind isn't coming back up on restart.  Could be a
 98          # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio --
 99          # perhaps we generated a test case that blew up our cache?
100          # If this happens, the test should try to restart without -dbcrashratio
101          # and make sure that recovery happens.
102          raise AssertionError(f"Unable to successfully restart node {node_index} in allotted time")
103  
104      def submit_block_catch_error(self, node_index, block):
105          """Try submitting a block to the given node.
106  
107          Catch any exceptions that indicate the node has crashed.
108          Returns true if the block was submitted successfully; false otherwise."""
109  
110          try:
111              self.nodes[node_index].submitblock(block)
112              return True
113          except (http.client.CannotSendRequest, http.client.RemoteDisconnected) as e:
114              self.log.debug(f"node {node_index} submitblock raised exception: {e}")
115              return False
116          except OSError as e:
117              self.log.debug(f"node {node_index} submitblock raised OSError exception: errno={e.errno}")
118              if e.errno in [errno.EPIPE, errno.ECONNREFUSED, errno.ECONNRESET]:
119                  # The node has likely crashed
120                  return False
121              else:
122                  # Unexpected exception, raise
123                  raise
124  
125      def sync_node3blocks(self, block_hashes):
126          """Use submitblock to sync node3's chain with the other nodes
127  
128          If submitblock fails, restart the node and get the new utxo hash.
129          If any nodes crash while updating, we'll compare utxo hashes to
130          ensure recovery was successful."""
131  
132          node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_3']
133  
134          # Retrieve all the blocks from node3
135          blocks = []
136          for block_hash in block_hashes:
137              blocks.append([block_hash, self.nodes[3].getblock(block_hash, 0)])
138  
139          # Deliver each block to each other node
140          for i in range(3):
141              nodei_utxo_hash = None
142              self.log.debug(f"Syncing blocks to node {i}")
143              for (block_hash, block) in blocks:
144                  # Get the block from node3, and submit to node_i
145                  self.log.debug(f"submitting block {block_hash}")
146                  if not self.submit_block_catch_error(i, block):
147                      # TODO: more carefully check that the crash is due to -dbcrashratio
148                      # (change the exit code perhaps, and check that here?)
149                      self.wait_for_node_exit(i, timeout=30)
150                      self.log.debug(f"Restarting node {i} after block hash {block_hash}")
151                      nodei_utxo_hash = self.restart_node(i, block_hash)
152                      assert nodei_utxo_hash is not None
153                      self.restart_counts[i] += 1
154                  else:
155                      # Clear it out after successful submitblock calls -- the cached
156                      # utxo hash will no longer be correct
157                      nodei_utxo_hash = None
158  
159              # Check that the utxo hash matches node3's utxo set
160              # NOTE: we only check the utxo set if we had to restart the node
161              # after the last block submitted:
162              # - checking the utxo hash causes a cache flush, which we don't
163              # want to do every time; so
164              # - we only update the utxo cache after a node restart, since flushing
165              # the cache is a no-op at that point
166              if nodei_utxo_hash is not None:
167                  self.log.debug(f"Checking txoutsetinfo matches for node {i}")
168                  assert_equal(nodei_utxo_hash, node3_utxo_hash)
169  
170      def verify_utxo_hash(self):
171          """Verify that the utxo hash of each node matches node3.
172  
173          Restart any nodes that crash while querying."""
174          node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_3']
175          self.log.info("Verifying utxo hash matches for all nodes")
176  
177          for i in range(3):
178              try:
179                  nodei_utxo_hash = self.nodes[i].gettxoutsetinfo()['hash_serialized_3']
180              except OSError:
181                  # probably a crash on db flushing
182                  nodei_utxo_hash = self.restart_node(i, self.nodes[3].getbestblockhash())
183              assert_equal(nodei_utxo_hash, node3_utxo_hash)
184  
185      def generate_small_transactions(self, node, count, utxo_list):
186          FEE = 1000
187          num_transactions = 0
188          random.shuffle(utxo_list)
189          while len(utxo_list) >= 2 and num_transactions < count:
190              utxos_to_spend = [utxo_list.pop() for _ in range(2)]
191              input_amount = int(sum([utxo['value'] for utxo in utxos_to_spend]) * COIN)
192              if input_amount < FEE:
193                  # Sanity check -- if we chose inputs that are too small, skip
194                  continue
195  
196              self.wallet.send_self_transfer_multi(
197                  from_node=node,
198                  utxos_to_spend=utxos_to_spend,
199                  num_outputs=3,
200                  fee_per_output=FEE // 3,
201              )
202              num_transactions += 1
203  
    def run_test(self):
        """Drive the crash test: build a utxo pool on node3, then repeatedly
        generate transactions, mine (and occasionally reorg) blocks, and push
        them to the crash-prone nodes 0-2, checking utxo-set recovery."""
        self.wallet = MiniWallet(self.nodes[3])
        initial_height = self.nodes[3].getblockcount()
        # Mature a batch of coinbases so node3's wallet has spendable funds.
        self.generate(self.nodes[3], COINBASE_MATURITY, sync_fun=self.no_op)

        # Track test coverage statistics
        self.restart_counts = [0, 0, 0]  # Track the restarts for nodes 0-2
        self.crashed_on_restart = 0      # Track count of crashes during recovery

        # Start by creating a lot of utxos on node3
        utxo_list = []
        for _ in range(5):
            utxo_list.extend(self.wallet.send_self_transfer_multi(from_node=self.nodes[3], num_outputs=1000)['new_utxos'])
        self.generate(self.nodes[3], 1, sync_fun=self.no_op)
        assert_equal(len(self.nodes[3].getrawmempool()), 0)
        self.log.info(f"Prepped {len(utxo_list)} utxo entries")

        # Sync these blocks with the other nodes
        block_hashes_to_sync = []
        for height in range(initial_height + 1, self.nodes[3].getblockcount() + 1):
            block_hashes_to_sync.append(self.nodes[3].getblockhash(height))

        self.log.debug(f"Syncing {len(block_hashes_to_sync)} blocks with other nodes")
        # Syncing the blocks could cause nodes to crash, so the test begins here.
        self.sync_node3blocks(block_hashes_to_sync)

        starting_tip_height = self.nodes[3].getblockcount()

        # Main test loop:
        # each time through the loop, generate a bunch of transactions,
        # and then either mine a single new block on the tip, or some-sized reorg.
        for i in range(40):
            self.log.info(f"Iteration {i}, generating 2500 transactions {self.restart_counts}")
            # Generate a bunch of small-ish transactions
            self.generate_small_transactions(self.nodes[3], 2500, utxo_list)
            # Pick a random block between current tip, and starting tip
            current_height = self.nodes[3].getblockcount()
            random_height = random.randint(starting_tip_height, current_height)
            self.log.debug(f"At height {current_height}, considering height {random_height}")
            if random_height > starting_tip_height:
                # Randomly reorg from this point with some probability (1/4 for
                # tip, 1/5 for tip-1, ...)
                if random.random() < 1.0 / (current_height + 4 - random_height):
                    self.log.debug(f"Invalidating block at height {random_height}")
                    self.nodes[3].invalidateblock(self.nodes[3].getblockhash(random_height))

            # Now generate new blocks until we pass the old tip height
            self.log.debug("Mining longer tip")
            block_hashes = []
            while current_height + 1 > self.nodes[3].getblockcount():
                block_hashes.extend(self.generatetoaddress(
                    self.nodes[3],
                    nblocks=min(10, current_height + 1 - self.nodes[3].getblockcount()),
                    # new address to avoid mining a block that has just been invalidated
                    address=getnewdestination()[2],
                    sync_fun=self.no_op,
                ))
            self.log.debug(f"Syncing {len(block_hashes)} new blocks...")
            self.sync_node3blocks(block_hashes)
            # Refresh the wallet's utxo view after mining/reorgs so the next
            # iteration spends coins that actually exist on node3's chain.
            self.wallet.rescan_utxos()
            utxo_list = self.wallet.get_utxos()
            self.log.debug(f"MiniWallet utxo count: {len(utxo_list)}")

        # Check that the utxo hashes agree with node3
        # Useful side effect: each utxo cache gets flushed here, so that we
        # won't get crashes on shutdown at the end of the test.
        self.verify_utxo_hash()

        # Check the test coverage
        self.log.info(f"Restarted nodes: {self.restart_counts}; crashes on restart: {self.crashed_on_restart}")

        # If no nodes were restarted, we didn't test anything.
        assert_not_equal(self.restart_counts, [0, 0, 0])

        # Make sure we tested the case of crash-during-recovery.
        assert self.crashed_on_restart > 0

        # Warn if any of the nodes escaped restart.
        for i in range(3):
            if self.restart_counts[i] == 0:
                self.log.warning(f"Node {i} never crashed during utxo flush!")
285  
286  
# Script entry point: run the test with this file's path for framework setup.
if __name__ == "__main__":
    ChainstateWriteCrashTest(__file__).main()