/ test / functional / feature_dbcrash.py
feature_dbcrash.py
  1  #!/usr/bin/env python3
  2  # Copyright (c) 2017-2022 The Bitcoin Core developers
  3  # Distributed under the MIT software license, see the accompanying
  4  # file COPYING or http://www.opensource.org/licenses/mit-license.php.
  5  """Test recovery from a crash during chainstate writing.
  6  
  7  - 4 nodes
  8    * node0, node1, and node2 will have different dbcrash ratios, and different
  9      dbcache sizes
 10    * node3 will be a regular node, with no crashing.
 11    * The nodes will not connect to each other.
 12  
 13  - use default test framework starting chain. initialize starting_tip_height to
 14    tip height.
 15  
 16  - Main loop:
 17    * generate lots of transactions on node3, enough to fill up a block.
 18    * uniformly randomly pick a tip height from starting_tip_height to
 19      tip_height; with probability 1/(height_difference+4), invalidate this block.
 20    * mine enough blocks to overtake tip_height at start of loop.
 21    * for each node in [node0,node1,node2]:
 22       - for each mined block:
 23         * submit block to node
 24         * if node crashed on/after submitting:
 25           - restart until recovery succeeds
 26           - check that utxo matches node3 using gettxoutsetinfo"""
 27  
 28  import errno
 29  import http.client
 30  import random
 31  import time
 32  
 33  from test_framework.blocktools import COINBASE_MATURITY
 34  from test_framework.messages import (
 35      COIN,
 36  )
 37  from test_framework.test_framework import BitcoinTestFramework
 38  from test_framework.util import (
 39      assert_equal,
 40  )
 41  from test_framework.wallet import (
 42      MiniWallet,
 43      getnewdestination,
 44  )
 45  
 46  
 47  class ChainstateWriteCrashTest(BitcoinTestFramework):
 48      def set_test_params(self):
 49          self.num_nodes = 4
 50          self.rpc_timeout = 480
 51          self.supports_cli = False
 52  
 53          # Set -maxmempool=0 to turn off mempool memory sharing with dbcache
 54          self.base_args = [
 55              "-limitdescendantsize=0",
 56              "-maxmempool=0",
 57              "-dbbatchsize=200000",
 58          ]
 59  
 60          # Set different crash ratios and cache sizes.  Note that not all of
 61          # -dbcache goes to the in-memory coins cache.
 62          self.node0_args = ["-dbcrashratio=8", "-dbcache=4"] + self.base_args
 63          self.node1_args = ["-dbcrashratio=16", "-dbcache=8"] + self.base_args
 64          self.node2_args = ["-dbcrashratio=24", "-dbcache=16"] + self.base_args
 65  
 66          # Node3 is a normal node with default args, except will mine full blocks
 67          # and txs with "dust" outputs
 68          self.node3_args = ["-blockmaxweight=4000000", "-dustrelayfee=0"]
 69          self.extra_args = [self.node0_args, self.node1_args, self.node2_args, self.node3_args]
 70  
 71      def setup_network(self):
 72          self.add_nodes(self.num_nodes, extra_args=self.extra_args)
 73          self.start_nodes()
 74          # Leave them unconnected, we'll use submitblock directly in this test
 75  
 76      def restart_node(self, node_index, expected_tip):
 77          """Start up a given node id, wait for the tip to reach the given block hash, and calculate the utxo hash.
 78  
 79          Exceptions on startup should indicate node crash (due to -dbcrashratio), in which case we try again. Give up
 80          after 60 seconds. Returns the utxo hash of the given node."""
 81  
 82          time_start = time.time()
 83          while time.time() - time_start < 120:
 84              try:
 85                  # Any of these RPC calls could throw due to node crash
 86                  self.start_node(node_index)
 87                  self.nodes[node_index].waitforblock(expected_tip)
 88                  utxo_hash = self.nodes[node_index].gettxoutsetinfo()['hash_serialized_3']
 89                  return utxo_hash
 90              except Exception:
 91                  # An exception here should mean the node is about to crash.
 92                  # If bitcoind exits, then try again.  wait_for_node_exit()
 93                  # should raise an exception if bitcoind doesn't exit.
 94                  self.wait_for_node_exit(node_index, timeout=10)
 95              self.crashed_on_restart += 1
 96              time.sleep(1)
 97  
 98          # If we got here, bitcoind isn't coming back up on restart.  Could be a
 99          # bug in bitcoind, or we've gotten unlucky with our dbcrash ratio --
100          # perhaps we generated a test case that blew up our cache?
101          # TODO: If this happens a lot, we should try to restart without -dbcrashratio
102          # and make sure that recovery happens.
103          raise AssertionError(f"Unable to successfully restart node {node_index} in allotted time")
104  
105      def submit_block_catch_error(self, node_index, block):
106          """Try submitting a block to the given node.
107  
108          Catch any exceptions that indicate the node has crashed.
109          Returns true if the block was submitted successfully; false otherwise."""
110  
111          try:
112              self.nodes[node_index].submitblock(block)
113              return True
114          except (http.client.CannotSendRequest, http.client.RemoteDisconnected) as e:
115              self.log.debug(f"node {node_index} submitblock raised exception: {e}")
116              return False
117          except OSError as e:
118              self.log.debug(f"node {node_index} submitblock raised OSError exception: errno={e.errno}")
119              if e.errno in [errno.EPIPE, errno.ECONNREFUSED, errno.ECONNRESET]:
120                  # The node has likely crashed
121                  return False
122              else:
123                  # Unexpected exception, raise
124                  raise
125  
126      def sync_node3blocks(self, block_hashes):
127          """Use submitblock to sync node3's chain with the other nodes
128  
129          If submitblock fails, restart the node and get the new utxo hash.
130          If any nodes crash while updating, we'll compare utxo hashes to
131          ensure recovery was successful."""
132  
133          node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_3']
134  
135          # Retrieve all the blocks from node3
136          blocks = []
137          for block_hash in block_hashes:
138              blocks.append([block_hash, self.nodes[3].getblock(block_hash, 0)])
139  
140          # Deliver each block to each other node
141          for i in range(3):
142              nodei_utxo_hash = None
143              self.log.debug(f"Syncing blocks to node {i}")
144              for (block_hash, block) in blocks:
145                  # Get the block from node3, and submit to node_i
146                  self.log.debug(f"submitting block {block_hash}")
147                  if not self.submit_block_catch_error(i, block):
148                      # TODO: more carefully check that the crash is due to -dbcrashratio
149                      # (change the exit code perhaps, and check that here?)
150                      self.wait_for_node_exit(i, timeout=30)
151                      self.log.debug(f"Restarting node {i} after block hash {block_hash}")
152                      nodei_utxo_hash = self.restart_node(i, block_hash)
153                      assert nodei_utxo_hash is not None
154                      self.restart_counts[i] += 1
155                  else:
156                      # Clear it out after successful submitblock calls -- the cached
157                      # utxo hash will no longer be correct
158                      nodei_utxo_hash = None
159  
160              # Check that the utxo hash matches node3's utxo set
161              # NOTE: we only check the utxo set if we had to restart the node
162              # after the last block submitted:
163              # - checking the utxo hash causes a cache flush, which we don't
164              # want to do every time; so
165              # - we only update the utxo cache after a node restart, since flushing
166              # the cache is a no-op at that point
167              if nodei_utxo_hash is not None:
168                  self.log.debug(f"Checking txoutsetinfo matches for node {i}")
169                  assert_equal(nodei_utxo_hash, node3_utxo_hash)
170  
171      def verify_utxo_hash(self):
172          """Verify that the utxo hash of each node matches node3.
173  
174          Restart any nodes that crash while querying."""
175          node3_utxo_hash = self.nodes[3].gettxoutsetinfo()['hash_serialized_3']
176          self.log.info("Verifying utxo hash matches for all nodes")
177  
178          for i in range(3):
179              try:
180                  nodei_utxo_hash = self.nodes[i].gettxoutsetinfo()['hash_serialized_3']
181              except OSError:
182                  # probably a crash on db flushing
183                  nodei_utxo_hash = self.restart_node(i, self.nodes[3].getbestblockhash())
184              assert_equal(nodei_utxo_hash, node3_utxo_hash)
185  
186      def generate_small_transactions(self, node, count, utxo_list):
187          FEE = 1000  # TODO: replace this with node relay fee based calculation
188          num_transactions = 0
189          random.shuffle(utxo_list)
190          while len(utxo_list) >= 2 and num_transactions < count:
191              utxos_to_spend = [utxo_list.pop() for _ in range(2)]
192              input_amount = int(sum([utxo['value'] for utxo in utxos_to_spend]) * COIN)
193              if input_amount < FEE:
194                  # Sanity check -- if we chose inputs that are too small, skip
195                  continue
196  
197              self.wallet.send_self_transfer_multi(
198                  from_node=node,
199                  utxos_to_spend=utxos_to_spend,
200                  num_outputs=3,
201                  fee_per_output=FEE // 3,
202              )
203              num_transactions += 1
204  
205      def run_test(self):
206          self.wallet = MiniWallet(self.nodes[3])
207          initial_height = self.nodes[3].getblockcount()
208          self.generate(self.nodes[3], COINBASE_MATURITY, sync_fun=self.no_op)
209  
210          # Track test coverage statistics
211          self.restart_counts = [0, 0, 0]  # Track the restarts for nodes 0-2
212          self.crashed_on_restart = 0      # Track count of crashes during recovery
213  
214          # Start by creating a lot of utxos on node3
215          utxo_list = []
216          for _ in range(5):
217              utxo_list.extend(self.wallet.send_self_transfer_multi(from_node=self.nodes[3], num_outputs=1000)['new_utxos'])
218          self.generate(self.nodes[3], 1, sync_fun=self.no_op)
219          assert_equal(len(self.nodes[3].getrawmempool()), 0)
220          self.log.info(f"Prepped {len(utxo_list)} utxo entries")
221  
222          # Sync these blocks with the other nodes
223          block_hashes_to_sync = []
224          for height in range(initial_height + 1, self.nodes[3].getblockcount() + 1):
225              block_hashes_to_sync.append(self.nodes[3].getblockhash(height))
226  
227          self.log.debug(f"Syncing {len(block_hashes_to_sync)} blocks with other nodes")
228          # Syncing the blocks could cause nodes to crash, so the test begins here.
229          self.sync_node3blocks(block_hashes_to_sync)
230  
231          starting_tip_height = self.nodes[3].getblockcount()
232  
233          # Main test loop:
234          # each time through the loop, generate a bunch of transactions,
235          # and then either mine a single new block on the tip, or some-sized reorg.
236          for i in range(40):
237              self.log.info(f"Iteration {i}, generating 2500 transactions {self.restart_counts}")
238              # Generate a bunch of small-ish transactions
239              self.generate_small_transactions(self.nodes[3], 2500, utxo_list)
240              # Pick a random block between current tip, and starting tip
241              current_height = self.nodes[3].getblockcount()
242              random_height = random.randint(starting_tip_height, current_height)
243              self.log.debug(f"At height {current_height}, considering height {random_height}")
244              if random_height > starting_tip_height:
245                  # Randomly reorg from this point with some probability (1/4 for
246                  # tip, 1/5 for tip-1, ...)
247                  if random.random() < 1.0 / (current_height + 4 - random_height):
248                      self.log.debug(f"Invalidating block at height {random_height}")
249                      self.nodes[3].invalidateblock(self.nodes[3].getblockhash(random_height))
250  
251              # Now generate new blocks until we pass the old tip height
252              self.log.debug("Mining longer tip")
253              block_hashes = []
254              while current_height + 1 > self.nodes[3].getblockcount():
255                  block_hashes.extend(self.generatetoaddress(
256                      self.nodes[3],
257                      nblocks=min(10, current_height + 1 - self.nodes[3].getblockcount()),
258                      # new address to avoid mining a block that has just been invalidated
259                      address=getnewdestination()[2],
260                      sync_fun=self.no_op,
261                  ))
262              self.log.debug(f"Syncing {len(block_hashes)} new blocks...")
263              self.sync_node3blocks(block_hashes)
264              self.wallet.rescan_utxos()
265              utxo_list = self.wallet.get_utxos()
266              self.log.debug(f"MiniWallet utxo count: {len(utxo_list)}")
267  
268          # Check that the utxo hashes agree with node3
269          # Useful side effect: each utxo cache gets flushed here, so that we
270          # won't get crashes on shutdown at the end of the test.
271          self.verify_utxo_hash()
272  
273          # Check the test coverage
274          self.log.info(f"Restarted nodes: {self.restart_counts}; crashes on restart: {self.crashed_on_restart}")
275  
276          # If no nodes were restarted, we didn't test anything.
277          assert self.restart_counts != [0, 0, 0]
278  
279          # Make sure we tested the case of crash-during-recovery.
280          assert self.crashed_on_restart > 0
281  
282          # Warn if any of the nodes escaped restart.
283          for i in range(3):
284              if self.restart_counts[i] == 0:
285                  self.log.warning(f"Node {i} never crashed during utxo flush!")
286  
287  
288  if __name__ == "__main__":
289      ChainstateWriteCrashTest().main()