/ .ci / bench_p2p_sync.sh
bench_p2p_sync.sh
  1  #!/bin/bash
  2  
  3  ###########################################################
  4  # Measures a client syncing 1000 blocks from another client
  5  ###########################################################
  6  
  7  set -eo pipefail # error on any command failure
  8  
  9  network_id=1
 10  min_height=250
 11  
 12  # The total number of validators in the beacon committee.
 13  # This must match the number of validators used when generating the snapshot. 
 14  num_validators=40
 15  
 16  # The number of clients that are syncing
 17  # Note: Because the first indexes 0-39 are resevered for validators, the first client will have index 40.
 18  # The script works around this by manually setting the storage, ports, and log files for the clients. 
 19  num_clients=1
 20  
 21  # Adjust this to show more/less log messages
 22  log_filter="info,snarkos_node::client=trace,snarkos_node_sync=trace,snarkos_node_tcp=warn,snarkos_node_rest=warn"
 23  
 24  max_wait=2400 # Wait for up to 40 minutes
 25  poll_interval=1 # Check block heights every second
 26  
 27  # shellcheck source=SCRIPTDIR/utils.sh
 28  . ./.ci/utils.sh
 29  
 30  # Running sums for variance: use sum and sumsq for unbiased sample variance
 31  sum_speed=0
 32  sumsq_speed=0
 33  samples=0
 34  max_speed=0.0
 35  
 36  # Fetch sync speeds from clients via REST and accumulate stats
 37  function sample_sync_speeds() {
 38    for ((client_index = 1; client_index <= num_clients; client_index++)); do
 39      port=$((3030 + client_index))
 40      resp=$(curl -s "http://127.0.0.1:$port/$network_name/sync/status" || true)
 41  
 42      # Skip if response missing
 43      if [[ -z "$resp" ]]; then
 44        continue
 45      fi
 46  
 47      speed=$(echo "$resp" | jq -r '.sync_speed_bps')
 48  
 49      # Skip null or empty
 50      if [[ -z "$speed" ]] || [[ "$speed" == "null" ]]; then
 51        echo "Invalid speed value $speed"
 52        continue
 53      fi
 54  
 55      # Validate numeric (allow exponent)
 56      if ! (is_float "$speed"); then
 57          echo "Invalid speed value $speed"
 58         continue
 59      fi
 60  
 61      # Convert to fixed decimal for bc -l
 62      speed_dec=$(awk -v x="$speed" 'BEGIN{printf "%.12f", x}')
 63      if [[ -z "$speed_dec" ]]; then
 64        continue
 65      fi
 66  
 67      if (( $(echo "$speed > $max_speed" | bc -l) )); then
 68        max_speed=$speed
 69      fi
 70  
 71      # Accumulate using bc -l for floating point
 72      sum_speed=$(echo "$sum_speed + $speed_dec" | bc -l)
 73      sumsq_speed=$(echo "$sumsq_speed + ($speed_dec * $speed_dec)" | bc -l)
 74      samples=$((samples + 1))
 75    done
 76  }
 77  
 78  branch_name=$(git rev-parse --abbrev-ref HEAD)
 79  echo "On branch: ${branch_name}"
 80  
 81  network_name=$(get_network_name $network_id)
 82  echo "Using network: $network_name (ID: $network_id)"
 83  
 84  snapshot_info=$(<info.txt)
 85  echo "Snapshot_info: ${snapshot_info}"
 86  
 87  # Create log directory
 88  log_dir=".logs-$(date +"%Y%m%d%H%M%S")"
 89  mkdir -p "$log_dir"
 90  
 91  # Define a trap handler that cleans up all processes on exit.
 92  trap stop_nodes EXIT
 93  
 94  # Define a trap handler that prints a message when an error occurs.
 95  trap 'echo "â›”ī¸ Error in $BASH_SOURCE at line $LINENO: \"$BASH_COMMAND\" failed (exit $?)"' ERR
 96  
 97  # Shared flags between all nodes
 98  common_flags=(
 99    --nodisplay --nobanner --noupdater # reduce clutter in the output
100    "--log-filter=$log_filter" # only show the logs we care about
101    "--network=$network_id"
102    --nocdn # don't sync from CDN, so we only benchmark p2p sync
103    "--dev-num-validators=$num_validators"
104    --no-dev-txs
105  )
106  
# Seed client: the node that already has the ledger.
# TASKSET1 is left unquoted on purpose so it can expand to a command prefix
# (presumably a taskset invocation pinning the first two cores; it is not
# defined in this file).
seed_args=(
  "--dev=$num_validators" --client "${common_flags[@]}"
  "--logfile=$log_dir/client-0.log" "--storage=.ledger-$network_id-0"
  "--node=127.0.0.1:4130" "--rest=127.0.0.1:3030"
)
$TASKSET1 snarkos start "${seed_args[@]}" &
PIDS[0]=$!

# Syncing clients: each one peers with the node started just before it
# (P2P port 4130+i-1) and exposes REST on 3030+i.
# TASKSET2 works like TASKSET1 (presumably the other two cores).
for ((client_index = 1; client_index <= num_clients; client_index++)); do
  node_index=$((num_validators + client_index))
  prev_port=$((4130 + client_index - 1))
  node_addr="127.0.0.1:$((4130 + client_index))"
  name="client-$client_index"
  storage_dir=".ledger-$network_id-$client_index"

  # Ensure there are no old ledger files and the node syncs from scratch.
  # A failing clean (e.g. nothing to remove) is not an error.
  snarkos clean "--dev=$node_index" "--network=$network_id" "--path=$storage_dir" || true

  sync_args=(
    "--dev=$node_index" --client
    "${common_flags[@]}" "--peers=127.0.0.1:$prev_port" "--node=$node_addr"
    "--rest=127.0.0.1:$((3030 + client_index))"
    "--logfile=$log_dir/$name.log" "--storage=$storage_dir"
  )
  $TASKSET2 snarkos start "${sync_args[@]}" &
  PIDS[client_index]=$!

  # Add 1-second delay between starting nodes to avoid hitting rate limits
  sleep 1
done
134  
# Block until nodes are running and connected to each other.
# (wait_for_nodes is defined outside this file; it is called here with the
# total node count and 0.)
wait_for_nodes $((num_clients+1)) 0

# It takes about 30s for nodes to connect. Time this phase separately
# (connect_time) so it can be reported on its own.
SECONDS=0
for node_index in $(seq 0 "$num_clients"); do
  # Run in a subshell so the helper cannot terminate this shell directly;
  # abort explicitly when a node fails the peer check.
  if ! (wait_for_peers "$node_index" "$num_clients"); then
    exit 1
  fi
done

connect_time=$SECONDS
echo "ℹ️ Nodes are fully connected (took $connect_time secs). Starting block sync measurement."

# Ensure the first node actually has the ledger snapshot.
# This should succeed instantly in most cases; give up after 30 seconds.
SECONDS=0
has_blocks=false
while (( SECONDS < 30 )); do
  if check_heights 0 1 "$min_height" "$network_name" "0"; then
    has_blocks=true
    break
  fi

  sleep "$poll_interval"
done

if [[ "$has_blocks" != true ]]; then
  echo "Node #0 has not reached the expected height. Maybe the ledger snapshot is corrupted or outdated?"
  exit 1
fi
166  
# Count the initial startup of node #0 as part of the benchmark as the other node
# might already start syncing. SECONDS is therefore deliberately not reset here.
# SECONDS=0

# Check heights periodically with a timeout
while (( SECONDS < max_wait )); do
  # Sample sync speed(s) for variance calculation
  sample_sync_speeds

  if check_heights 1 $((num_clients+1)) "$min_height" "$network_name" "$SECONDS"; then
    total_wait=$SECONDS
    throughput=$(compute_throughput "$min_height" "$total_wait")

    # Mean of the sampled sync_speed_bps values; defined as soon as there is
    # at least one sample.
    if (( samples > 0 )); then
      mean_speed=$(echo "scale=12; $sum_speed / $samples" | bc -l)
    else
      mean_speed=0
    fi

    # Unbiased sample variance of sync_speed_bps (in blocks^2/s^2):
    #   (sumsq - sum^2 / n) / (n - 1)
    # Computed directly from the running sums at high scale so that no
    # truncated intermediate mean is squared; tiny negative results caused by
    # truncation are clamped to zero.
    if (( samples > 1 )); then
      variance=$(echo "scale=20; v = ($sumsq_speed - ($sum_speed * $sum_speed) / $samples) / ($samples - 1); if (v < 0) v = 0; v" | bc -l)
    else
      variance=0
    fi

    echo "🎉 P2P sync benchmark done! Waited $total_wait seconds for $min_height blocks. Throughput was $throughput blocks/s."

    # Append data to results file. Each entry ends with a trailing comma, so
    # the file is presumably wrapped into a JSON array by a later CI step.
    printf "{ \"name\": \"p2p-sync\", \"unit\": \"blocks/s\", \"value\": %.3f, \"extra\": \"total_wait=%is, target_height=%i, connect_time=%is, %s\" },\n" \
       "$throughput" "$total_wait" "$min_height" "$connect_time" "$snapshot_info" | tee -a results.json
    printf "{ \"name\": \"p2p-sync-speed-variance\", \"unit\": \"blocks^2/s^2\", \"value\": %.6f, \"extra\": \"samples=%d, mean_speed=%.6f, max_speed=%.6f, branch=%s, %s\" },\n" \
       "$variance" "$samples" "$mean_speed" "$max_speed" "$branch_name" "$snapshot_info" | tee -a results.json

    exit 0
  fi

  # Continue waiting
  sleep "$poll_interval"
done

# Derive the reported limit from max_wait so the message stays correct when
# the timeout is changed.
echo "❌ Benchmark failed! Clients did not sync within $((max_wait / 60)) minutes."

# Print logs for debugging
print_client_logs "$log_dir" "$num_validators" "$num_clients"

exit 1