bench_p2p_sync.sh
#!/bin/bash

###########################################################
# Measures how long a client takes to sync the first
# $min_height blocks from another client over P2P.
#
# NOTE(review): this header previously said "1000 blocks",
# but min_height below is 250 -- confirm which is intended.
#
# Requires: snarkos, curl, jq, bc, awk, git, ./.ci/utils.sh,
# and an info.txt file describing the ledger snapshot.
###########################################################

set -eo pipefail # error on any command failure

network_id=1
min_height=250

# The total number of validators in the beacon committee.
# This must match the number of validators used when generating the snapshot.
num_validators=40

# The number of clients that are syncing
# Note: Because the first indexes 0-39 are reserved for validators, the first client will have index 40.
# The script works around this by manually setting the storage, ports, and log files for the clients.
num_clients=1

# Adjust this to show more/less log messages
log_filter="info,snarkos_node::client=trace,snarkos_node_sync=trace,snarkos_node_tcp=warn,snarkos_node_rest=warn"

max_wait=2400   # Wait for up to 40 minutes
poll_interval=1 # Check block heights every second

# shellcheck source=SCRIPTDIR/utils.sh
. ./.ci/utils.sh

# Running sums for variance: use sum and sumsq for unbiased sample variance
sum_speed=0
sumsq_speed=0
samples=0
max_speed=0.0

#######################################
# Fetch the current sync speed of every syncing client via REST and
# accumulate it into the running statistics.
# Globals:
#   num_clients, network_name (read)
#   sum_speed, sumsq_speed, samples, max_speed (written)
# Arguments:
#   None
# Outputs:
#   Writes a message to stdout for every sample that is rejected.
#######################################
function sample_sync_speeds() {
  # Keep loop state local: client_index is also used by the top-level loops.
  local client_index port resp speed speed_dec

  for ((client_index = 1; client_index <= num_clients; client_index++)); do
    port=$((3030 + client_index))
    resp=$(curl -s "http://127.0.0.1:$port/$network_name/sync/status" || true)

    # Skip if response missing
    if [[ -z "$resp" ]]; then
      continue
    fi

    # jq exits non-zero on a malformed response. Under `set -eo pipefail`
    # that would abort the whole benchmark, so skip the sample instead.
    speed=$(printf '%s' "$resp" | jq -r '.sync_speed_bps') || continue

    # Skip null or empty
    if [[ -z "$speed" ]] || [[ "$speed" == "null" ]]; then
      echo "Invalid speed value $speed"
      continue
    fi

    # Validate numeric (allow exponent)
    if ! (is_float "$speed"); then
      echo "Invalid speed value $speed"
      continue
    fi

    # Convert to fixed decimal for bc -l (bc cannot parse exponent notation)
    speed_dec=$(awk -v x="$speed" 'BEGIN{printf "%.12f", x}')
    if [[ -z "$speed_dec" ]]; then
      continue
    fi

    # Compare and store the fixed-decimal form only, so that bc never sees
    # a value such as "1.2e-05".
    if (( $(echo "$speed_dec > $max_speed" | bc -l) )); then
      max_speed=$speed_dec
    fi

    # Accumulate using bc -l for floating point
    sum_speed=$(echo "$sum_speed + $speed_dec" | bc -l)
    sumsq_speed=$(echo "$sumsq_speed + ($speed_dec * $speed_dec)" | bc -l)
    samples=$((samples + 1))
  done
}

branch_name=$(git rev-parse --abbrev-ref HEAD)
echo "On branch: ${branch_name}"

network_name=$(get_network_name "$network_id")
echo "Using network: $network_name (ID: $network_id)"

snapshot_info=$(<info.txt)
echo "Snapshot_info: ${snapshot_info}"

# Create log directory
log_dir=".logs-$(date +"%Y%m%d%H%M%S")"
mkdir -p "$log_dir"

# Define a trap handler that cleans up all processes on exit.
trap stop_nodes EXIT

# Define a trap handler that prints a message when an error occurs.
trap 'echo "âī¸ Error in $BASH_SOURCE at line $LINENO: \"$BASH_COMMAND\" failed (exit $?)"' ERR

# Shared flags between all nodes
common_flags=(
  --nodisplay --nobanner --noupdater # reduce clutter in the output
  "--log-filter=$log_filter"         # only show the logs we care about
  "--network=$network_id"
  --nocdn # don't sync from CDN, so we only benchmark p2p sync
  "--dev-num-validators=$num_validators"
  --no-dev-txs
)

# The client that has the ledger
# (runs on the first two cores)
# NOTE(review): TASKSET1/TASKSET2 are not set in this script; presumably they
# are command prefixes provided by utils.sh or the environment. They are left
# unquoted on purpose so they can expand to several words (or to nothing).
# shellcheck disable=SC2086
$TASKSET1 snarkos start "--dev=$num_validators" --client "${common_flags[@]}" \
  "--logfile=$log_dir/client-0.log" "--storage=.ledger-$network_id-0" \
  "--node=127.0.0.1:4130" "--rest=127.0.0.1:3030" &
PIDS[0]=$!

# Spawn the clients that will sync the ledger
# (running on the other two cores)
for ((client_index = 1; client_index <= num_clients; client_index++)); do
  node_index=$((num_validators + client_index))
  prev_port=$((4130 + client_index - 1))
  node_addr="127.0.0.1:$((4130 + client_index))"
  name="client-$client_index"

  # Ensure there are no old ledger files and the node syncs from scratch
  snarkos clean "--dev=$node_index" "--network=$network_id" "--path=.ledger-$network_id-$client_index" || true

  # Each client peers with the node started just before it.
  # shellcheck disable=SC2086
  $TASKSET2 snarkos start "--dev=$node_index" --client \
    "${common_flags[@]}" "--peers=127.0.0.1:$prev_port" "--node=$node_addr" \
    "--rest=127.0.0.1:$((3030 + client_index))" \
    "--logfile=$log_dir/$name.log" "--storage=.ledger-$network_id-$client_index" &
  PIDS[client_index]=$!

  # Add 1-second delay between starting nodes to avoid hitting rate limits
  sleep 1
done

# Block until nodes are running and connected to each other.
wait_for_nodes "$((num_clients + 1))" 0

# It takes about 30s for nodes to connect. Do not measure this time.
SECONDS=0
for ((node_index = 0; node_index <= num_clients; node_index++)); do
  if ! (wait_for_peers "$node_index" "$num_clients"); then
    exit 1
  fi
done

connect_time=$SECONDS
echo "âšī¸ Nodes are fully connected (took $connect_time secs). Starting block sync measurement."

# Ensure the first node actually has the ledger snapshot.
# This should succeed instantly in most cases
SECONDS=0
has_blocks=false
while (( SECONDS < 30 )); do
  if check_heights 0 1 "$min_height" "$network_name" "0"; then
    has_blocks=true
    break
  fi

  sleep "$poll_interval"
done

if ! $has_blocks; then
  echo "Node #0 has not reached the expected height. Maybe the ledger snapshot is corrupted or outdated?"
  exit 1
fi

# Count the initial startup of node #0 as part of the benchmark as the other node
# might already start syncing.
# SECONDS=0

# Check heights periodically with a timeout
while (( SECONDS < max_wait )); do
  # Sample sync speed(s) for variance calculation
  sample_sync_speeds

  if check_heights 1 "$((num_clients + 1))" "$min_height" "$network_name" "$SECONDS"; then
    total_wait=$SECONDS
    throughput=$(compute_throughput "$min_height" "$total_wait")

    mean_speed=0
    variance=0

    # The mean is defined as soon as there is at least one sample.
    if (( samples > 0 )); then
      mean_speed=$(echo "scale=8; $sum_speed / $samples" | bc -l)
    fi

    # Unbiased sample variance of sync_speed_bps (in blocks^2/s^2):
    #   (sumsq - sum^2 / n) / (n - 1)
    # Computed at bc -l's default precision rather than from a truncated
    # mean; printf rounds the result for the report.
    if (( samples > 1 )); then
      variance=$(echo "($sumsq_speed - ($sum_speed * $sum_speed) / $samples) / ($samples - 1)" | bc -l)
      # Truncation inside bc can leave a tiny negative value when all
      # samples are (nearly) identical; a variance cannot be negative.
      if [[ "$variance" == -* ]]; then
        variance=0
      fi
    fi

    echo "đ P2P sync benchmark done! Waited $total_wait seconds for $min_height blocks. Throughput was $throughput blocks/s."

    # Append data to results file.
    printf "{ \"name\": \"p2p-sync\", \"unit\": \"blocks/s\", \"value\": %.3f, \"extra\": \"total_wait=%is, target_height=%i, connect_time=%is, %s\" },\n" \
      "$throughput" "$total_wait" "$min_height" "$connect_time" "$snapshot_info" | tee -a results.json
    printf "{ \"name\": \"p2p-sync-speed-variance\", \"unit\": \"blocks^2/s^2\", \"value\": %.6f, \"extra\": \"samples=%d, mean_speed=%.6f, max_speed=%.6f, branch=%s, %s\" },\n" \
      "$variance" "$samples" "$mean_speed" "$max_speed" "$branch_name" "$snapshot_info" | tee -a results.json

    exit 0
  fi

  # Continue waiting
  sleep "$poll_interval"
done

# Report the timeout that was actually configured.
echo "â Benchmark failed! Clients did not sync within $((max_wait / 60)) minutes."

# Print logs for debugging
print_client_logs "$log_dir" "$num_validators" "$num_clients"

exit 1