/ src / util / asmap.cpp
asmap.cpp
  1  // Copyright (c) 2019-present The Bitcoin Core developers
  2  // Distributed under the MIT software license, see the accompanying
  3  // file COPYING or http://www.opensource.org/licenses/mit-license.php.
  4  
  5  #include <util/asmap.h>
  6  
  7  #include <clientversion.h>
  8  #include <hash.h>
  9  #include <serialize.h>
 10  #include <streams.h>
 11  #include <uint256.h>
 12  #include <util/fs.h>
 13  #include <util/log.h>
 14  
 15  #include <algorithm>
 16  #include <bit>
 17  #include <cassert>
 18  #include <cstddef>
 19  #include <cstdio>
 20  #include <span>
 21  #include <utility>
 22  #include <vector>
 23  
 24  /*
 25   * ASMap (Autonomous System Map) Implementation
 26   *
 27   * Provides a compressed mapping from IP address prefixes to Autonomous System Numbers (ASNs).
 28   * Uses a binary trie structure encoded as bytecode instructions that are interpreted
 29   * at runtime to find the ASN for a given IP address.
 30   *
 31   * The format of the asmap data is a bit-packed binary format where the entire mapping
 32   * is treated as a continuous sequence of bits. Instructions and their arguments are
 33   * encoded using variable numbers of bits and concatenated together without regard for
 34   * byte boundaries. The bits are stored in bytes using little-endian bit ordering.
 35   *
 36   * The data structure internally represents the mapping as a binary trie where:
 37   * - Unassigned subnets (no ASN mapping present) map to 0
 38   * - Subnets mapped entirely to one ASN become leaf nodes
 39   * - Subnets whose lower and upper halves have different mappings branch into subtrees
 40   *
 41   * The encoding uses variable-length integers and four instruction types (RETURN, JUMP,
 42   * MATCH, DEFAULT) to efficiently represent the trie.
 43   */
 44  
 45  namespace {
 46  
 47  // Indicates decoding errors or invalid data
 48  constexpr uint32_t INVALID = 0xFFFFFFFF;
 49  
 50  /**
 51   * Extract a single bit from byte array using little-endian bit ordering (LSB first).
 52   * Used for ASMap data.
 53   */
 54  inline bool ConsumeBitLE(size_t& bitpos, std::span<const std::byte> bytes) noexcept
 55  {
 56      const bool bit = (std::to_integer<uint8_t>(bytes[bitpos / 8]) >> (bitpos % 8)) & 1;
 57      ++bitpos;
 58      return bit;
 59  }
 60  
 61  /**
 62   * Extract a single bit from byte array using big-endian bit ordering (MSB first).
 63   * Used for IP addresses to match network byte order conventions.
 64   */
 65  inline bool ConsumeBitBE(uint8_t& bitpos, std::span<const std::byte> bytes) noexcept
 66  {
 67      const bool bit = (std::to_integer<uint8_t>(bytes[bitpos / 8]) >> (7 - (bitpos % 8))) & 1;
 68      ++bitpos;
 69      return bit;
 70  }
 71  
 72  /**
 73   * Variable-length integer decoder using a custom encoding scheme.
 74   *
 75   * The encoding is easiest to describe using an example. Let's say minval=100 and
 76   * bit_sizes=[4,2,2,3]. In that case:
 77   * - x in [100..115]: encoded as [0] + [4-bit BE encoding of (x-100)]
 78   * - x in [116..119]: encoded as [1,0] + [2-bit BE encoding of (x-116)]
 79   * - x in [120..123]: encoded as [1,1,0] + [2-bit BE encoding of (x-120)]
 80   * - x in [124..131]: encoded as [1,1,1] + [3-bit BE encoding of (x-124)]
 81   *
 82   * In general, every number is encoded as:
 83   * - First, k "1"-bits, where k is the class the number falls in
 84   * - Then, a "0"-bit, unless k is the highest class
 85   * - Lastly, bit_sizes[k] bits encoding in big endian the position within that class
 86   */
 87  uint32_t DecodeBits(size_t& bitpos, const std::span<const std::byte> data, uint8_t minval, const std::span<const uint8_t> bit_sizes)
 88  {
 89      uint32_t val = minval;  // Start with minimum encodable value
 90      bool bit;
 91      for (auto bit_sizes_it = bit_sizes.begin(); bit_sizes_it != bit_sizes.end(); ++bit_sizes_it) {
 92          // Read continuation bit to determine if we're in this class
 93          if (bit_sizes_it + 1 != bit_sizes.end()) {  // Unless we're in the last class
 94              if (bitpos >= data.size() * 8) break;
 95              bit = ConsumeBitLE(bitpos, data);
 96          } else {
 97              bit = 0;  // Last class has no continuation bit
 98          }
 99          if (bit) {
100              // If the value will not fit in this class, subtract its range from val,
101              // emit a "1" bit and continue with the next class
102              val += (1 << *bit_sizes_it);  // Add size of this class
103          } else {
104              // Decode the position within this class in big endian
105              for (int b = 0; b < *bit_sizes_it; b++) {
106                  if (bitpos >= data.size() * 8) return INVALID; // Reached EOF in mantissa
107                  bit = ConsumeBitLE(bitpos, data);
108                  val += bit << (*bit_sizes_it - 1 - b); // Big-endian within the class
109              }
110              return val;
111          }
112      }
113      return INVALID; // Reached EOF in exponent
114  }
115  
116  /**
117   * Instruction Set
118   *
119   * The instruction set is designed to efficiently encode a binary trie
120   * that maps IP prefixes to ASNs. Each instruction type serves a specific
121   * role in trie traversal and evaluation.
122   */
123  enum class Instruction : uint32_t
124  {
125      // A return instruction, encoded as [0], returns a constant ASN.
126      // It is followed by an integer using the ASN encoding.
127      RETURN = 0,
128      // A jump instruction, encoded as [1,0], inspects the next unused bit in the input
129      // and either continues execution (if 0), or skips a specified number of bits (if 1).
130      // It is followed by an integer using jump encoding.
131      JUMP = 1,
132      // A match instruction, encoded as [1,1,0], inspects 1 or more of the next unused bits
133      // in the input. If they all match, execution continues. If not, the default ASN is returned
134      // (or 0 if unset). The match value encodes both the pattern and its length.
135      MATCH = 2,
136      // A default instruction, encoded as [1,1,1], sets the default variable to its argument,
137      // and continues execution. It is followed by an integer in ASN encoding.
138      DEFAULT = 3,
139  };
140  
141  // Instruction type encoding: RETURN=[0], JUMP=[1,0], MATCH=[1,1,0], DEFAULT=[1,1,1]
142  constexpr uint8_t TYPE_BIT_SIZES[]{0, 0, 1};
143  Instruction DecodeType(size_t& bitpos, const std::span<const std::byte> data)
144  {
145      return Instruction(DecodeBits(bitpos, data, 0, TYPE_BIT_SIZES));
146  }
147  
148  // ASN encoding: Can encode ASNs from 1 to ~16.7 million.
149  // Uses variable-length encoding optimized for real-world ASN distribution.
150  // ASN 0 is reserved and used if there isn't a match.
151  constexpr uint8_t ASN_BIT_SIZES[]{15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
152  uint32_t DecodeASN(size_t& bitpos, const std::span<const std::byte> data)
153  {
154      return DecodeBits(bitpos, data, 1, ASN_BIT_SIZES);
155  }
156  
157  // MATCH argument: Values in [2, 511]. The highest set bit determines the match length
158  // n ∈ [1,8]; the lower n-1 bits are the pattern to compare.
159  constexpr uint8_t MATCH_BIT_SIZES[]{1, 2, 3, 4, 5, 6, 7, 8};
160  uint32_t DecodeMatch(size_t& bitpos, const std::span<const std::byte> data)
161  {
162      return DecodeBits(bitpos, data, 2, MATCH_BIT_SIZES);
163  }
164  
165  // JUMP offset: Minimum value 17. Variable-length coded and may be large
166  // for skipping big subtrees.
167  constexpr uint8_t JUMP_BIT_SIZES[]{5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30};
168  uint32_t DecodeJump(size_t& bitpos, const std::span<const std::byte> data)
169  {
170      return DecodeBits(bitpos, data, 17, JUMP_BIT_SIZES);
171  }
172  
173  } // anonymous namespace
174  
175  /**
176   * Execute the ASMap bytecode to find the ASN for an IP
177   *
178   * This function interprets the asmap bytecode and uses bits from the IP
179   * address to navigate through the encoded trie structure, ultimately
180   * returning an ASN value.
181   */
182  uint32_t Interpret(const std::span<const std::byte> asmap, const std::span<const std::byte> ip)
183  {
184      size_t pos{0};
185      const size_t endpos{asmap.size() * 8};
186      uint8_t ip_bit{0};
187      const uint8_t ip_bits_end = ip.size() * 8;
188      uint32_t default_asn = 0;
189      while (pos < endpos) {
190          Instruction opcode = DecodeType(pos, asmap);
191          if (opcode == Instruction::RETURN) {
192              // Found leaf node - return the ASN
193              uint32_t asn = DecodeASN(pos, asmap);
194              if (asn == INVALID) break; // ASN straddles EOF
195              return asn;
196          } else if (opcode == Instruction::JUMP) {
197              // Binary branch: if IP bit is 1, jump forward; else continue
198              uint32_t jump = DecodeJump(pos, asmap);
199              if (jump == INVALID) break; // Jump offset straddles EOF
200              if (ip_bit == ip_bits_end) break; // No input bits left
201              if (int64_t{jump} >= static_cast<int64_t>(endpos - pos)) break; // Jumping past EOF
202              if (ConsumeBitBE(ip_bit, ip)) {  // Check next IP bit (big-endian)
203                  pos += jump;  // Bit = 1: skip to right subtree
204              }
205              // Bit = 0: fall through to left subtree
206          } else if (opcode == Instruction::MATCH) {
207              // Compare multiple IP bits against a pattern
208              // The match value encodes both length and pattern:
209              // - highest set bit position determines length (bit_width - 1)
210              // - lower bits contain the pattern to compare
211              uint32_t match = DecodeMatch(pos, asmap);
212              if (match == INVALID) break; // Match bits straddle EOF
213              int matchlen = std::bit_width(match) - 1;  // An n-bit value matches n-1 input bits
214              if ((ip_bits_end - ip_bit) < matchlen) break; // Not enough input bits
215              for (int bit = 0; bit < matchlen; bit++) {
216                  if (ConsumeBitBE(ip_bit, ip) != ((match >> (matchlen - 1 - bit)) & 1)) {
217                      return default_asn;  // Pattern mismatch - use default
218                  }
219              }
220              // Pattern matched - continue execution
221          } else if (opcode == Instruction::DEFAULT) {
222              // Update the default ASN for subsequent MATCH failures
223              default_asn = DecodeASN(pos, asmap);
224              if (default_asn == INVALID) break; // ASN straddles EOF
225          } else {
226              break; // Instruction straddles EOF
227          }
228      }
229      // Reached EOF without RETURN, or aborted (see any of the breaks above)
230      // - should have been caught by SanityCheckAsmap below
231      assert(false);
232      return 0; // 0 is not a valid ASN
233  }
234  
235  /**
236   * Validates ASMap structure by simulating all possible execution paths.
237   * Ensures well-formed bytecode, valid jumps, and proper termination.
238   */
239  bool SanityCheckAsmap(const std::span<const std::byte> asmap, int bits)
240  {
241      size_t pos{0};
242      const size_t endpos{asmap.size() * 8};
243      std::vector<std::pair<uint32_t, int>> jumps; // All future positions we may jump to (bit offset in asmap -> bits to consume left)
244      jumps.reserve(bits);
245      Instruction prevopcode = Instruction::JUMP;
246      bool had_incomplete_match = false;  // Track <8 bit matches for efficiency check
247  
248      while (pos != endpos) {
249          // There was a jump into the middle of the previous instruction
250          if (!jumps.empty() && pos >= jumps.back().first) return false;
251  
252          Instruction opcode = DecodeType(pos, asmap);
253          if (opcode == Instruction::RETURN) {
254              // There should not be any RETURN immediately after a DEFAULT (could be combined into just RETURN)
255              if (prevopcode == Instruction::DEFAULT) return false;
256              uint32_t asn = DecodeASN(pos, asmap);
257              if (asn == INVALID) return false; // ASN straddles EOF
258              if (jumps.empty()) {
259                  // Nothing to execute anymore
260                  if (endpos - pos > 7) return false; // Excessive padding
261                  while (pos != endpos) {
262                      if (ConsumeBitLE(pos, asmap)) return false; // Nonzero padding bit
263                  }
264                  return true; // Sanely reached EOF
265              } else {
266                  // Continue by pretending we jumped to the next instruction
267                  if (pos != jumps.back().first) return false; // Unreachable code
268                  bits = jumps.back().second; // Restore the number of bits we would have had left after this jump
269                  jumps.pop_back();
270                  prevopcode = Instruction::JUMP;
271              }
272          } else if (opcode == Instruction::JUMP) {
273              uint32_t jump = DecodeJump(pos, asmap);
274              if (jump == INVALID) return false; // Jump offset straddles EOF
275              if (int64_t{jump} > static_cast<int64_t>(endpos - pos)) return false; // Jump out of range
276              if (bits == 0) return false; // Consuming bits past the end of the input
277              --bits;
278              uint32_t jump_offset = pos + jump;
279              if (!jumps.empty() && jump_offset >= jumps.back().first) return false; // Intersecting jumps
280              jumps.emplace_back(jump_offset, bits);  // Queue jump target for validation
281              prevopcode = Instruction::JUMP;
282          } else if (opcode == Instruction::MATCH) {
283              uint32_t match = DecodeMatch(pos, asmap);
284              if (match == INVALID) return false; // Match bits straddle EOF
285              int matchlen = std::bit_width(match) - 1;
286              if (prevopcode != Instruction::MATCH) had_incomplete_match = false;
287              // Within a sequence of matches only at most one should be incomplete
288              if (matchlen < 8 && had_incomplete_match) return false;
289              had_incomplete_match = (matchlen < 8);
290              if (bits < matchlen) return false; // Consuming bits past the end of the input
291              bits -= matchlen;
292              prevopcode = Instruction::MATCH;
293          } else if (opcode == Instruction::DEFAULT) {
294              // There should not be two successive DEFAULTs (they could be combined into one)
295              if (prevopcode == Instruction::DEFAULT) return false;
296              uint32_t asn = DecodeASN(pos, asmap);
297              if (asn == INVALID) return false; // ASN straddles EOF
298              prevopcode = Instruction::DEFAULT;
299          } else {
300              return false; // Instruction straddles EOF
301          }
302      }
303      return false; // Reached EOF without RETURN instruction
304  }
305  
306  /**
307   * Provides a safe interface for validating ASMap data before use.
308   * Returns true if the data is valid for 128 bits long inputs.
309   */
310  bool CheckStandardAsmap(const std::span<const std::byte> data)
311  {
312      if (!SanityCheckAsmap(data, 128)) {
313          LogWarning("Sanity check of asmap data failed\n");
314          return false;
315      }
316      return true;
317  }
318  
319  /**
320   * Loads an ASMap file from disk and validates it.
321   */
322  std::vector<std::byte> DecodeAsmap(fs::path path)
323  {
324      FILE *filestr = fsbridge::fopen(path, "rb");
325      AutoFile file{filestr};
326      if (file.IsNull()) {
327          LogWarning("Failed to open asmap file from disk");
328          return {};
329      }
330      int64_t length{file.size()};
331      LogInfo("Opened asmap file %s (%d bytes) from disk", fs::quoted(fs::PathToString(path)), length);
332  
333      // Read entire file into memory
334      std::vector<std::byte> buffer(length);
335      file.read(buffer);
336  
337      if (!CheckStandardAsmap(buffer)) {
338          LogWarning("Sanity check of asmap file %s failed", fs::quoted(fs::PathToString(path)));
339          return {};
340      }
341  
342      return buffer;
343  }
344  
345  /**
346   * Computes SHA256 hash of ASMap data for versioning and consistency checks.
347   */
348  uint256 AsmapVersion(const std::span<const std::byte> data)
349  {
350      if (data.empty()) return {};
351  
352      HashWriter asmap_hasher;
353      asmap_hasher << data;
354      return asmap_hasher.GetHash();
355  }