/ src / util / asmap.cpp
asmap.cpp
  1  // Copyright (c) 2019-present The Bitcoin Core developers
  2  // Distributed under the MIT software license, see the accompanying
  3  // file COPYING or http://www.opensource.org/licenses/mit-license.php.
  4  
  5  #include <util/asmap.h>
  6  
  7  #include <hash.h>
  8  #include <streams.h>
  9  #include <uint256.h>
 10  #include <util/check.h>
 11  #include <util/fs.h>
 12  #include <util/log.h>
 13  
 14  #include <bit>
 15  #include <cstddef>
 16  #include <cstdio>
 17  #include <span>
 18  #include <string>
 19  #include <utility>
 20  #include <vector>
 21  
 22  /*
 23   * ASMap (Autonomous System Map) Implementation
 24   *
 25   * Provides a compressed mapping from IP address prefixes to Autonomous System Numbers (ASNs).
 26   * Uses a binary trie structure encoded as bytecode instructions that are interpreted
 27   * at runtime to find the ASN for a given IP address.
 28   *
 29   * The format of the asmap data is a bit-packed binary format where the entire mapping
 30   * is treated as a continuous sequence of bits. Instructions and their arguments are
 31   * encoded using variable numbers of bits and concatenated together without regard for
 32   * byte boundaries. The bits are stored in bytes using little-endian bit ordering.
 33   *
 34   * The data structure internally represents the mapping as a binary trie where:
 35   * - Unassigned subnets (no ASN mapping present) map to 0
 36   * - Subnets mapped entirely to one ASN become leaf nodes
 37   * - Subnets whose lower and upper halves have different mappings branch into subtrees
 38   *
 39   * The encoding uses variable-length integers and four instruction types (RETURN, JUMP,
 40   * MATCH, DEFAULT) to efficiently represent the trie.
 41   */
 42  
 43  namespace {
 44  
 45  // Indicates decoding errors or invalid data
 46  constexpr uint32_t INVALID = 0xFFFFFFFF;
 47  
 48  /**
 49   * Extract a single bit from byte array using little-endian bit ordering (LSB first).
 50   * Used for ASMap data.
 51   */
 52  inline bool ConsumeBitLE(size_t& bitpos, std::span<const std::byte> bytes) noexcept
 53  {
 54      const bool bit = (std::to_integer<uint8_t>(bytes[bitpos / 8]) >> (bitpos % 8)) & 1;
 55      ++bitpos;
 56      return bit;
 57  }
 58  
 59  /**
 60   * Extract a single bit from byte array using big-endian bit ordering (MSB first).
 61   * Used for IP addresses to match network byte order conventions.
 62   */
 63  inline bool ConsumeBitBE(uint8_t& bitpos, std::span<const std::byte> bytes) noexcept
 64  {
 65      const bool bit = (std::to_integer<uint8_t>(bytes[bitpos / 8]) >> (7 - (bitpos % 8))) & 1;
 66      ++bitpos;
 67      return bit;
 68  }
 69  
 70  /**
 71   * Variable-length integer decoder using a custom encoding scheme.
 72   *
 73   * The encoding is easiest to describe using an example. Let's say minval=100 and
 74   * bit_sizes=[4,2,2,3]. In that case:
 75   * - x in [100..115]: encoded as [0] + [4-bit BE encoding of (x-100)]
 76   * - x in [116..119]: encoded as [1,0] + [2-bit BE encoding of (x-116)]
 77   * - x in [120..123]: encoded as [1,1,0] + [2-bit BE encoding of (x-120)]
 78   * - x in [124..131]: encoded as [1,1,1] + [3-bit BE encoding of (x-124)]
 79   *
 80   * In general, every number is encoded as:
 81   * - First, k "1"-bits, where k is the class the number falls in
 82   * - Then, a "0"-bit, unless k is the highest class
 83   * - Lastly, bit_sizes[k] bits encoding in big endian the position within that class
 84   */
 85  uint32_t DecodeBits(size_t& bitpos, const std::span<const std::byte> data, uint8_t minval, const std::span<const uint8_t> bit_sizes)
 86  {
 87      uint32_t val = minval;  // Start with minimum encodable value
 88      bool bit;
 89      for (auto bit_sizes_it = bit_sizes.begin(); bit_sizes_it != bit_sizes.end(); ++bit_sizes_it) {
 90          // Read continuation bit to determine if we're in this class
 91          if (bit_sizes_it + 1 != bit_sizes.end()) {  // Unless we're in the last class
 92              if (bitpos >= data.size() * 8) break;
 93              bit = ConsumeBitLE(bitpos, data);
 94          } else {
 95              bit = 0;  // Last class has no continuation bit
 96          }
 97          if (bit) {
 98              // If the value will not fit in this class, subtract its range from val,
 99              // emit a "1" bit and continue with the next class
100              val += (1 << *bit_sizes_it);  // Add size of this class
101          } else {
102              // Decode the position within this class in big endian
103              for (int b = 0; b < *bit_sizes_it; b++) {
104                  if (bitpos >= data.size() * 8) return INVALID; // Reached EOF in mantissa
105                  bit = ConsumeBitLE(bitpos, data);
106                  val += bit << (*bit_sizes_it - 1 - b); // Big-endian within the class
107              }
108              return val;
109          }
110      }
111      return INVALID; // Reached EOF in exponent
112  }
113  
114  /**
115   * Instruction Set
116   *
117   * The instruction set is designed to efficiently encode a binary trie
118   * that maps IP prefixes to ASNs. Each instruction type serves a specific
119   * role in trie traversal and evaluation.
120   */
121  enum class Instruction : uint32_t
122  {
123      // A return instruction, encoded as [0], returns a constant ASN.
124      // It is followed by an integer using the ASN encoding.
125      RETURN = 0,
126      // A jump instruction, encoded as [1,0], inspects the next unused bit in the input
127      // and either continues execution (if 0), or skips a specified number of bits (if 1).
128      // It is followed by an integer using jump encoding.
129      JUMP = 1,
130      // A match instruction, encoded as [1,1,0], inspects 1 or more of the next unused bits
131      // in the input. If they all match, execution continues. If not, the default ASN is returned
132      // (or 0 if unset). The match value encodes both the pattern and its length.
133      MATCH = 2,
134      // A default instruction, encoded as [1,1,1], sets the default variable to its argument,
135      // and continues execution. It is followed by an integer in ASN encoding.
136      DEFAULT = 3,
137  };
138  
139  // Instruction type encoding: RETURN=[0], JUMP=[1,0], MATCH=[1,1,0], DEFAULT=[1,1,1]
140  constexpr uint8_t TYPE_BIT_SIZES[]{0, 0, 1};
141  Instruction DecodeType(size_t& bitpos, const std::span<const std::byte> data)
142  {
143      return Instruction(DecodeBits(bitpos, data, 0, TYPE_BIT_SIZES));
144  }
145  
146  // ASN encoding: Can encode ASNs from 1 to ~16.7 million.
147  // Uses variable-length encoding optimized for real-world ASN distribution.
148  // ASN 0 is reserved and used if there isn't a match.
149  constexpr uint8_t ASN_BIT_SIZES[]{15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
150  uint32_t DecodeASN(size_t& bitpos, const std::span<const std::byte> data)
151  {
152      return DecodeBits(bitpos, data, 1, ASN_BIT_SIZES);
153  }
154  
155  // MATCH argument: Values in [2, 511]. The highest set bit determines the match length
156  // n ∈ [1,8]; the lower n-1 bits are the pattern to compare.
157  constexpr uint8_t MATCH_BIT_SIZES[]{1, 2, 3, 4, 5, 6, 7, 8};
158  uint32_t DecodeMatch(size_t& bitpos, const std::span<const std::byte> data)
159  {
160      return DecodeBits(bitpos, data, 2, MATCH_BIT_SIZES);
161  }
162  
163  // JUMP offset: Minimum value 17. Variable-length coded and may be large
164  // for skipping big subtrees.
165  constexpr uint8_t JUMP_BIT_SIZES[]{5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30};
166  uint32_t DecodeJump(size_t& bitpos, const std::span<const std::byte> data)
167  {
168      return DecodeBits(bitpos, data, 17, JUMP_BIT_SIZES);
169  }
170  
171  } // anonymous namespace
172  
173  /**
174   * Execute the ASMap bytecode to find the ASN for an IP
175   *
176   * This function interprets the asmap bytecode and uses bits from the IP
177   * address to navigate through the encoded trie structure, ultimately
178   * returning an ASN value.
179   */
180  uint32_t Interpret(const std::span<const std::byte> asmap, const std::span<const std::byte> ip)
181  {
182      size_t pos{0};
183      const size_t endpos{asmap.size() * 8};
184      uint8_t ip_bit{0};
185      const uint8_t ip_bits_end = ip.size() * 8;
186      uint32_t default_asn = 0;
187      while (pos < endpos) {
188          Instruction opcode = DecodeType(pos, asmap);
189          if (opcode == Instruction::RETURN) {
190              // Found leaf node - return the ASN
191              uint32_t asn = DecodeASN(pos, asmap);
192              if (asn == INVALID) break; // ASN straddles EOF
193              return asn;
194          } else if (opcode == Instruction::JUMP) {
195              // Binary branch: if IP bit is 1, jump forward; else continue
196              uint32_t jump = DecodeJump(pos, asmap);
197              if (jump == INVALID) break; // Jump offset straddles EOF
198              if (ip_bit == ip_bits_end) break; // No input bits left
199              if (int64_t{jump} >= static_cast<int64_t>(endpos - pos)) break; // Jumping past EOF
200              if (ConsumeBitBE(ip_bit, ip)) {  // Check next IP bit (big-endian)
201                  pos += jump;  // Bit = 1: skip to right subtree
202              }
203              // Bit = 0: fall through to left subtree
204          } else if (opcode == Instruction::MATCH) {
205              // Compare multiple IP bits against a pattern
206              // The match value encodes both length and pattern:
207              // - highest set bit position determines length (bit_width - 1)
208              // - lower bits contain the pattern to compare
209              uint32_t match = DecodeMatch(pos, asmap);
210              if (match == INVALID) break; // Match bits straddle EOF
211              int matchlen = std::bit_width(match) - 1;  // An n-bit value matches n-1 input bits
212              if ((ip_bits_end - ip_bit) < matchlen) break; // Not enough input bits
213              for (int bit = 0; bit < matchlen; bit++) {
214                  if (ConsumeBitBE(ip_bit, ip) != ((match >> (matchlen - 1 - bit)) & 1)) {
215                      return default_asn;  // Pattern mismatch - use default
216                  }
217              }
218              // Pattern matched - continue execution
219          } else if (opcode == Instruction::DEFAULT) {
220              // Update the default ASN for subsequent MATCH failures
221              default_asn = DecodeASN(pos, asmap);
222              if (default_asn == INVALID) break; // ASN straddles EOF
223          } else {
224              break; // Instruction straddles EOF
225          }
226      }
227      // Reached EOF without RETURN, or aborted (see any of the breaks above)
228      // - should have been caught by SanityCheckAsmap below
229      assert(false);
230      return 0; // 0 is not a valid ASN
231  }
232  
233  /**
234   * Validates ASMap structure by simulating all possible execution paths.
235   * Ensures well-formed bytecode, valid jumps, and proper termination.
236   */
237  bool SanityCheckAsmap(const std::span<const std::byte> asmap, int bits)
238  {
239      size_t pos{0};
240      const size_t endpos{asmap.size() * 8};
241      std::vector<std::pair<uint32_t, int>> jumps; // All future positions we may jump to (bit offset in asmap -> bits to consume left)
242      jumps.reserve(bits);
243      Instruction prevopcode = Instruction::JUMP;
244      bool had_incomplete_match = false;  // Track <8 bit matches for efficiency check
245  
246      while (pos != endpos) {
247          // There was a jump into the middle of the previous instruction
248          if (!jumps.empty() && pos >= jumps.back().first) return false;
249  
250          Instruction opcode = DecodeType(pos, asmap);
251          if (opcode == Instruction::RETURN) {
252              // There should not be any RETURN immediately after a DEFAULT (could be combined into just RETURN)
253              if (prevopcode == Instruction::DEFAULT) return false;
254              uint32_t asn = DecodeASN(pos, asmap);
255              if (asn == INVALID) return false; // ASN straddles EOF
256              if (jumps.empty()) {
257                  // Nothing to execute anymore
258                  if (endpos - pos > 7) return false; // Excessive padding
259                  while (pos != endpos) {
260                      if (ConsumeBitLE(pos, asmap)) return false; // Nonzero padding bit
261                  }
262                  return true; // Sanely reached EOF
263              } else {
264                  // Continue by pretending we jumped to the next instruction
265                  if (pos != jumps.back().first) return false; // Unreachable code
266                  bits = jumps.back().second; // Restore the number of bits we would have had left after this jump
267                  jumps.pop_back();
268                  prevopcode = Instruction::JUMP;
269              }
270          } else if (opcode == Instruction::JUMP) {
271              uint32_t jump = DecodeJump(pos, asmap);
272              if (jump == INVALID) return false; // Jump offset straddles EOF
273              if (int64_t{jump} > static_cast<int64_t>(endpos - pos)) return false; // Jump out of range
274              if (bits == 0) return false; // Consuming bits past the end of the input
275              --bits;
276              uint32_t jump_offset = pos + jump;
277              if (!jumps.empty() && jump_offset >= jumps.back().first) return false; // Intersecting jumps
278              jumps.emplace_back(jump_offset, bits);  // Queue jump target for validation
279              prevopcode = Instruction::JUMP;
280          } else if (opcode == Instruction::MATCH) {
281              uint32_t match = DecodeMatch(pos, asmap);
282              if (match == INVALID) return false; // Match bits straddle EOF
283              int matchlen = std::bit_width(match) - 1;
284              if (prevopcode != Instruction::MATCH) had_incomplete_match = false;
285              // Within a sequence of matches only at most one should be incomplete
286              if (matchlen < 8 && had_incomplete_match) return false;
287              had_incomplete_match = (matchlen < 8);
288              if (bits < matchlen) return false; // Consuming bits past the end of the input
289              bits -= matchlen;
290              prevopcode = Instruction::MATCH;
291          } else if (opcode == Instruction::DEFAULT) {
292              // There should not be two successive DEFAULTs (they could be combined into one)
293              if (prevopcode == Instruction::DEFAULT) return false;
294              uint32_t asn = DecodeASN(pos, asmap);
295              if (asn == INVALID) return false; // ASN straddles EOF
296              prevopcode = Instruction::DEFAULT;
297          } else {
298              return false; // Instruction straddles EOF
299          }
300      }
301      return false; // Reached EOF without RETURN instruction
302  }
303  
304  /**
305   * Provides a safe interface for validating ASMap data before use.
306   * Returns true if the data is valid for 128 bits long inputs.
307   */
308  bool CheckStandardAsmap(const std::span<const std::byte> data)
309  {
310      if (!SanityCheckAsmap(data, 128)) {
311          LogWarning("Sanity check of asmap data failed\n");
312          return false;
313      }
314      return true;
315  }
316  
317  /**
318   * Loads an ASMap file from disk and validates it.
319   */
320  std::vector<std::byte> DecodeAsmap(fs::path path)
321  {
322      FILE *filestr = fsbridge::fopen(path, "rb");
323      AutoFile file{filestr};
324      if (file.IsNull()) {
325          LogWarning("Failed to open asmap file from disk");
326          return {};
327      }
328      int64_t length{file.size()};
329      LogInfo("Opened asmap file %s (%d bytes) from disk", fs::quoted(fs::PathToString(path)), length);
330  
331      // Read entire file into memory
332      std::vector<std::byte> buffer(length);
333      file.read(buffer);
334  
335      if (!CheckStandardAsmap(buffer)) {
336          LogWarning("Sanity check of asmap file %s failed", fs::quoted(fs::PathToString(path)));
337          return {};
338      }
339  
340      return buffer;
341  }
342  
343  /**
344   * Computes SHA256 hash of ASMap data for versioning and consistency checks.
345   */
346  uint256 AsmapVersion(const std::span<const std::byte> data)
347  {
348      if (data.empty()) return {};
349  
350      HashWriter asmap_hasher;
351      asmap_hasher << data;
352      return asmap_hasher.GetHash();
353  }