// compiler/parser-lossless/src/tokens.rs
  1  // Copyright (C) 2019-2025 ADnet Contributors
  2  // This file is part of the ADL library.
  3  
  4  // The ADL library is free software: you can redistribute it and/or modify
  5  // it under the terms of the GNU General Public License as published by
  6  // the Free Software Foundation, either version 3 of the License, or
  7  // (at your option) any later version.
  8  
  9  // The ADL library is distributed in the hope that it will be useful,
 10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
 11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 12  // GNU General Public License for more details.
 13  
 14  // You should have received a copy of the GNU General Public License
 15  // along with the ADL library. If not, see <https://www.gnu.org/licenses/>.
 16  
 17  use adl_errors::{Handler, ParserError};
 18  use logos::Logos;
 19  use std::sync::LazyLock;
 20  
/// The lexical categories that the single identifier-shaped token can fall
/// into; carried as payload on `Token::IdVariants`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IdVariants {
    /// A plain identifier such as `abc`.
    Identifier,
    /// An identifier beginning with an underscore, such as `_abc`.
    Intrinsic,
    /// A `::`-separated path such as `abc::def::ghi`.
    Path,
    /// A program id such as `abc.alpha` or `abc.delta`.
    ProgramId,
    /// A locator such as `abc.alpha/def`.
    Locator,
}
 29  
 30  fn id_variant(lex: &mut logos::Lexer<Token>) -> IdVariants {
 31      // Use LazyLock to not recompile these regexes every time.
 32      // Support both .alpha (ALPHA chain) and .delta (DELTA chain) program IDs
 33      static REGEX_LOCATOR: LazyLock<regex::Regex> =
 34          LazyLock::new(|| regex::Regex::new(r"^\.(alpha|delta)/[a-zA-Z][a-zA-Z0-9_]*").unwrap());
 35      static REGEX_PROGRAM_ID: LazyLock<regex::Regex> =
 36          LazyLock::new(|| regex::Regex::new(r"^\.(alpha|delta)\b").unwrap());
 37      static REGEX_PATH: LazyLock<regex::Regex> =
 38          LazyLock::new(|| regex::Regex::new(r"^(?:::[a-zA-Z][a-zA-Z0-9_]*)+").unwrap());
 39  
 40      if let Some(found) = REGEX_LOCATOR.find(lex.remainder()) {
 41          lex.bump(found.len());
 42          IdVariants::Locator
 43      } else if let Some(found) = REGEX_PROGRAM_ID.find(lex.remainder()) {
 44          lex.bump(found.len());
 45          IdVariants::ProgramId
 46      } else if let Some(found) = REGEX_PATH.find(lex.remainder()) {
 47          lex.bump(found.len());
 48          IdVariants::Path
 49      } else if lex.remainder().starts_with("_") {
 50          IdVariants::Intrinsic
 51      } else {
 52          IdVariants::Identifier
 53      }
 54  }
 55  
 56  fn comment_block(lex: &mut logos::Lexer<Token>) -> bool {
 57      let mut last_asterisk = false;
 58      for (index, c) in lex.remainder().char_indices() {
 59          if c == '*' {
 60              last_asterisk = true;
 61          } else if c == '/' && last_asterisk {
 62              lex.bump(index + 1);
 63              return true;
 64          } else if matches!(c,
 65              '\u{202A}'..='\u{202E}' |
 66              '\u{2066}'..='\u{2069}'
 67          ) {
 68              // It's a bidi character - end the comment token
 69              // so we can report that error.
 70              lex.bump(index);
 71              return true;
 72          } else {
 73              last_asterisk = false;
 74          }
 75      }
 76      false
 77  }
 78  
/// All token kinds produced by the lexer.
///
/// Trivia (whitespace, line breaks, comments) is lexed as ordinary tokens so
/// the token stream stays lossless; bidi control characters get their own
/// token so they can be reported as errors downstream.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Logos)]
pub enum Token {
    #[regex(r"[ \t\f]+")]
    Whitespace,

    #[regex(r"\r?\n")]
    Linebreak,

    // Comments don't include line breaks or bidi characters.
    #[regex(r"//[^\r\n\u{202A}-\u{202E}\u{2066}-\u{2069}]*")]
    CommentLine,

    // Can't match block comments in a regex without lazy quantifiers,
    // so use a callback.
    #[token(r"/*", comment_block)]
    CommentBlock,

    // We want to lex these four categories as separate token types:
    // 1. identifiers like `abc`
    // 2. paths like `abc::def::ghi`
    // 3. program ids like `abc.alpha`
    // 4. locators like `abc.alpha/def`
    // We can't do this directly with logos regexes due to the lack of backtracking.
    // So we do it with this callback.
    //
    // As an alternative design, we could simply treat the individual components of these as separate tokens,
    // so that `abc.alpha/def` would be tokenized as `[abc, ., alpha, /, def]`. This is challenging to handle
    // with an LR(1) parser - we potentially get shift-reduce conflicts and other ambiguities between
    // member accesses, program ids, tuple accesses, etc. We could make it work but let's just cut to the
    // chase here.

    // Catch identifiers starting with underscore
    #[regex(r"_[a-zA-Z][a-zA-Z0-9_]*", |_| IdVariants::Intrinsic)]
    #[regex(r"[a-zA-Z][a-zA-Z0-9_]*", id_variant)]
    // We need to special case `group::abc` and `signature::abc` as otherwise these are keywords.
    #[regex(r"group::[a-zA-Z][a-zA-Z0-9_]*", |_| IdVariants::Path)]
    #[regex(r"signature::[a-zA-Z][a-zA-Z0-9_]*", |_| IdVariants::Path)]
    #[regex(r"Future::[a-zA-Z][a-zA-Z0-9_]*", |_| IdVariants::Path)]
    IdVariants(IdVariants),

    // Address literals should have exactly 58 characters after the prefix, but we lex other lengths
    // and flag an error later. AX addresses use "ax1" prefix (61 chars total).
    #[regex(r"ax1[a-z0-9]*")]
    AddressLiteral,

    // As with the previous parser, avoid lowercase letters to avoid ambiguity with the `field` postfix.
    // Allow invalid digits for each radix so we can report an error about them later.
    // (The radix check happens in `Lexer::next`.)
    #[regex(r"0x[0-9A-Z_]+")]
    #[regex(r"0o[0-9A-Z_]+")]
    #[regex(r"0b[0-9A-Z_]+")]
    #[regex(r"[0-9][0-9A-Z_]*")]
    Integer,

    // No escape sequences: a string literal runs to the next double quote.
    #[regex(r#""[^"]*""#)]
    StaticString,

    // Symbols
    #[token("=")]
    Assign,
    #[token("!")]
    Not,
    #[token("&&")]
    And,
    #[token("&&=")]
    AndAssign,
    #[token("||")]
    Or,
    #[token("||=")]
    OrAssign,
    #[token("&")]
    BitAnd,
    #[token("&=")]
    BitAndAssign,
    #[token("|")]
    BitOr,
    #[token("|=")]
    BitOrAssign,
    #[token("^")]
    BitXor,
    #[token("^=")]
    BitXorAssign,
    #[token("==")]
    Eq,
    #[token("!=")]
    NotEq,
    #[token("<")]
    Lt,
    #[token("<=")]
    LtEq,
    #[token(">")]
    Gt,
    #[token(">=")]
    GtEq,
    #[token("+")]
    Add,
    #[token("+=")]
    AddAssign,
    #[token("-")]
    Sub,
    #[token("-=")]
    SubAssign,
    #[token("*")]
    Mul,
    #[token("*=")]
    MulAssign,
    #[token("/")]
    Div,
    #[token("/=")]
    DivAssign,
    #[token("**")]
    Pow,
    #[token("**=")]
    PowAssign,
    #[token("%")]
    Rem,
    #[token("%=")]
    RemAssign,
    #[token("<<")]
    Shl,
    #[token("<<=")]
    ShlAssign,
    #[token(">>")]
    Shr,
    #[token(">>=")]
    ShrAssign,
    #[token("(")]
    LeftParen,
    #[token(")")]
    RightParen,
    #[token("[")]
    LeftSquare,
    #[token("]")]
    RightSquare,
    #[token("{")]
    LeftCurly,
    #[token("}")]
    RightCurly,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("..")]
    DotDot,
    #[token(";")]
    Semicolon,
    #[token(":")]
    Colon,
    #[token("::")]
    DoubleColon,
    #[token("?")]
    Question,
    #[token("->")]
    Arrow,
    #[token("=>")]
    BigArrow,
    #[token("_")]
    Underscore,
    #[token("@")]
    At,

    // Keywords
    #[token("true")]
    True,
    #[token("false")]
    False,
    #[token("none")]
    None,
    #[token("address")]
    Address,
    #[token("bool")]
    Bool,
    #[token("field")]
    Field,
    #[token("group")]
    Group,
    #[token("i8")]
    I8,
    #[token("i16")]
    I16,
    #[token("i32")]
    I32,
    #[token("i64")]
    I64,
    #[token("i128")]
    I128,
    #[token("record")]
    Record,
    #[token("scalar")]
    Scalar,
    #[token("signature")]
    Signature,
    #[token("string")]
    String,
    #[token("struct")]
    Struct,
    #[token("u8")]
    U8,
    #[token("u16")]
    U16,
    #[token("u32")]
    U32,
    #[token("u64")]
    U64,
    #[token("u128")]
    U128,

    #[token("alpha")]
    Alpha,
    #[token("delta")]
    Delta,
    #[token("as")]
    As,
    #[token("assert")]
    Assert,
    #[token("assert_eq")]
    AssertEq,
    #[token("assert_neq")]
    AssertNeq,
    #[token("async")]
    Async,
    #[token("block")]
    Block,
    #[token("const")]
    Const,
    #[token("constant")]
    Constant,
    #[token("constructor")]
    Constructor,
    #[token("else")]
    Else,
    #[token("Fn")]
    Fn,
    #[token("for")]
    For,
    #[token("function")]
    Function,
    #[token("Future")]
    Future,
    #[token("if")]
    If,
    #[token("import")]
    Import,
    #[token("in")]
    In,
    #[token("inline")]
    Inline,
    #[token("let")]
    Let,
    #[token("mapping")]
    Mapping,
    #[token("storage")]
    Storage,
    #[token("network")]
    Network,
    #[token("private")]
    Private,
    #[token("program")]
    Program,
    #[token("public")]
    Public,
    #[token("return")]
    Return,
    #[token("script")]
    Script,
    #[token("self")]
    SelfLower,
    #[token("transition")]
    Transition,

    // Unicode bidirectional control characters are a potential risk in
    // source. We detect them so we can report them as an error.
    #[regex(r"[\u{202A}-\u{202E}\u{2066}-\u{2069}]")]
    Bidi,

    // This token is never produced; we use it in grammar.lalrpop
    // to ensure a given production doesn't happen.
    Never,
}
357  
358  impl Token {
359      /// A `str` describing the token suitable for use in error messages.
360      ///
361      /// * `token_s` - The str as reported by logos.
362      pub fn str_user(token_s: &str) -> Option<&'static str> {
363          let v = match token_s {
364              // These variants we don't want to report to the user.
365              // Whitespace,
366              // Linebreak,
367              // CommentLine,
368              // CommentBlock,
369              "Identifier" => "an identifier",
370              "AddressLiteral" => "an address literal",
371              "ProgramId" => "a program id",
372  
373              "Integer" => "an integer literal",
374  
375              "StaticString" => "a static string",
376  
377              // Symbols
378              "Assign" => "'='",
379              "Not" => "'!'",
380              "And" => "'&&'",
381              "AndAssign" => "'&&='",
382              "Or" => "'||'",
383              "OrAssign" => "'||='",
384              "BitAnd" => "'&'",
385              "BitAndAssign" => "'&='",
386              "BitOr" => "'|'",
387              "BitOrAssign" => "'|='",
388              "BitXor" => "'^'",
389              "BitXorAssign" => "'&='",
390              "Eq" => "'=='",
391              "NotEq" => "'!='",
392              "Lt" => "'<'",
393              "LtEq" => "'<='",
394              "Gt" => "'>'",
395              "GtEq" => "'>='",
396              "Add" => "'+'",
397              "AddAssign" => "'+='",
398              "Sub" => "'-'",
399              "SubAssign" => "'-='",
400              "Mul" => "'*'",
401              "MulAssign" => "'*='",
402              "Div" => "'/'",
403              "DivAssign" => "'/='",
404              "Pow" => "'**'",
405              "PowAssign" => "'**='",
406              "Rem" => "'%'",
407              "RemAssign" => "'%='",
408              "Shl" => "'<<'",
409              "ShlAssign" => "'<<='",
410              "Shr" => "'>>'",
411              "ShrAssign" => "'>>='",
412              "LeftParen" => "'('",
413              "RightParen" => "')'",
414              "LeftSquare" => "'['",
415              "RightSquare" => "']'",
416              "LeftCurly" => "'{'",
417              "RightCurly" => "'}'",
418              "Comma" => "','",
419              "Dot" => "'.'",
420              "DotDot" => "'..'",
421              "Semicolon" => "';'",
422              "Colon" => "':'",
423              "DoubleColon" => "'::'",
424              "Question" => "'?'",
425              "Arrow" => "'->'",
426              "BigArrow" => "'=>'",
427              "Underscore" => "'_'",
428              "At" => "'@'",
429  
430              // Keywords
431              "True" => "'true'",
432              "False" => "'false'",
433              "Address" => "'address",
434              "Bool" => "'bool'",
435              "Field" => "'field'",
436              "Group" => "'group'",
437              "I8" => "'i8'",
438              "I16" => "'i16'",
439              "I32" => "'i32'",
440              "I64" => "'i64'",
441              "I128" => "'i128'",
442              "Record" => "'record'",
443              "Scalar" => "'scalar'",
444              "Signature" => "'signature'",
445              "String" => "a string",
446              "Struct" => "'struct'",
447              "U8" => "'u8'",
448              "U16" => "'u16'",
449              "U32" => "'u32'",
450              "U64" => "'u64'",
451              "U128" => "'u128'",
452  
453              "Alpha" => "'alpha'",
454              "Delta" => "'delta'",
455              "As" => "'as'",
456              "Assert" => "'assert'",
457              "AssertEq" => "'assert_eq'",
458              "AssertNeq" => "'assert_neq'",
459              "Async" => "'async'",
460              "Block" => "'block'",
461              "Const" => "'const'",
462              "Constant" => "'constant'",
463              "Constructor" => "'constructor'",
464              "Else" => "'else'",
465              "Fn" => "'Fn'",
466              "For" => "'for'",
467              "Function" => "'function'",
468              "Future" => "'future'",
469              "If" => "'if'",
470              "Import" => "'import'",
471              "In" => "'in'",
472              "Inline" => "'inline'",
473              "Let" => "'let'",
474              "Mapping" => "'mapping'",
475              "Storage" => "'storage'",
476              "Network" => "'network'",
477              "Private" => "'private'",
478              "Program" => "'program'",
479              "Public" => "'public'",
480              "Return" => "'return'",
481              "Script" => "'script'",
482              "SelfLower" => "'self'",
483              "Transition" => "'transition'",
484  
485              "Never" => return None,
486  
487              _ => return None,
488          };
489          Some(v)
490      }
491  }
492  
/// The token type we present to LALRPOP.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct LalrToken<'a> {
    /// The lexed token kind.
    pub token: Token,
    /// The exact source text this token was lexed from.
    pub text: &'a str,
    /// The token's source location, offset by the lexer's `start_pos`.
    pub span: adl_span::Span,
}
500  
/// The lexer we present to LALRPOP.
pub struct Lexer<'a> {
    // The underlying logos lexer driving tokenization.
    logos_lexer: logos::Lexer<'a, Token>,
    // Offset added to all spans, so `text` may be a slice of a larger source.
    start_pos: u32,
    // Error sink for lexing diagnostics (unlexable input, bidi, bad digits).
    handler: Handler,
}
507  
508  impl<'a> Lexer<'a> {
509      pub fn new(text: &'a str, start_pos: u32, handler: Handler) -> Self {
510          Self { logos_lexer: Token::lexer(text), start_pos, handler }
511      }
512  }
513  
514  impl<'a> Iterator for Lexer<'a> {
515      type Item = (usize, LalrToken<'a>, usize);
516  
517      fn next(&mut self) -> Option<Self::Item> {
518          let next = self.logos_lexer.next()?;
519          let logos_span = self.logos_lexer.span();
520          let span =
521              adl_span::Span { lo: self.start_pos + logos_span.start as u32, hi: self.start_pos + logos_span.end as u32 };
522  
523          let text = self.logos_lexer.slice();
524  
525          let Ok(token) = next else {
526              self.handler.emit_err(ParserError::could_not_lex_span(text.trim(), span));
527              return None;
528          };
529  
530          if matches!(token, Token::Bidi) {
531              self.handler.emit_err(ParserError::lexer_bidi_override_span(span));
532              return None;
533          } else if matches!(token, Token::Integer) {
534              let (s, radix) = if let Some(s) = text.strip_prefix("0x") {
535                  (s, 16)
536              } else if let Some(s) = text.strip_prefix("0o") {
537                  (s, 8)
538              } else if let Some(s) = text.strip_prefix("0b") {
539                  (s, 2)
540              } else {
541                  (text, 10)
542              };
543  
544              if let Some(c) = s.chars().find(|&c| c != '_' && !c.is_digit(radix)) {
545                  self.handler.emit_err(ParserError::wrong_digit_for_radix_span(c, radix, text, span));
546              }
547          }
548  
549          let lalr_token = LalrToken { token, text, span };
550  
551          Some((span.lo as usize, lalr_token, span.hi as usize))
552      }
553  }