// tokens.rs
1 // Copyright (C) 2019-2025 ADnet Contributors 2 // This file is part of the ADL library. 3 4 // The ADL library is free software: you can redistribute it and/or modify 5 // it under the terms of the GNU General Public License as published by 6 // the Free Software Foundation, either version 3 of the License, or 7 // (at your option) any later version. 8 9 // The ADL library is distributed in the hope that it will be useful, 10 // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 // GNU General Public License for more details. 13 14 // You should have received a copy of the GNU General Public License 15 // along with the ADL library. If not, see <https://www.gnu.org/licenses/>. 16 17 use adl_errors::{Handler, ParserError}; 18 use logos::Logos; 19 use std::sync::LazyLock; 20 21 #[derive(Clone, Copy, Debug, PartialEq, Eq)] 22 pub enum IdVariants { 23 Identifier, 24 Intrinsic, 25 Path, 26 ProgramId, 27 Locator, 28 } 29 30 fn id_variant(lex: &mut logos::Lexer<Token>) -> IdVariants { 31 // Use LazyLock to not recompile these regexes every time. 
32 // Support both .alpha (ALPHA chain) and .delta (DELTA chain) program IDs 33 static REGEX_LOCATOR: LazyLock<regex::Regex> = 34 LazyLock::new(|| regex::Regex::new(r"^\.(alpha|delta)/[a-zA-Z][a-zA-Z0-9_]*").unwrap()); 35 static REGEX_PROGRAM_ID: LazyLock<regex::Regex> = 36 LazyLock::new(|| regex::Regex::new(r"^\.(alpha|delta)\b").unwrap()); 37 static REGEX_PATH: LazyLock<regex::Regex> = 38 LazyLock::new(|| regex::Regex::new(r"^(?:::[a-zA-Z][a-zA-Z0-9_]*)+").unwrap()); 39 40 if let Some(found) = REGEX_LOCATOR.find(lex.remainder()) { 41 lex.bump(found.len()); 42 IdVariants::Locator 43 } else if let Some(found) = REGEX_PROGRAM_ID.find(lex.remainder()) { 44 lex.bump(found.len()); 45 IdVariants::ProgramId 46 } else if let Some(found) = REGEX_PATH.find(lex.remainder()) { 47 lex.bump(found.len()); 48 IdVariants::Path 49 } else if lex.remainder().starts_with("_") { 50 IdVariants::Intrinsic 51 } else { 52 IdVariants::Identifier 53 } 54 } 55 56 fn comment_block(lex: &mut logos::Lexer<Token>) -> bool { 57 let mut last_asterisk = false; 58 for (index, c) in lex.remainder().char_indices() { 59 if c == '*' { 60 last_asterisk = true; 61 } else if c == '/' && last_asterisk { 62 lex.bump(index + 1); 63 return true; 64 } else if matches!(c, 65 '\u{202A}'..='\u{202E}' | 66 '\u{2066}'..='\u{2069}' 67 ) { 68 // It's a bidi character - end the comment token 69 // so we can report that error. 70 lex.bump(index); 71 return true; 72 } else { 73 last_asterisk = false; 74 } 75 } 76 false 77 } 78 79 #[derive(Clone, Copy, Debug, PartialEq, Eq, Logos)] 80 pub enum Token { 81 #[regex(r"[ \t\f]+")] 82 Whitespace, 83 84 #[regex(r"\r?\n")] 85 Linebreak, 86 87 // Comments don't include line breaks or bidi characters. 88 #[regex(r"//[^\r\n\u{202A}-\u{202E}\u{2066}-\u{2069}]*")] 89 CommentLine, 90 91 // Can't match block comments in a regex without lazy quantifiers, 92 // so use a callback. 
93 #[token(r"/*", comment_block)] 94 CommentBlock, 95 96 // We want to lex these four categories as separate token types: 97 // 1. identifiers like `abc` 98 // 2. paths like `abc::def::ghi` 99 // 3. program ids like `abc.alpha` 100 // 4. locators like `abc.alpha/def` 101 // We can't do this directly with logos regexes due to the lack of backtracking. 102 // So we do it with this callback. 103 // 104 // As an alternative design, we could simply treat the individual components of these as separate tokens, 105 // so that `abc.alpha/def` would be tokenized as `[abc, ., alpha, /, def]`. This is challenging to handle 106 // with an LR(1) parser - we potentially get shift-reduce conflicts and other ambiguities between 107 // member accesses, program ids, tuple accesses, etc. We could make it work but let's just cut to the 108 // chase here. 109 110 // Catch identifiers starting with underscore 111 #[regex(r"_[a-zA-Z][a-zA-Z0-9_]*", |_| IdVariants::Intrinsic)] 112 #[regex(r"[a-zA-Z][a-zA-Z0-9_]*", id_variant)] 113 // We need to special case `group::abc` and `signature::abc` as otherwise these are keywords. 114 #[regex(r"group::[a-zA-Z][a-zA-Z0-9_]*", |_| IdVariants::Path)] 115 #[regex(r"signature::[a-zA-Z][a-zA-Z0-9_]*", |_| IdVariants::Path)] 116 #[regex(r"Future::[a-zA-Z][a-zA-Z0-9_]*", |_| IdVariants::Path)] 117 IdVariants(IdVariants), 118 119 // Address literals should have exactly 58 characters after the prefix, but we lex other lengths 120 // and flag an error later. AX addresses use "ax1" prefix (61 chars total). 121 #[regex(r"ax1[a-z0-9]*")] 122 AddressLiteral, 123 124 // As with the previous parser, avoid lowercase letters to avoid ambiguity with the `field` postfix. 125 // Allow invalid digits for each radix so we can report an error about them later. 
126 #[regex(r"0x[0-9A-Z_]+")] 127 #[regex(r"0o[0-9A-Z_]+")] 128 #[regex(r"0b[0-9A-Z_]+")] 129 #[regex(r"[0-9][0-9A-Z_]*")] 130 Integer, 131 132 #[regex(r#""[^"]*""#)] 133 StaticString, 134 135 // Symbols 136 #[token("=")] 137 Assign, 138 #[token("!")] 139 Not, 140 #[token("&&")] 141 And, 142 #[token("&&=")] 143 AndAssign, 144 #[token("||")] 145 Or, 146 #[token("||=")] 147 OrAssign, 148 #[token("&")] 149 BitAnd, 150 #[token("&=")] 151 BitAndAssign, 152 #[token("|")] 153 BitOr, 154 #[token("|=")] 155 BitOrAssign, 156 #[token("^")] 157 BitXor, 158 #[token("^=")] 159 BitXorAssign, 160 #[token("==")] 161 Eq, 162 #[token("!=")] 163 NotEq, 164 #[token("<")] 165 Lt, 166 #[token("<=")] 167 LtEq, 168 #[token(">")] 169 Gt, 170 #[token(">=")] 171 GtEq, 172 #[token("+")] 173 Add, 174 #[token("+=")] 175 AddAssign, 176 #[token("-")] 177 Sub, 178 #[token("-=")] 179 SubAssign, 180 #[token("*")] 181 Mul, 182 #[token("*=")] 183 MulAssign, 184 #[token("/")] 185 Div, 186 #[token("/=")] 187 DivAssign, 188 #[token("**")] 189 Pow, 190 #[token("**=")] 191 PowAssign, 192 #[token("%")] 193 Rem, 194 #[token("%=")] 195 RemAssign, 196 #[token("<<")] 197 Shl, 198 #[token("<<=")] 199 ShlAssign, 200 #[token(">>")] 201 Shr, 202 #[token(">>=")] 203 ShrAssign, 204 #[token("(")] 205 LeftParen, 206 #[token(")")] 207 RightParen, 208 #[token("[")] 209 LeftSquare, 210 #[token("]")] 211 RightSquare, 212 #[token("{")] 213 LeftCurly, 214 #[token("}")] 215 RightCurly, 216 #[token(",")] 217 Comma, 218 #[token(".")] 219 Dot, 220 #[token("..")] 221 DotDot, 222 #[token(";")] 223 Semicolon, 224 #[token(":")] 225 Colon, 226 #[token("::")] 227 DoubleColon, 228 #[token("?")] 229 Question, 230 #[token("->")] 231 Arrow, 232 #[token("=>")] 233 BigArrow, 234 #[token("_")] 235 Underscore, 236 #[token("@")] 237 At, 238 239 // Keywords 240 #[token("true")] 241 True, 242 #[token("false")] 243 False, 244 #[token("none")] 245 None, 246 #[token("address")] 247 Address, 248 #[token("bool")] 249 Bool, 250 #[token("field")] 251 
Field, 252 #[token("group")] 253 Group, 254 #[token("i8")] 255 I8, 256 #[token("i16")] 257 I16, 258 #[token("i32")] 259 I32, 260 #[token("i64")] 261 I64, 262 #[token("i128")] 263 I128, 264 #[token("record")] 265 Record, 266 #[token("scalar")] 267 Scalar, 268 #[token("signature")] 269 Signature, 270 #[token("string")] 271 String, 272 #[token("struct")] 273 Struct, 274 #[token("u8")] 275 U8, 276 #[token("u16")] 277 U16, 278 #[token("u32")] 279 U32, 280 #[token("u64")] 281 U64, 282 #[token("u128")] 283 U128, 284 285 #[token("alpha")] 286 Alpha, 287 #[token("delta")] 288 Delta, 289 #[token("as")] 290 As, 291 #[token("assert")] 292 Assert, 293 #[token("assert_eq")] 294 AssertEq, 295 #[token("assert_neq")] 296 AssertNeq, 297 #[token("async")] 298 Async, 299 #[token("block")] 300 Block, 301 #[token("const")] 302 Const, 303 #[token("constant")] 304 Constant, 305 #[token("constructor")] 306 Constructor, 307 #[token("else")] 308 Else, 309 #[token("Fn")] 310 Fn, 311 #[token("for")] 312 For, 313 #[token("function")] 314 Function, 315 #[token("Future")] 316 Future, 317 #[token("if")] 318 If, 319 #[token("import")] 320 Import, 321 #[token("in")] 322 In, 323 #[token("inline")] 324 Inline, 325 #[token("let")] 326 Let, 327 #[token("mapping")] 328 Mapping, 329 #[token("storage")] 330 Storage, 331 #[token("network")] 332 Network, 333 #[token("private")] 334 Private, 335 #[token("program")] 336 Program, 337 #[token("public")] 338 Public, 339 #[token("return")] 340 Return, 341 #[token("script")] 342 Script, 343 #[token("self")] 344 SelfLower, 345 #[token("transition")] 346 Transition, 347 348 // Unicode bidirectional control characters are a potential risk in 349 // source. We detect them so we can report them as an error. 350 #[regex(r"[\u{202A}-\u{202E}\u{2066}-\u{2069}]")] 351 Bidi, 352 353 // This token is never produced; we use it in grammar.lalrpop 354 // to ensure a given production doesn't happen. 
355 Never, 356 } 357 358 impl Token { 359 /// A `str` describing the token suitable for use in error messages. 360 /// 361 /// * `token_s` - The str as reported by logos. 362 pub fn str_user(token_s: &str) -> Option<&'static str> { 363 let v = match token_s { 364 // These variants we don't want to report to the user. 365 // Whitespace, 366 // Linebreak, 367 // CommentLine, 368 // CommentBlock, 369 "Identifier" => "an identifier", 370 "AddressLiteral" => "an address literal", 371 "ProgramId" => "a program id", 372 373 "Integer" => "an integer literal", 374 375 "StaticString" => "a static string", 376 377 // Symbols 378 "Assign" => "'='", 379 "Not" => "'!'", 380 "And" => "'&&'", 381 "AndAssign" => "'&&='", 382 "Or" => "'||'", 383 "OrAssign" => "'||='", 384 "BitAnd" => "'&'", 385 "BitAndAssign" => "'&='", 386 "BitOr" => "'|'", 387 "BitOrAssign" => "'|='", 388 "BitXor" => "'^'", 389 "BitXorAssign" => "'&='", 390 "Eq" => "'=='", 391 "NotEq" => "'!='", 392 "Lt" => "'<'", 393 "LtEq" => "'<='", 394 "Gt" => "'>'", 395 "GtEq" => "'>='", 396 "Add" => "'+'", 397 "AddAssign" => "'+='", 398 "Sub" => "'-'", 399 "SubAssign" => "'-='", 400 "Mul" => "'*'", 401 "MulAssign" => "'*='", 402 "Div" => "'/'", 403 "DivAssign" => "'/='", 404 "Pow" => "'**'", 405 "PowAssign" => "'**='", 406 "Rem" => "'%'", 407 "RemAssign" => "'%='", 408 "Shl" => "'<<'", 409 "ShlAssign" => "'<<='", 410 "Shr" => "'>>'", 411 "ShrAssign" => "'>>='", 412 "LeftParen" => "'('", 413 "RightParen" => "')'", 414 "LeftSquare" => "'['", 415 "RightSquare" => "']'", 416 "LeftCurly" => "'{'", 417 "RightCurly" => "'}'", 418 "Comma" => "','", 419 "Dot" => "'.'", 420 "DotDot" => "'..'", 421 "Semicolon" => "';'", 422 "Colon" => "':'", 423 "DoubleColon" => "'::'", 424 "Question" => "'?'", 425 "Arrow" => "'->'", 426 "BigArrow" => "'=>'", 427 "Underscore" => "'_'", 428 "At" => "'@'", 429 430 // Keywords 431 "True" => "'true'", 432 "False" => "'false'", 433 "Address" => "'address", 434 "Bool" => "'bool'", 435 "Field" => "'field'", 
436 "Group" => "'group'", 437 "I8" => "'i8'", 438 "I16" => "'i16'", 439 "I32" => "'i32'", 440 "I64" => "'i64'", 441 "I128" => "'i128'", 442 "Record" => "'record'", 443 "Scalar" => "'scalar'", 444 "Signature" => "'signature'", 445 "String" => "a string", 446 "Struct" => "'struct'", 447 "U8" => "'u8'", 448 "U16" => "'u16'", 449 "U32" => "'u32'", 450 "U64" => "'u64'", 451 "U128" => "'u128'", 452 453 "Alpha" => "'alpha'", 454 "Delta" => "'delta'", 455 "As" => "'as'", 456 "Assert" => "'assert'", 457 "AssertEq" => "'assert_eq'", 458 "AssertNeq" => "'assert_neq'", 459 "Async" => "'async'", 460 "Block" => "'block'", 461 "Const" => "'const'", 462 "Constant" => "'constant'", 463 "Constructor" => "'constructor'", 464 "Else" => "'else'", 465 "Fn" => "'Fn'", 466 "For" => "'for'", 467 "Function" => "'function'", 468 "Future" => "'future'", 469 "If" => "'if'", 470 "Import" => "'import'", 471 "In" => "'in'", 472 "Inline" => "'inline'", 473 "Let" => "'let'", 474 "Mapping" => "'mapping'", 475 "Storage" => "'storage'", 476 "Network" => "'network'", 477 "Private" => "'private'", 478 "Program" => "'program'", 479 "Public" => "'public'", 480 "Return" => "'return'", 481 "Script" => "'script'", 482 "SelfLower" => "'self'", 483 "Transition" => "'transition'", 484 485 "Never" => return None, 486 487 _ => return None, 488 }; 489 Some(v) 490 } 491 } 492 493 /// The token type we present to LALRPOP. 494 #[derive(Clone, Debug, PartialEq, Eq)] 495 pub struct LalrToken<'a> { 496 pub token: Token, 497 pub text: &'a str, 498 pub span: adl_span::Span, 499 } 500 501 /// The lexer we present to LALRPOP. 
502 pub struct Lexer<'a> { 503 logos_lexer: logos::Lexer<'a, Token>, 504 start_pos: u32, 505 handler: Handler, 506 } 507 508 impl<'a> Lexer<'a> { 509 pub fn new(text: &'a str, start_pos: u32, handler: Handler) -> Self { 510 Self { logos_lexer: Token::lexer(text), start_pos, handler } 511 } 512 } 513 514 impl<'a> Iterator for Lexer<'a> { 515 type Item = (usize, LalrToken<'a>, usize); 516 517 fn next(&mut self) -> Option<Self::Item> { 518 let next = self.logos_lexer.next()?; 519 let logos_span = self.logos_lexer.span(); 520 let span = 521 adl_span::Span { lo: self.start_pos + logos_span.start as u32, hi: self.start_pos + logos_span.end as u32 }; 522 523 let text = self.logos_lexer.slice(); 524 525 let Ok(token) = next else { 526 self.handler.emit_err(ParserError::could_not_lex_span(text.trim(), span)); 527 return None; 528 }; 529 530 if matches!(token, Token::Bidi) { 531 self.handler.emit_err(ParserError::lexer_bidi_override_span(span)); 532 return None; 533 } else if matches!(token, Token::Integer) { 534 let (s, radix) = if let Some(s) = text.strip_prefix("0x") { 535 (s, 16) 536 } else if let Some(s) = text.strip_prefix("0o") { 537 (s, 8) 538 } else if let Some(s) = text.strip_prefix("0b") { 539 (s, 2) 540 } else { 541 (text, 10) 542 }; 543 544 if let Some(c) = s.chars().find(|&c| c != '_' && !c.is_digit(radix)) { 545 self.handler.emit_err(ParserError::wrong_digit_for_radix_span(c, radix, text, span)); 546 } 547 } 548 549 let lalr_token = LalrToken { token, text, span }; 550 551 Some((span.lo as usize, lalr_token, span.hi as usize)) 552 } 553 }