tokenizer.rs
use smallvec::SmallVec;

use crate::Kind::{
    self,
    *,
};

/// Returns an iterator of tokens that reference the given string.
pub fn tokenize(source: &str) -> impl Iterator<Item = (Kind, &str)> {
    Tokenizer::new(source)
}

/// Returns whether this identifier can be represented without quotes.
pub fn is_valid_plain_identifier(s: &str) -> bool {
    let mut chars = s.chars();

    // The first character has stricter rules than the rest; `by_ref` lets us
    // check it and then validate the remainder with the same iterator.
    chars
        .by_ref()
        .next()
        .is_some_and(is_valid_initial_plain_identifier_character)
        && chars.all(is_valid_plain_identifier_character)
}

/// A valid *first* character of a plain identifier: any plain identifier
/// character except digits, `-` and `'` (which would be ambiguous with
/// numbers, the minus operator, and char literals respectively).
fn is_valid_initial_plain_identifier_character(c: char) -> bool {
    let invalid = c.is_ascii_digit() || c == '-' || c == '\'';

    !invalid && is_valid_plain_identifier_character(c)
}

/// A valid non-initial character of a plain identifier.
fn is_valid_plain_identifier_character(c: char) -> bool {
    c.is_alphanumeric() || matches!(c, '_' | '-' | '\'')
}

/// A character that may appear in a bare (unquoted) path token.
/// Note `\` and `(`/`)` are included so interpolation escapes like `\(…)`
/// are reachable from inside a path (see `consume_path`).
fn is_valid_path_character(c: char) -> bool {
    c.is_alphanumeric() || matches!(c, '.' | '/' | '_' | '-' | '\\' | '(' | ')')
}

/// Tokenizer mode, kept on a stack (`Tokenizer::context`) because strings,
/// paths and interpolations nest: `"a \(f /b/\(x)) c"` needs the tokenizer to
/// remember what it was doing at each level.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Context<'a> {
    // Inside a bare path; content runs until a non-path character.
    Path,
    // A path just ended; emit the zero-width TOKEN_PATH_END next.
    PathEnd,

    // Inside a delimited literal (string/char/quoted identifier).
    // `before` is the run of `=` signs that must precede `end` to close it
    // (mirrors the `=`s consumed after the opening quote); `end` is the
    // closing quote character.
    Delimited {
        before: Option<&'a str>,
        end: char,
    },
    // The closing delimiter was found; emit it as its own token next.
    DelimitedEnd {
        before: Option<&'a str>,
        end: char,
    },

    // A `\(` was seen; emit TOKEN_INTERPOLATION_START next.
    InterpolationStart,
    // Inside `\( … )`; `parentheses` counts nested plain parentheses so we
    // know which `)` actually terminates the interpolation.
    Interpolation {
        parentheses: usize,
    },
}

/// Streaming tokenizer over `source`; yields `(Kind, &str)` slices that
/// borrow directly from the input.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Tokenizer<'a> {
    source: &'a str,
    // Byte offset of the next unconsumed character.
    offset: usize,

    // Mode stack; inline capacity of 4 covers typical nesting without
    // heap allocation.
    context: SmallVec<Context<'a>, 4>,
}

impl<'a> Iterator for Tokenizer<'a> {
    type Item = (Kind, &'a str);

    fn next(&mut self) -> Option<Self::Item> {
        let start = self.offset;

        // The token's text is whatever `consume_kind` advanced past.
        self
            .consume_kind()
            .map(|kind| (kind, self.consumed_since(start)))
    }
}

impl<'a> Tokenizer<'a> {
    fn new(source: &'a str) -> Self {
        Self {
            source,
            offset: 0,
            context: SmallVec::new(),
        }
    }

    fn context_push(&mut self, context: Context<'a>) {
        self.context.push(context);
    }

    // Pops the top context, asserting it is the expected one — a failure
    // here is a tokenizer bug, not bad input.
    fn context_pop(&mut self, context: Context) {
        assert_eq!(self.context.last(), Some(&context));
        self.context.pop();
    }

    fn remaining(&self) -> &str {
        &self.source[self.offset..]
    }

    // Peeks the n-th character ahead without consuming. O(n) in chars.
    fn peek_character_nth(&self, n: usize) -> Option<char> {
        self.remaining().chars().nth(n)
    }

    fn peek_character(&self) -> Option<char> {
        self.peek_character_nth(0)
    }

    // Consumes characters while `predicate` holds; returns the byte length
    // consumed.
    fn consume_while(&mut self, predicate: impl Fn(char) -> bool) -> usize {
        let len: usize = self
            .remaining()
            .chars()
            .take_while(|&c| predicate(c))
            .map(char::len_utf8)
            .sum();

        self.offset += len;
        len
    }

    // Consumes `c` if it is next; returns whether it did.
    fn try_consume_character(&mut self, c: char) -> bool {
        let starts_with = self.peek_character() == Some(c);

        if starts_with {
            self.offset += c.len_utf8();
        }

        starts_with
    }

    // Consumes `s` if the remaining input starts with it; returns whether it did.
    fn try_consume_string(&mut self, s: &str) -> bool {
        let starts_with = self.remaining().starts_with(s);

        if starts_with {
            self.offset += s.len();
        }

        starts_with
    }

    // The slice consumed between `offset` and the current position.
    fn consumed_since(&self, offset: usize) -> &'a str {
        &self.source[offset..self.offset]
    }

    fn consume_character(&mut self) -> Option<char> {
        let next = self.peek_character()?;
        self.offset += next.len_utf8();
        Some(next)
    }

    /// Consumes one "segment" of delimited/path content.
    ///
    /// Returns `Some(TOKEN_CONTENT)` when a `\(` interpolation opener is
    /// found — note the `\(` itself is NOT consumed here; the pushed
    /// `InterpolationStart` context consumes it on the next `consume_kind`
    /// call, so the current content token ends right before it. Returns
    /// `None` after consuming an ordinary character or a 2-character escape.
    fn consume_delimited_segment(&mut self) -> Option<Kind> {
        match self
            .peek_character()
            .expect("caller must ensure there is content")
        {
            '\\' if self.peek_character_nth(1) == Some('(') => {
                self.context_push(Context::InterpolationStart);

                Some(TOKEN_CONTENT)
            },

            '\\' => {
                // Escape: consume the backslash and the escaped character.
                self.consume_character();
                self.consume_character();

                None
            },

            _ => {
                self.consume_character();

                None
            },
        }
    }

    /// Consumes content inside a delimited literal until the closing
    /// `<before><end>` sequence, an interpolation opener, or end of input.
    /// Always returns a content token; the closing delimiter itself is
    /// emitted on the next call via `DelimitedEnd`. At EOF the literal is
    /// silently unterminated (the context is just popped).
    fn consume_delimited(&mut self, before: Option<&'a str>, end: char) -> Kind {
        loop {
            let remaining = self.remaining();

            // Closing delimiter: the `=` run (if any) followed by the end quote.
            if before.is_none_or(|before| remaining.starts_with(before))
                && remaining
                    .get(before.map_or(0, str::len)..)
                    .is_some_and(|remaining| remaining.starts_with(end))
            {
                self.context_pop(Context::Delimited { before, end });
                self.context_push(Context::DelimitedEnd { before, end });

                return TOKEN_CONTENT;
            }

            if self.peek_character().is_none() {
                self.context_pop(Context::Delimited { before, end });

                return TOKEN_CONTENT;
            }

            if let Some(kind) = self.consume_delimited_segment() {
                return kind;
            }
        }
    }

    /// Consumes path content until a non-path character (or an interpolation
    /// opener). The trailing zero-width TOKEN_PATH_END is emitted on the next
    /// call via the `PathEnd` context.
    fn consume_path(&mut self) -> Kind {
        loop {
            if self
                .peek_character()
                .is_none_or(|c| !is_valid_path_character(c))
            {
                self.context_pop(Context::Path);
                self.context_push(Context::PathEnd);

                return TOKEN_CONTENT;
            }

            if let Some(kind) = self.consume_delimited_segment() {
                return kind;
            }
        }
    }

    /// Consumes an optional `e`/`E` exponent suffix (sign, then decimal
    /// digits and `_`). Returns `None` if there is no exponent marker, the
    /// error kind if the marker has no real digits, otherwise TOKEN_FLOAT.
    fn consume_scientific_base10(&mut self) -> Option<Kind> {
        if !(self.try_consume_character('e') || self.try_consume_character('E')) {
            return None;
        }

        let _ = self.try_consume_character('+') || self.try_consume_character('-');

        let exponent_len = self.consume_while(|c| c.is_ascii_digit() || c == '_');
        let exponent = self.consumed_since(self.offset - exponent_len);

        Some(
            // Underscore-only exponents ("1e___") are as invalid as empty ones.
            if exponent.is_empty() || exponent.bytes().all(|c| c == b'_') {
                TOKEN_ERROR_FLOAT_NO_EXPONENT
            } else {
                TOKEN_FLOAT
            },
        )
    }

    /// Like `consume_scientific_base10` but for the hexadecimal `p`/`P`
    /// exponent marker.
    ///
    /// NOTE(review): the exponent digits here accept full hex digits, unlike
    /// the usual hex-float convention where the `p` exponent is decimal —
    /// confirm this is intended.
    fn consume_scientific_base16(&mut self) -> Option<Kind> {
        if !(self.try_consume_character('p') || self.try_consume_character('P')) {
            return None;
        }

        let _ = self.try_consume_character('+') || self.try_consume_character('-');

        let exponent_len = self.consume_while(|c| c.is_ascii_hexdigit() || c == '_');
        let exponent = self.consumed_since(self.offset - exponent_len);

        Some(
            if exponent.is_empty() || exponent.bytes().all(|c| c == b'_') {
                TOKEN_ERROR_FLOAT_NO_EXPONENT
            } else {
                TOKEN_FLOAT
            },
        )
    }

    /// Consumes one token and returns its kind, or `None` at end of input.
    ///
    /// First dispatches on the current context (paths, delimited literals,
    /// interpolations), then falls through to the free-standing token match.
    #[expect(
        clippy::cognitive_complexity,
        reason = "it's not complex, it's just a single match expression"
    )]
    fn consume_kind(&mut self) -> Option<Kind> {
        let start = self.offset;

        match self.context.last().copied() {
            Some(Context::Path) => {
                return Some(self.consume_path());
            },
            Some(Context::PathEnd) => {
                // Zero-width token marking where the path stopped.
                self.context_pop(Context::PathEnd);

                return Some(TOKEN_PATH_END);
            },

            Some(Context::Delimited { before, end }) => {
                return Some(self.consume_delimited(before, end));
            },
            Some(Context::DelimitedEnd { before, end }) => {
                // `consume_delimited` already verified the closer is here,
                // so these asserts cannot fail on well-behaved input.
                if let Some(before) = before {
                    assert!(self.try_consume_string(before));
                }
                assert_eq!(self.consume_character(), Some(end));

                self.context_pop(Context::DelimitedEnd { before, end });

                return Some(match end {
                    '`' => TOKEN_QUOTED_IDENTIFIER_END,
                    '"' => TOKEN_STRING_END,
                    '\'' => TOKEN_CHAR_END,
                    _ => unreachable!(),
                });
            },

            Some(Context::InterpolationStart) => {
                // The `\(` was only peeked when this context was pushed;
                // consume it now so it forms its own token.
                assert!(self.try_consume_string(r"\("));

                self.context_pop(Context::InterpolationStart);
                self.context_push(Context::Interpolation { parentheses: 0 });
                return Some(TOKEN_INTERPOLATION_START);
            },
            // Inside an interpolation, tokens are ordinary; only `(`/`)`
            // get special handling below.
            Some(Context::Interpolation { .. }) => {},

            None => {},
        }

        Some(match self.consume_character()? {
            // Whitespace runs collapse into a single TOKEN_SPACE.
            c if c.is_whitespace() => {
                self.consume_while(char::is_whitespace);

                TOKEN_SPACE
            },

            // Block comment `#=… =#` (with matching numbers of `=`; nestable).
            '#' if self.peek_character() == Some('=') => {
                let equals_len = self.consume_while(|c| c == '=');
                let equals = self.consumed_since(self.offset - equals_len);

                loop {
                    match self.peek_character() {
                        // Exactly the opening `=` run followed by `#` closes
                        // the comment.
                        Some('=')
                            if let remaining = self.remaining()
                                && remaining.starts_with(equals)
                                && remaining.as_bytes().get(equals_len).copied() == Some(b'#') =>
                        {
                            // Hard code a 1 here because that comparison up top is a byte.
                            self.offset += equals_len + 1;

                            break TOKEN_COMMENT;
                        },

                        // #= ==# is not a closed comment.
                        Some('=') => {
                            self.consume_while(|c| c == '=');
                        },

                        // Nested block comment: recurse and discard the
                        // nested TOKEN_COMMENT.
                        Some('#') if self.peek_character_nth(1) == Some('=') => {
                            self.consume_kind();
                        },

                        Some(_) => {
                            self.consume_character();
                        },

                        // Unterminated block comments still tokenize as a
                        // comment up to EOF.
                        None => {
                            break TOKEN_COMMENT;
                        },
                    }
                }
            },

            // Line comment: runs to (but not including) the newline.
            '#' => {
                self.consume_while(|c| !matches!(c, '\n'));

                TOKEN_COMMENT
            },

            ',' => TOKEN_COMMA,
            ';' => TOKEN_SEMICOLON,

            '<' if self.try_consume_character('|') => TOKEN_LESS_PIPE,
            '|' if self.try_consume_character('>') => TOKEN_PIPE_MORE,

            // Parentheses inside an interpolation are counted so that only
            // the unmatched `)` ends the interpolation.
            '(' if let Some(&mut Context::Interpolation {
                ref mut parentheses,
            }) = self.context.last_mut() =>
            {
                *parentheses += 1;
                TOKEN_PARENTHESIS_LEFT
            },
            ')' if let Some(&mut Context::Interpolation {
                ref mut parentheses,
            }) = self.context.last_mut() =>
            {
                match parentheses.checked_sub(1) {
                    Some(new) => {
                        *parentheses = new;
                        TOKEN_PARENTHESIS_RIGHT
                    },

                    // Counter was 0: this `)` closes the interpolation itself.
                    None => {
                        self.context_pop(Context::Interpolation { parentheses: 0 });
                        TOKEN_INTERPOLATION_END
                    },
                }
            },

            '(' => TOKEN_PARENTHESIS_LEFT,
            ')' => TOKEN_PARENTHESIS_RIGHT,

            '=' if self.try_consume_character('>') => TOKEN_EQUAL_MORE,

            ':' => TOKEN_COLON,
            '+' if self.try_consume_character('+') => TOKEN_PLUS_PLUS,
            '[' => TOKEN_BRACKET_LEFT,
            ']' => TOKEN_BRACKET_RIGHT,

            '/' if self.try_consume_character('/') => TOKEN_SLASH_SLASH,
            '{' => TOKEN_CURLYBRACE_LEFT,
            '}' => TOKEN_CURLYBRACE_RIGHT,

            '<' if self.try_consume_character('=') => TOKEN_LESS_EQUAL,
            '<' => TOKEN_LESS,
            '>' if self.try_consume_character('=') => TOKEN_MORE_EQUAL,
            '>' => TOKEN_MORE,

            '!' if self.try_consume_character('=') => TOKEN_EXCLAMATION_EQUAL,
            '=' => TOKEN_EQUAL,

            '&' if self.try_consume_character('&') => TOKEN_AMPERSAND_AMPERSAND,
            '|' if self.try_consume_character('|') => TOKEN_PIPE_PIPE,
            '!' => TOKEN_EXCLAMATION,
            '-' if self.try_consume_character('>') => TOKEN_MINUS_MORE,

            '&' => TOKEN_AMPERSAND,
            '|' => TOKEN_PIPE,

            '+' => TOKEN_PLUS,
            '-' => TOKEN_MINUS,
            '*' => TOKEN_ASTERISK,
            '^' => TOKEN_CARET,
            // `/` is division only when not followed by path content;
            // otherwise the path branch below takes it.
            '/' if self
                .peek_character()
                .is_none_or(|c| !is_valid_path_character(c)) =>
            {
                TOKEN_SLASH
            },

            // Radix-prefixed number: 0b/0o/0x (binary/octal/hex). Only hex
            // supports a scientific exponent (`p`).
            '0' if let Some('b' | 'B' | 'o' | 'O' | 'x' | 'X') = self.peek_character() => {
                #[expect(clippy::type_complexity)]
                let (is_valid_digit, consume_scientific): (
                    fn(char) -> bool,
                    fn(&mut Self) -> Option<Kind>,
                ) = match self.consume_character() {
                    Some('b' | 'B') => (|c| matches!(c, '0' | '1' | '_'), |_| None),
                    Some('o' | 'O') => (|c| matches!(c, '0'..='7' | '_'), |_| None),
                    Some('x' | 'X') => {
                        (
                            |c| c.is_ascii_hexdigit() || c == '_',
                            |this| this.consume_scientific_base16(),
                        )
                    },
                    _ => unreachable!(),
                };

                let digits_len = self.consume_while(is_valid_digit);
                let digits = self.consumed_since(self.offset - digits_len);
                // No digits (or underscores only) after the prefix is an error.
                let mut error_kind = (digits.is_empty() || digits.bytes().all(|c| c == b'_'))
                    .then_some(TOKEN_ERROR_NUMBER_NO_DIGIT);

                let default_kind = if self.peek_character() == Some('.')
                    && self.peek_character_nth(1).is_some_and(is_valid_digit)
                {
                    self.consume_character();

                    // NOTE(review): unlike the integer-part check above, this
                    // only tests `is_empty()` (no underscore-only check), and
                    // the guard already guarantees at least one digit char —
                    // so a missing-integer-digit error is always cleared by a
                    // fractional part (e.g. `0x__.5`). Confirm intended.
                    error_kind = error_kind.and({
                        let digits_len = self.consume_while(is_valid_digit);
                        let digits = self.consumed_since(self.offset - digits_len);
                        digits.is_empty().then_some(TOKEN_ERROR_NUMBER_NO_DIGIT)
                    });

                    TOKEN_FLOAT
                } else {
                    TOKEN_INTEGER
                };

                // The exponent must be consumed regardless of `error_kind`,
                // hence eager `or` rather than `or_else`.
                error_kind
                    .or(consume_scientific(self))
                    .unwrap_or(default_kind)
            },

            // Leading-dot float literal, e.g. `.123`.
            '.' if let is_valid_digit = (|c: char| c.is_ascii_digit() || c == '_')
                && self.peek_character().is_some_and(is_valid_digit) =>
            {
                self.consume_while(is_valid_digit);

                self.consume_scientific_base10().unwrap_or(TOKEN_FLOAT)
            },

            // Decimal integer or float.
            initial_digit if initial_digit.is_ascii_digit() => {
                let is_valid_digit = |c: char| c.is_ascii_digit() || c == '_';

                self.consume_while(is_valid_digit);

                let default_kind = if self.peek_character() == Some('.')
                    && self.peek_character_nth(1).is_some_and(is_valid_digit)
                {
                    self.consume_character();
                    self.consume_while(is_valid_digit);
                    TOKEN_FLOAT
                } else {
                    TOKEN_INTEGER
                };

                self.consume_scientific_base10().unwrap_or(default_kind)
            },

            // After the `.123` literal parsing.
            '.' => TOKEN_PERIOD,

            // Plain identifier or keyword.
            initial_letter if is_valid_initial_plain_identifier_character(initial_letter) => {
                // Compile-time perfect-hash keyword table.
                const KEYWORDS: phf::Map<&'static str, Kind> = phf::phf_map! {
                    "if" => TOKEN_KEYWORD_IF,
                    "then" => TOKEN_KEYWORD_THEN,
                    "else" => TOKEN_KEYWORD_ELSE,
                };

                self.consume_while(is_valid_plain_identifier_character);

                KEYWORDS
                    .get(self.consumed_since(start))
                    .copied()
                    .unwrap_or(TOKEN_IDENTIFIER)
            },

            // \(foo)/bar/baz.txt
            start @ '\\' => {
                // Rewind so TOKEN_PATH_START is zero-width and the `\` stays
                // part of the path content.
                self.offset -= start.len_utf8();
                self.context_push(Context::Path);

                TOKEN_PATH_START
            },
            // /bar/baz.txt
            start @ '/' if self.peek_character().is_some_and(is_valid_path_character) => {
                self.offset -= start.len_utf8();
                self.context_push(Context::Path);

                TOKEN_PATH_START
            },

            '@' => TOKEN_AT,

            // Delimited literal start: a quote optionally followed by `=`s
            // that must be repeated before the closing quote.
            start @ ('`' | '"' | '\'') => {
                let equals_len = self.consume_while(|c| c == '=');
                let equals = self.consumed_since(self.offset - equals_len);

                self.context_push(Context::Delimited {
                    before: Some(equals),
                    end: start,
                });

                match start {
                    '`' => TOKEN_QUOTED_IDENTIFIER_START,
                    '\"' => TOKEN_STRING_START,
                    '\'' => TOKEN_CHAR_START,
                    _ => unreachable!(),
                }
            },

            _ => TOKEN_ERROR_UNKNOWN,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    macro_rules! assert_matches {
        ($expression:expr, $pattern:pat) => {
            assert!(matches!($expression, $pattern))
        };
    }

    // Asserts the exact (kind, text) token sequence for a source string,
    // including that the stream ends afterwards.
    macro_rules! assert_token_matches {
        ($string:literal, $($pattern:pat),* $(,)?) => {{
            let mut tokens = tokenize($string);

            $(assert_matches!(tokens.next(), Some($pattern));)*

            assert!(tokens.next().is_none());
        }};
    }

    // An interpolation directly before the closing quote still yields an
    // (empty) content token.
    #[test]
    fn empty_tokens() {
        assert_token_matches!(
            r#""foo \(bar)""#,
            (TOKEN_STRING_START, r#"""#),
            (TOKEN_CONTENT, "foo "),
            (TOKEN_INTERPOLATION_START, r"\("),
            (TOKEN_IDENTIFIER, "bar"),
            (TOKEN_INTERPOLATION_END, ")"),
            (TOKEN_CONTENT, ""),
            (TOKEN_STRING_END, r#"""#),
        );
    }

    #[test]
    fn number_errors() {
        assert_token_matches!(
            "0b__e 0x0 0x123.0e 0o777.0e",
            (TOKEN_ERROR_NUMBER_NO_DIGIT, "0b__"),
            (TOKEN_IDENTIFIER, "e"),
            (TOKEN_SPACE, " "),
            (TOKEN_INTEGER, "0x0"),
            (TOKEN_SPACE, " "),
            (TOKEN_FLOAT, "0x123.0e"), // e is a valid hexadecimal digit.
            (TOKEN_SPACE, " "),
            (TOKEN_FLOAT, "0o777.0"),
            (TOKEN_IDENTIFIER, "e"),
        );
    }

    // Paths support interpolation and non-ASCII characters; the start/end
    // markers are zero-width.
    #[test]
    fn path() {
        assert_token_matches!(
            r"/foo\(𓃰)///baz",
            (TOKEN_PATH_START, ""),
            (TOKEN_CONTENT, "/foo"),
            (TOKEN_INTERPOLATION_START, r"\("),
            (TOKEN_IDENTIFIER, "𓃰"),
            (TOKEN_INTERPOLATION_END, ")"),
            (TOKEN_CONTENT, "///baz"),
            (TOKEN_PATH_END, ""),
        );
    }

    // Unknown characters do not coalesce into a single error token.
    #[test]
    fn errors_are_individual() {
        assert_token_matches!(
            "~~~",
            (TOKEN_ERROR_UNKNOWN, "~"),
            (TOKEN_ERROR_UNKNOWN, "~"),
            (TOKEN_ERROR_UNKNOWN, "~")
        );
    }
}