// cab/syntax/tokenizer.rs
  1  use smallvec::SmallVec;
  2  
  3  use crate::Kind::{
  4     self,
  5     *,
  6  };
  7  
/// Returns an iterator of tokens that reference the given string.
///
/// Each item pairs a token [`Kind`] with the exact slice of `source` it was
/// lexed from; tokens are contiguous, so concatenating the slices of a full
/// pass reproduces `source`.
pub fn tokenize(source: &str) -> impl Iterator<Item = (Kind, &str)> {
   Tokenizer::new(source)
}
 12  
 13  /// Returns whether this identifier can be represented without quotes.
 14  pub fn is_valid_plain_identifier(s: &str) -> bool {
 15     let mut chars = s.chars();
 16  
 17     chars
 18        .by_ref()
 19        .next()
 20        .is_some_and(is_valid_initial_plain_identifier_character)
 21        && chars.all(is_valid_plain_identifier_character)
 22  }
 23  
 24  fn is_valid_initial_plain_identifier_character(c: char) -> bool {
 25     let invalid = c.is_ascii_digit() || c == '-' || c == '\'';
 26  
 27     !invalid && is_valid_plain_identifier_character(c)
 28  }
 29  
 30  fn is_valid_plain_identifier_character(c: char) -> bool {
 31     c.is_alphanumeric() || matches!(c, '_' | '-' | '\'')
 32  }
 33  
 34  fn is_valid_path_character(c: char) -> bool {
 35     c.is_alphanumeric() || matches!(c, '.' | '/' | '_' | '-' | '\\' | '(' | ')')
 36  }
 37  
/// A lexing context, kept on a stack so nested constructs (a string
/// containing an interpolation containing another string, ...) resume
/// correctly when the inner one finishes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Context<'a> {
   /// Inside a path literal; content is consumed until a non-path character
   /// or an interpolation opener.
   Path,
   /// A path just finished; the next token emitted is `TOKEN_PATH_END`.
   PathEnd,

   /// Inside a delimited literal (quoted identifier, string, or char) whose
   /// closer is `before` (the opener's run of `=`) followed by `end`.
   Delimited {
      before: Option<&'a str>,
      end:    char,
   },
   /// The closing delimiter is next in the input; consuming it emits the
   /// matching `*_END` token.
   DelimitedEnd {
      before: Option<&'a str>,
      end:    char,
   },

   /// A `\(` interpolation opener is next in the input (detected but not yet
   /// consumed by `consume_delimited_segment`).
   InterpolationStart,
   /// Inside `\( ... )`; `parentheses` counts unmatched inner `(` so the
   /// `)` that closes the interpolation can be recognized.
   Interpolation {
      parentheses: usize,
   },
}
 57  
/// The token iterator returned by [`tokenize`].
#[derive(Debug, Clone, PartialEq, Eq)]
struct Tokenizer<'a> {
   /// The full source text being tokenized.
   source: &'a str,
   /// Byte offset into `source` of the next unconsumed character.
   offset: usize,

   /// Stack of active lexing contexts; empty at the top level.
   context: SmallVec<Context<'a>, 4>,
}
 65  
 66  impl<'a> Iterator for Tokenizer<'a> {
 67     type Item = (Kind, &'a str);
 68  
 69     fn next(&mut self) -> Option<Self::Item> {
 70        let start = self.offset;
 71  
 72        self
 73           .consume_kind()
 74           .map(|kind| (kind, self.consumed_since(start)))
 75     }
 76  }
 77  
impl<'a> Tokenizer<'a> {
   /// Creates a tokenizer positioned at the start of `source` with an empty
   /// context stack.
   fn new(source: &'a str) -> Self {
      Self {
         source,
         offset: 0,
         context: SmallVec::new(),
      }
   }

   /// Pushes a lexing context onto the stack.
   fn context_push(&mut self, context: Context<'a>) {
      self.context.push(context);
   }

   /// Pops the topmost context, asserting it is exactly `context` — a
   /// mismatch is a tokenizer bug, not bad input.
   fn context_pop(&mut self, context: Context) {
      assert_eq!(self.context.last(), Some(&context));
      self.context.pop();
   }

   /// The not-yet-consumed tail of the source.
   fn remaining(&self) -> &str {
      &self.source[self.offset..]
   }

   /// Peeks `n` characters ahead without consuming anything.
   fn peek_character_nth(&self, n: usize) -> Option<char> {
      self.remaining().chars().nth(n)
   }

   /// Peeks the next character without consuming it.
   fn peek_character(&self) -> Option<char> {
      self.peek_character_nth(0)
   }

   /// Consumes characters while `predicate` holds; returns the number of
   /// bytes (not characters) consumed.
   fn consume_while(&mut self, predicate: impl Fn(char) -> bool) -> usize {
      let len: usize = self
         .remaining()
         .chars()
         .take_while(|&c| predicate(c))
         .map(char::len_utf8)
         .sum();

      self.offset += len;
      len
   }

   /// Consumes `c` if it is next; returns whether it was consumed.
   fn try_consume_character(&mut self, c: char) -> bool {
      let starts_with = self.peek_character() == Some(c);

      if starts_with {
         self.offset += c.len_utf8();
      }

      starts_with
   }

   /// Consumes the literal string `s` if the remaining input starts with
   /// it; returns whether it was consumed.
   fn try_consume_string(&mut self, s: &str) -> bool {
      let starts_with = self.remaining().starts_with(s);

      if starts_with {
         self.offset += s.len();
      }

      starts_with
   }

   /// The slice of source consumed since the byte offset `offset`.
   fn consumed_since(&self, offset: usize) -> &'a str {
      &self.source[offset..self.offset]
   }

   /// Consumes and returns the next character, if any.
   fn consume_character(&mut self) -> Option<char> {
      let next = self.peek_character()?;
      self.offset += next.len_utf8();
      Some(next)
   }

   /// Handles one unit of delimited (string/char/identifier/path) content.
   ///
   /// Returns `Some(TOKEN_CONTENT)` when the current content token must end
   /// here because an interpolation opener `\(` follows — the opener is left
   /// unconsumed and an `InterpolationStart` context is pushed. Returns
   /// `None` after consuming either one plain character or a two-character
   /// `\x` escape.
   fn consume_delimited_segment(&mut self) -> Option<Kind> {
      match self
         .peek_character()
         .expect("caller must ensure there is content")
      {
         '\\' if self.peek_character_nth(1) == Some('(') => {
            self.context_push(Context::InterpolationStart);

            Some(TOKEN_CONTENT)
         },

         '\\' => {
            self.consume_character();
            self.consume_character();

            None
         },

         _ => {
            self.consume_character();

            None
         },
      }
   }

   /// Consumes content inside a delimited literal until the closing
   /// delimiter, the end of input, or an interpolation opener. Always
   /// yields a `TOKEN_CONTENT` (possibly empty).
   fn consume_delimited(&mut self, before: Option<&'a str>, end: char) -> Kind {
      loop {
         let remaining = self.remaining();

         // The closing delimiter: `before` (the opener's `=` run, possibly
         // empty) immediately followed by `end`.
         if before.is_none_or(|before| remaining.starts_with(before))
            && remaining
               .get(before.map_or(0, str::len)..)
               .is_some_and(|remaining| remaining.starts_with(end))
         {
            self.context_pop(Context::Delimited { before, end });
            self.context_push(Context::DelimitedEnd { before, end });

            return TOKEN_CONTENT;
         }

         // Unterminated literal: end the content at EOF without pushing a
         // `DelimitedEnd`, so no closing token is ever emitted.
         if self.peek_character().is_none() {
            self.context_pop(Context::Delimited { before, end });

            return TOKEN_CONTENT;
         }

         if let Some(kind) = self.consume_delimited_segment() {
            return kind;
         }
      }
   }

   /// Consumes path content until a non-path character (or EOF) or an
   /// interpolation opener.
   fn consume_path(&mut self) -> Kind {
      loop {
         if self
            .peek_character()
            .is_none_or(|c| !is_valid_path_character(c))
         {
            self.context_pop(Context::Path);
            self.context_push(Context::PathEnd);

            return TOKEN_CONTENT;
         }

         if let Some(kind) = self.consume_delimited_segment() {
            return kind;
         }
      }
   }

   /// Consumes an `e`/`E` exponent (with optional sign) for a base-10
   /// float. Returns `None` when no exponent marker is present.
   fn consume_scientific_base10(&mut self) -> Option<Kind> {
      if !(self.try_consume_character('e') || self.try_consume_character('E')) {
         return None;
      }

      let _ = self.try_consume_character('+') || self.try_consume_character('-');

      let exponent_len = self.consume_while(|c| c.is_ascii_digit() || c == '_');
      let exponent = self.consumed_since(self.offset - exponent_len);

      Some(
         // `1e` or `1e__` is a float with a missing exponent.
         if exponent.is_empty() || exponent.bytes().all(|c| c == b'_') {
            TOKEN_ERROR_FLOAT_NO_EXPONENT
         } else {
            TOKEN_FLOAT
         },
      )
   }

   /// Consumes a `p`/`P` exponent (with optional sign) for a base-16
   /// float. Returns `None` when no exponent marker is present.
   ///
   /// NOTE(review): the exponent digits accepted here are hexadecimal,
   /// unlike the decimal exponents of C-style hex floats — presumably
   /// intentional for this language; confirm against the spec.
   fn consume_scientific_base16(&mut self) -> Option<Kind> {
      if !(self.try_consume_character('p') || self.try_consume_character('P')) {
         return None;
      }

      let _ = self.try_consume_character('+') || self.try_consume_character('-');

      let exponent_len = self.consume_while(|c| c.is_ascii_hexdigit() || c == '_');
      let exponent = self.consumed_since(self.offset - exponent_len);

      Some(
         if exponent.is_empty() || exponent.bytes().all(|c| c == b'_') {
            TOKEN_ERROR_FLOAT_NO_EXPONENT
         } else {
            TOKEN_FLOAT
         },
      )
   }

   /// Consumes the next token and returns its kind, or `None` at the end of
   /// the input. The caller recovers the token's text via
   /// [`Self::consumed_since`].
   #[expect(
      clippy::cognitive_complexity,
      reason = "it's not complex, it's just a single match expression"
   )]
   fn consume_kind(&mut self) -> Option<Kind> {
      let start = self.offset;

      // Active contexts (paths, delimited literals, interpolation plumbing)
      // override normal tokenization. `Interpolation` and the empty stack
      // fall through to the ordinary token match below.
      match self.context.last().copied() {
         Some(Context::Path) => {
            return Some(self.consume_path());
         },
         Some(Context::PathEnd) => {
            self.context_pop(Context::PathEnd);

            return Some(TOKEN_PATH_END);
         },

         Some(Context::Delimited { before, end }) => {
            return Some(self.consume_delimited(before, end));
         },
         Some(Context::DelimitedEnd { before, end }) => {
            // `consume_delimited` already verified the closer is next, so
            // these must succeed.
            if let Some(before) = before {
               assert!(self.try_consume_string(before));
            }
            assert_eq!(self.consume_character(), Some(end));

            self.context_pop(Context::DelimitedEnd { before, end });

            return Some(match end {
               '`' => TOKEN_QUOTED_IDENTIFIER_END,
               '"' => TOKEN_STRING_END,
               '\'' => TOKEN_CHAR_END,
               _ => unreachable!(),
            });
         },

         Some(Context::InterpolationStart) => {
            // `consume_delimited_segment` already verified `\(` is next.
            assert!(self.try_consume_string(r"\("));

            self.context_pop(Context::InterpolationStart);
            self.context_push(Context::Interpolation { parentheses: 0 });
            return Some(TOKEN_INTERPOLATION_START);
         },
         Some(Context::Interpolation { .. }) => {},

         None => {},
      }

      Some(match self.consume_character()? {
         // Whitespace runs collapse into a single token.
         c if c.is_whitespace() => {
            self.consume_while(char::is_whitespace);

            TOKEN_SPACE
         },

         // Block comments: `#=` ... `=#`, where the closer must repeat
         // exactly as many `=` as the opener.
         '#' if self.peek_character() == Some('=') => {
            let equals_len = self.consume_while(|c| c == '=');
            let equals = self.consumed_since(self.offset - equals_len);

            loop {
               match self.peek_character() {
                  Some('=')
                     if let remaining = self.remaining()
                        && remaining.starts_with(equals)
                        && remaining.as_bytes().get(equals_len).copied() == Some(b'#') =>
                  {
                     // Hard code a 1 here because that comparison up top is a byte.
                     self.offset += equals_len + 1;

                     break TOKEN_COMMENT;
                  },

                  // #= ==# is not a closed comment.
                  Some('=') => {
                     self.consume_while(|c| c == '=');
                  },

                  // Nested block comment: recurse to consume it whole.
                  Some('#') if self.peek_character_nth(1) == Some('=') => {
                     self.consume_kind();
                  },

                  Some(_) => {
                     self.consume_character();
                  },

                  // Unterminated block comments still lex as comments.
                  None => {
                     break TOKEN_COMMENT;
                  },
               }
            }
         },

         // Line comments run to (but do not include) the newline.
         '#' => {
            self.consume_while(|c| !matches!(c, '\n'));

            TOKEN_COMMENT
         },

         ',' => TOKEN_COMMA,
         ';' => TOKEN_SEMICOLON,

         '<' if self.try_consume_character('|') => TOKEN_LESS_PIPE,
         '|' if self.try_consume_character('>') => TOKEN_PIPE_MORE,

         // Inside an interpolation, parentheses are counted so the `)` that
         // closes the interpolation can be told apart from nested ones.
         '(' if let Some(&mut Context::Interpolation {
            ref mut parentheses,
         }) = self.context.last_mut() =>
         {
            *parentheses += 1;
            TOKEN_PARENTHESIS_LEFT
         },
         ')' if let Some(&mut Context::Interpolation {
            ref mut parentheses,
         }) = self.context.last_mut() =>
         {
            match parentheses.checked_sub(1) {
               Some(new) => {
                  *parentheses = new;
                  TOKEN_PARENTHESIS_RIGHT
               },

               None => {
                  self.context_pop(Context::Interpolation { parentheses: 0 });
                  TOKEN_INTERPOLATION_END
               },
            }
         },

         '(' => TOKEN_PARENTHESIS_LEFT,
         ')' => TOKEN_PARENTHESIS_RIGHT,

         '=' if self.try_consume_character('>') => TOKEN_EQUAL_MORE,

         ':' => TOKEN_COLON,
         '+' if self.try_consume_character('+') => TOKEN_PLUS_PLUS,
         '[' => TOKEN_BRACKET_LEFT,
         ']' => TOKEN_BRACKET_RIGHT,

         '/' if self.try_consume_character('/') => TOKEN_SLASH_SLASH,
         '{' => TOKEN_CURLYBRACE_LEFT,
         '}' => TOKEN_CURLYBRACE_RIGHT,

         '<' if self.try_consume_character('=') => TOKEN_LESS_EQUAL,
         '<' => TOKEN_LESS,
         '>' if self.try_consume_character('=') => TOKEN_MORE_EQUAL,
         '>' => TOKEN_MORE,

         '!' if self.try_consume_character('=') => TOKEN_EXCLAMATION_EQUAL,
         '=' => TOKEN_EQUAL,

         '&' if self.try_consume_character('&') => TOKEN_AMPERSAND_AMPERSAND,
         '|' if self.try_consume_character('|') => TOKEN_PIPE_PIPE,
         '!' => TOKEN_EXCLAMATION,
         '-' if self.try_consume_character('>') => TOKEN_MINUS_MORE,

         '&' => TOKEN_AMPERSAND,
         '|' => TOKEN_PIPE,

         '+' => TOKEN_PLUS,
         '-' => TOKEN_MINUS,
         '*' => TOKEN_ASTERISK,
         '^' => TOKEN_CARET,
         // `/` is division only when path content does not follow; a path
         // start is handled further down.
         '/' if self
            .peek_character()
            .is_none_or(|c| !is_valid_path_character(c)) =>
         {
            TOKEN_SLASH
         },

         // Radix-prefixed numbers: 0b/0o/0x.
         '0' if let Some('b' | 'B' | 'o' | 'O' | 'x' | 'X') = self.peek_character() => {
            // Only hexadecimal numbers have a scientific notation (`p`).
            #[expect(clippy::type_complexity)]
            let (is_valid_digit, consume_scientific): (
               fn(char) -> bool,
               fn(&mut Self) -> Option<Kind>,
            ) = match self.consume_character() {
               Some('b' | 'B') => (|c| matches!(c, '0' | '1' | '_'), |_| None),
               Some('o' | 'O') => (|c| matches!(c, '0'..='7' | '_'), |_| None),
               Some('x' | 'X') => {
                  (
                     |c| c.is_ascii_hexdigit() || c == '_',
                     |this| this.consume_scientific_base16(),
                  )
               },
               _ => unreachable!(),
            };

            let digits_len = self.consume_while(is_valid_digit);
            let digits = self.consumed_since(self.offset - digits_len);
            // No digits (or only `_`) after the radix prefix is an error.
            let mut error_kind = (digits.is_empty() || digits.bytes().all(|c| c == b'_'))
               .then_some(TOKEN_ERROR_NUMBER_NO_DIGIT);

            // A `.` only belongs to the number when a valid digit follows.
            let default_kind = if self.peek_character() == Some('.')
               && self.peek_character_nth(1).is_some_and(is_valid_digit)
            {
               self.consume_character();

               // A non-empty fractional part clears the integer-part error.
               // NOTE(review): a fraction of only `_` also clears it (only
               // emptiness is checked here, unlike the integer part above)
               // — confirm that is intended.
               error_kind = error_kind.and({
                  let digits_len = self.consume_while(is_valid_digit);
                  let digits = self.consumed_since(self.offset - digits_len);
                  digits.is_empty().then_some(TOKEN_ERROR_NUMBER_NO_DIGIT)
               });

               TOKEN_FLOAT
            } else {
               TOKEN_INTEGER
            };

            // `Option::or` evaluates its argument eagerly, so the exponent
            // (if any) is always consumed even when an error kind was
            // already decided; the error kind still wins.
            error_kind
               .or(consume_scientific(self))
               .unwrap_or(default_kind)
         },

         // Floats that start with the decimal point, e.g. `.5`.
         '.' if let is_valid_digit = (|c: char| c.is_ascii_digit() || c == '_')
            && self.peek_character().is_some_and(is_valid_digit) =>
         {
            self.consume_while(is_valid_digit);

            self.consume_scientific_base10().unwrap_or(TOKEN_FLOAT)
         },

         initial_digit if initial_digit.is_ascii_digit() => {
            let is_valid_digit = |c: char| c.is_ascii_digit() || c == '_';

            self.consume_while(is_valid_digit);

            // A `.` only belongs to the number when a digit follows it, so
            // `1.foo` lexes as `1` `.` `foo`.
            let default_kind = if self.peek_character() == Some('.')
               && self.peek_character_nth(1).is_some_and(is_valid_digit)
            {
               self.consume_character();
               self.consume_while(is_valid_digit);
               TOKEN_FLOAT
            } else {
               TOKEN_INTEGER
            };

            self.consume_scientific_base10().unwrap_or(default_kind)
         },

         // After the `.123` literal parsing.
         '.' => TOKEN_PERIOD,

         initial_letter if is_valid_initial_plain_identifier_character(initial_letter) => {
            const KEYWORDS: phf::Map<&'static str, Kind> = phf::phf_map! {
                "if" => TOKEN_KEYWORD_IF,
                "then" => TOKEN_KEYWORD_THEN,
                "else" => TOKEN_KEYWORD_ELSE,
            };

            self.consume_while(is_valid_plain_identifier_character);

            KEYWORDS
               .get(self.consumed_since(start))
               .copied()
               .unwrap_or(TOKEN_IDENTIFIER)
         },

         // \(foo)/bar/baz.txt
         // The consumed character is rewound (this `start` shadows the char,
         // not the offset), so TOKEN_PATH_START carries an empty slice and
         // the `Path` context re-reads the `\` as content.
         start @ '\\' => {
            self.offset -= start.len_utf8();
            self.context_push(Context::Path);

            TOKEN_PATH_START
         },
         // /bar/baz.txt
         start @ '/' if self.peek_character().is_some_and(is_valid_path_character) => {
            self.offset -= start.len_utf8();
            self.context_push(Context::Path);

            TOKEN_PATH_START
         },

         '@' => TOKEN_AT,

         // Delimited literal openers, with an optional run of `=` after the
         // delimiter that the closer must repeat before its delimiter.
         start @ ('`' | '"' | '\'') => {
            let equals_len = self.consume_while(|c| c == '=');
            let equals = self.consumed_since(self.offset - equals_len);

            self.context_push(Context::Delimited {
               before: Some(equals),
               end:    start,
            });

            match start {
               '`' => TOKEN_QUOTED_IDENTIFIER_START,
               '\"' => TOKEN_STRING_START,
               '\'' => TOKEN_CHAR_START,
               _ => unreachable!(),
            }
         },

         _ => TOKEN_ERROR_UNKNOWN,
      })
   }
}
553  
#[cfg(test)]
mod tests {
   use super::*;

   /// Asserts that `tokenize($source)` yields exactly the given
   /// `(kind, text)` patterns, in order, with no tokens left over.
   macro_rules! assert_tokens {
      ($source:literal, $($pattern:pat),* $(,)?) => {{
         let mut stream = tokenize($source);

         $(
            let token = stream.next();
            assert!(matches!(token, Some($pattern)));
         )*

         assert!(stream.next().is_none());
      }};
   }

   #[test]
   fn empty_tokens() {
      assert_tokens!(
         r#""foo \(bar)""#,
         (TOKEN_STRING_START, r#"""#),
         (TOKEN_CONTENT, "foo "),
         (TOKEN_INTERPOLATION_START, r"\("),
         (TOKEN_IDENTIFIER, "bar"),
         (TOKEN_INTERPOLATION_END, ")"),
         (TOKEN_CONTENT, ""),
         (TOKEN_STRING_END, r#"""#),
      );
   }

   #[test]
   fn number_errors() {
      assert_tokens!(
         "0b__e 0x0 0x123.0e 0o777.0e",
         (TOKEN_ERROR_NUMBER_NO_DIGIT, "0b__"),
         (TOKEN_IDENTIFIER, "e"),
         (TOKEN_SPACE, " "),
         (TOKEN_INTEGER, "0x0"),
         (TOKEN_SPACE, " "),
         (TOKEN_FLOAT, "0x123.0e"), // e is a valid hexadecimal digit.
         (TOKEN_SPACE, " "),
         (TOKEN_FLOAT, "0o777.0"),
         (TOKEN_IDENTIFIER, "e"),
      );
   }

   #[test]
   fn path() {
      assert_tokens!(
         r"/foo\(𓃰)///baz",
         (TOKEN_PATH_START, ""),
         (TOKEN_CONTENT, "/foo"),
         (TOKEN_INTERPOLATION_START, r"\("),
         (TOKEN_IDENTIFIER, "𓃰"),
         (TOKEN_INTERPOLATION_END, ")"),
         (TOKEN_CONTENT, "///baz"),
         (TOKEN_PATH_END, ""),
      );
   }

   #[test]
   fn errors_are_individual() {
      assert_tokens!(
         "~~~",
         (TOKEN_ERROR_UNKNOWN, "~"),
         (TOKEN_ERROR_UNKNOWN, "~"),
         (TOKEN_ERROR_UNKNOWN, "~")
      );
   }
}