/ console / network / environment / src / helpers / sanitizer.rs
sanitizer.rs
  1  // Copyright (c) 2025-2026 ACDC Network
  2  // This file is part of the alphavm library.
  3  //
  4  // Alpha Chain | Delta Chain Protocol
  5  // International Monetary Graphite.
  6  //
  7  // Derived from Aleo (https://aleo.org) and ProvableHQ (https://provable.com).
  8  // They built world-class ZK infrastructure. We installed the EASY button.
  9  // Their cryptography: elegant. Our modifications: bureaucracy-compatible.
 10  // Original brilliance: theirs. Robert's Rules: ours. Bugs: definitely ours.
 11  //
 12  // Original Aleo/ProvableHQ code subject to Apache 2.0 https://www.apache.org/licenses/LICENSE-2.0
 13  // All modifications and new work: CC0 1.0 Universal Public Domain Dedication.
 14  // No rights reserved. No permission required. No warranty. No refunds.
 15  //
 16  // https://creativecommons.org/publicdomain/zero/1.0/
 17  // SPDX-License-Identifier: CC0-1.0
 18  
 19  use crate::{string_parser::is_char_supported, ParserResult};
 20  
 21  use nom::{
 22      branch::alt,
 23      bytes::complete::tag,
 24      character::complete::{anychar, char, line_ending, multispace1},
 25      combinator::{cut, map, recognize, value, verify},
 26      error::{ErrorKind, VerboseError, VerboseErrorKind},
 27      multi::fold_many0,
 28      sequence::{preceded, terminated},
 29  };
 30  
 31  pub struct Sanitizer;
 32  
 33  impl Sanitizer {
 34      /// Removes all leading whitespaces and comments from the given input, returning the sanitized input.
 35      pub fn parse(string: &str) -> ParserResult<'_, &str> {
 36          preceded(Self::parse_whitespaces, Self::parse_comments)(string)
 37      }
 38  
 39      /// Removes leading whitespaces from the given input.
 40      pub fn parse_whitespaces(string: &str) -> ParserResult<'_, &str> {
 41          recognize(Self::many0_(alt((multispace1, tag("\\\n")))))(string)
 42      }
 43  
 44      /// Removes multiple leading comments from the given input.
 45      pub fn parse_comments(string: &str) -> ParserResult<'_, &str> {
 46          recognize(Self::many0_(terminated(Self::parse_comment, Self::parse_whitespaces)))(string)
 47      }
 48  
 49      /// Removes the first leading comment from the given input.
 50      pub fn parse_comment(string: &str) -> ParserResult<'_, &str> {
 51          preceded(
 52              char('/'),
 53              alt((preceded(char('/'), cut(Self::str_till_eol)), preceded(char('*'), cut(Self::str_till_star_slash)))),
 54          )(string)
 55      }
 56  
 57      /// Parse a safe character (in the sense explained in [is_char_supported]).
 58      /// Returns an error if no character is found or a non-safe character is found.
 59      /// The character is returned, along with the remaining input.
 60      ///
 61      /// This is used for otherwise unconstrained characters
 62      /// in (line and block) comments and in string literals.
 63      ///
 64      /// Note also that the `nom` documentation for `anychar` says that
 65      /// it matches one byte as a character.
 66      /// However, simple experiments show that it matches a Unicode character,
 67      /// e.g. attempting to parse `"\u{4141}"` yields one CJK character and exhausts the input,
 68      /// as opposed to returning `A` and leaving another `A` in the input.
 69      pub fn parse_safe_char(string: &str) -> ParserResult<'_, char> {
 70          fn is_safe(ch: &char) -> bool {
 71              is_char_supported(*ch)
 72          }
 73          verify(anychar, is_safe)(string)
 74      }
 75  }
 76  
 77  impl Sanitizer {
 78      /// End-of-input parser.
 79      ///
 80      /// Yields `()` if the parser is at the end of the input; an error otherwise.
 81      fn eoi(string: &str) -> ParserResult<'_, ()> {
 82          match string.is_empty() {
 83              true => Ok((string, ())),
 84              false => {
 85                  Err(nom::Err::Error(VerboseError { errors: vec![(string, VerboseErrorKind::Nom(ErrorKind::Eof))] }))
 86              }
 87          }
 88      }
 89  
 90      /// A parser that accepts:
 91      /// - A newline, either `CR LF` or just `LF`.
 92      /// - The end of input.
 93      fn eol(string: &str) -> ParserResult<'_, ()> {
 94          alt((
 95              Self::eoi, // this one goes first because it’s very cheap
 96              value((), line_ending),
 97          ))(string)
 98      }
 99  
100      /// Apply the `f` parser until `g` succeeds. Both parsers consume the input.
101      fn till<'a, A, B, F, G>(mut f: F, mut g: G) -> impl FnMut(&'a str) -> ParserResult<'a, ()>
102      where
103          F: FnMut(&'a str) -> ParserResult<'a, A>,
104          G: FnMut(&'a str) -> ParserResult<'a, B>,
105      {
106          move |mut i| loop {
107              if let Ok((i2, _)) = g(i) {
108                  break Ok((i2, ()));
109              }
110  
111              let (i2, _) = f(i)?;
112              i = i2;
113          }
114      }
115  
116      /// Parse a string until the end of line.
117      ///
118      /// This parser accepts the multiline annotation (`\ LF`) to break the string on several lines.
119      ///
120      /// The line may end with a newline (either `CR LF` or just `LF`), or it may end with the input.
121      ///
122      /// Return the body of the comment, i.e. what is between `//` and the end of line.
123      /// If the line ends with `CR LF`, the `CR` is included in the returned body.
124      /// The `LF`, if present, is never included in the returned body.
125      fn str_till_eol(string: &str) -> ParserResult<'_, &str> {
126          // A heuristic approach is applied here in order to avoid costly parsing operations in the
127          // most common scenarios: non-parsing methods are used to verify if the string has multiple
128          // lines and if there are any unsafe characters.
129          if let Some((before, after)) = string.split_once('\n') {
130              let is_multiline = before.ends_with('\\'); // is `LF` preceded by `\`?
131  
132              if !is_multiline {
133                  let contains_unsafe_chars = !before.chars().all(is_char_supported);
134  
135                  if !contains_unsafe_chars {
136                      Ok((after, before))
137                  } else {
138                      // `eoi` is used here instead of `eol`, since the earlier call to `split_once`
139                      // already removed the `LF`. This will fail at the first unsafe character,
140                      // which is known to exist because we are under the condition contains_unsafe_chars.
141                      recognize(Self::till(value((), Sanitizer::parse_safe_char), Self::eoi))(before)
142                  }
143              } else {
144                  map(
145                      recognize(Self::till(
146                          alt((value((), tag("\\\n")), value((), Sanitizer::parse_safe_char))),
147                          Self::eol,
148                      )),
149                      |i| {
150                          // Exclude the final `LF`, if any, from the comment body.
151                          if i.as_bytes().last() == Some(&b'\n') {
152                              &i[0..i.len() - 1]
153                          } else {
154                              i
155                          }
156                      },
157                  )(string)
158              }
159          } else if string.chars().all(is_char_supported) {
160              // There is no `LF`. We return all the characters up to the end of file.
161              Ok(("", string))
162          } else {
163              // `eoi` is used here because we are under the condition that there is no newline.
164              // This will fail at the first unsafe character, which is known to exist because
165              // we are under the condition that not all characters are safe.
166              recognize(Self::till(value((), Sanitizer::parse_safe_char), Self::eoi))(string)
167          }
168      }
169  
170      /// Parse a string until `*/` is encountered.
171      ///
172      /// This is used to parse the body of a block comment, after the opening `/*`.
173      ///
174      /// Return the body of the comment, i.e. what is between `/*` and `*/`.
175      fn str_till_star_slash(string: &str) -> ParserResult<'_, &str> {
176          map(recognize(Self::till(value((), Sanitizer::parse_safe_char), tag("*/"))), |i| {
177              &i[0..i.len() - 2] // subtract 2 to discard the closing `*/`
178          })(string)
179      }
180  
181      /// A version of many0 that discards the result of the parser, preventing allocating.
182      fn many0_<'a, A, F>(mut f: F) -> impl FnMut(&'a str) -> ParserResult<'a, ()>
183      where
184          F: FnMut(&'a str) -> ParserResult<'a, A>,
185      {
186          move |string| fold_many0(&mut f, || (), |_, _| ())(string)
187      }
188  }
189  
190  #[cfg(test)]
191  mod tests {
192      use super::*;
193  
194      #[test]
195      fn test_parse_safe_char() {
196          // test correct acceptance of ASCII and non-ASCII:
197          assert_eq!(("", 'A'), Sanitizer::parse_safe_char("A").unwrap());
198          assert_eq!((" and more", 'A'), Sanitizer::parse_safe_char("A and more").unwrap());
199          assert_eq!(("", '\u{4141}'), Sanitizer::parse_safe_char("\u{4141}").unwrap());
200          assert_eq!((" and more", '\u{4141}'), Sanitizer::parse_safe_char("\u{4141} and more").unwrap());
201  
202          // test rejection and acceptance of ASCII control characters:
203          assert!(Sanitizer::parse_safe_char("\x00").is_err());
204          assert!(Sanitizer::parse_safe_char("\x01").is_err());
205          assert!(Sanitizer::parse_safe_char("\x02").is_err());
206          assert!(Sanitizer::parse_safe_char("\x03").is_err());
207          assert!(Sanitizer::parse_safe_char("\x04").is_err());
208          assert!(Sanitizer::parse_safe_char("\x05").is_err());
209          assert!(Sanitizer::parse_safe_char("\x06").is_err());
210          assert!(Sanitizer::parse_safe_char("\x07").is_err());
211          assert!(Sanitizer::parse_safe_char("\x08").is_err());
212          assert!(Sanitizer::parse_safe_char("\x09").is_ok());
213          assert!(Sanitizer::parse_safe_char("\x0a").is_ok());
214          assert!(Sanitizer::parse_safe_char("\x0b").is_err());
215          assert!(Sanitizer::parse_safe_char("\x0c").is_err());
216          assert!(Sanitizer::parse_safe_char("\x0d").is_ok());
217          assert!(Sanitizer::parse_safe_char("\x0e").is_err());
218          assert!(Sanitizer::parse_safe_char("\x0f").is_err());
219          assert!(Sanitizer::parse_safe_char("\x10").is_err());
220          assert!(Sanitizer::parse_safe_char("\x11").is_err());
221          assert!(Sanitizer::parse_safe_char("\x12").is_err());
222          assert!(Sanitizer::parse_safe_char("\x13").is_err());
223          assert!(Sanitizer::parse_safe_char("\x14").is_err());
224          assert!(Sanitizer::parse_safe_char("\x15").is_err());
225          assert!(Sanitizer::parse_safe_char("\x16").is_err());
226          assert!(Sanitizer::parse_safe_char("\x17").is_err());
227          assert!(Sanitizer::parse_safe_char("\x18").is_err());
228          assert!(Sanitizer::parse_safe_char("\x19").is_err());
229          assert!(Sanitizer::parse_safe_char("\x1a").is_err());
230          assert!(Sanitizer::parse_safe_char("\x1b").is_err());
231          assert!(Sanitizer::parse_safe_char("\x1c").is_err());
232          assert!(Sanitizer::parse_safe_char("\x1d").is_err());
233          assert!(Sanitizer::parse_safe_char("\x1e").is_err());
234          assert!(Sanitizer::parse_safe_char("\x1f").is_err());
235          assert!(Sanitizer::parse_safe_char("\x7f").is_err());
236  
237          // test rejection of bidi characters, and acceptance of the ones just above/below:
238          assert!(Sanitizer::parse_safe_char("\u{2029}").is_ok());
239          assert!(Sanitizer::parse_safe_char("\u{202a}").is_err());
240          assert!(Sanitizer::parse_safe_char("\u{202b}").is_err());
241          assert!(Sanitizer::parse_safe_char("\u{202c}").is_err());
242          assert!(Sanitizer::parse_safe_char("\u{202d}").is_err());
243          assert!(Sanitizer::parse_safe_char("\u{202e}").is_err());
244          assert!(Sanitizer::parse_safe_char("\u{202f}").is_ok());
245          assert!(Sanitizer::parse_safe_char("\u{2065}").is_ok());
246          assert!(Sanitizer::parse_safe_char("\u{2066}").is_err());
247          assert!(Sanitizer::parse_safe_char("\u{2067}").is_err());
248          assert!(Sanitizer::parse_safe_char("\u{2068}").is_err());
249          assert!(Sanitizer::parse_safe_char("\u{2069}").is_err());
250          assert!(Sanitizer::parse_safe_char("\u{206a}").is_ok());
251      }
252  
253      #[test]
254      fn test_sanitize() {
255          // Whitespaces
256          assert_eq!(("hello world", ""), Sanitizer::parse("hello world").unwrap());
257          assert_eq!(("hello world", ""), Sanitizer::parse(" hello world").unwrap());
258          assert_eq!(("hello world", ""), Sanitizer::parse("  hello world").unwrap());
259          assert_eq!(("hello world", ""), Sanitizer::parse("\nhello world").unwrap());
260          assert_eq!(("hello world", ""), Sanitizer::parse(" \nhello world").unwrap());
261          assert_eq!(("hello world ", ""), Sanitizer::parse("hello world ").unwrap());
262  
263          // Comments
264          assert_eq!(("hello world", "// hello\n"), Sanitizer::parse("// hello\nhello world").unwrap());
265          assert_eq!(("hello world", "/* hello */"), Sanitizer::parse("/* hello */hello world").unwrap());
266          assert_eq!(("hello world", "/* hello */\n"), Sanitizer::parse("/* hello */\nhello world").unwrap());
267          assert_eq!(("hello world", "/** hello */"), Sanitizer::parse("/** hello */hello world").unwrap());
268          assert_eq!(("hello world", "/** hello */\n"), Sanitizer::parse("/** hello */\nhello world").unwrap());
269          assert_eq!(("/\nhello world", ""), Sanitizer::parse("/\nhello world").unwrap());
270  
271          // Whitespaces and comments
272          assert_eq!(("hello world", "// hello\n"), Sanitizer::parse(" \n// hello\nhello world").unwrap());
273          assert_eq!(("hello world", "/* hello */\n"), Sanitizer::parse(" \n /* hello */\nhello world").unwrap());
274          assert_eq!(("hello world", "/** hello */\n"), Sanitizer::parse(" \n\t  /** hello */\nhello world").unwrap());
275          assert_eq!(("/\nhello world", ""), Sanitizer::parse(" /\nhello world").unwrap());
276      }
277  
278      #[test]
279      fn test_whitespaces() {
280          assert_eq!(("hello world", ""), Sanitizer::parse_whitespaces("hello world").unwrap());
281          assert_eq!(("hello world", " "), Sanitizer::parse_whitespaces(" hello world").unwrap());
282          assert_eq!(("hello world", "  "), Sanitizer::parse_whitespaces("  hello world").unwrap());
283          assert_eq!(("hello world", "\n"), Sanitizer::parse_whitespaces("\nhello world").unwrap());
284          assert_eq!(("hello world", " \n"), Sanitizer::parse_whitespaces(" \nhello world").unwrap());
285          assert_eq!(("hello world", "\t"), Sanitizer::parse_whitespaces("\thello world").unwrap());
286          assert_eq!(("hello world", " \t"), Sanitizer::parse_whitespaces(" \thello world").unwrap());
287          assert_eq!(("hello world", " \n\t"), Sanitizer::parse_whitespaces(" \n\thello world").unwrap());
288          assert_eq!(("hello world ", ""), Sanitizer::parse_whitespaces("hello world ").unwrap());
289      }
290  
291      #[test]
292      fn test_comments() {
293          assert_eq!(("hello world", "// hello\n"), Sanitizer::parse_comments("// hello\nhello world").unwrap());
294          assert_eq!(("hello world", "/* hello */\n"), Sanitizer::parse_comments("/* hello */\nhello world").unwrap());
295          assert_eq!(("hello world", "/** hello */\n"), Sanitizer::parse_comments("/** hello */\nhello world").unwrap());
296          assert_eq!(("/\nhello world", ""), Sanitizer::parse_comments("/\nhello world").unwrap());
297          assert_eq!(
298              ("hello world", "// hel\u{4141}lo\n"),
299              Sanitizer::parse_comments("// hel\u{4141}lo\nhello world").unwrap()
300          );
301          assert_eq!(
302              ("hello world", "/* multi\n   line comment\n*/\n"),
303              Sanitizer::parse_comments("/* multi\n   line comment\n*/\nhello world").unwrap()
304          );
305          assert_eq!(
306              ("hello world", "// multiple\n// line\n// comments\n"),
307              Sanitizer::parse_comments("// multiple\n// line\n// comments\nhello world").unwrap()
308          );
309          assert_eq!(
310              ("hello world", "/* multi\n   line comment\n*/\n/* and\n   another\n   one\n*/\n"),
311              Sanitizer::parse_comments("/* multi\n   line comment\n*/\n/* and\n   another\n   one\n*/\nhello world")
312                  .unwrap()
313          );
314          assert_eq!(
315              ("hello world", "/* multi\n   line comment\n*/\n// two single\n// line comments\n/* and\n   another\n   multi-liner\n*/\n"),
316              Sanitizer::parse_comments("/* multi\n   line comment\n*/\n// two single\n// line comments\n/* and\n   another\n   multi-liner\n*/\nhello world").unwrap()
317          );
318          assert!(Sanitizer::parse_comments("// hel\x08lo\nhello world").is_err());
319          assert!(Sanitizer::parse_comments("// hel\u{2066}lo\nhello world").is_err());
320          assert!(Sanitizer::parse_comments("/* hel\x7flo */\nhello world").is_err());
321          assert!(Sanitizer::parse_comments("/* hel\u{202d}lo */\nhello world").is_err());
322          assert!(Sanitizer::parse_comments("/** hel\x00lo */\nhello world").is_err());
323          assert!(Sanitizer::parse_comments("/** hel\u{202a}lo */\nhello world").is_err());
324          assert!(Sanitizer::parse_comments("// unsafe \u{202a} no newline").is_err());
325      }
326  }