sanitizer.rs
1 // Copyright (c) 2025-2026 ACDC Network 2 // This file is part of the alphavm library. 3 // 4 // Alpha Chain | Delta Chain Protocol 5 // International Monetary Graphite. 6 // 7 // Derived from Aleo (https://aleo.org) and ProvableHQ (https://provable.com). 8 // They built world-class ZK infrastructure. We installed the EASY button. 9 // Their cryptography: elegant. Our modifications: bureaucracy-compatible. 10 // Original brilliance: theirs. Robert's Rules: ours. Bugs: definitely ours. 11 // 12 // Original Aleo/ProvableHQ code subject to Apache 2.0 https://www.apache.org/licenses/LICENSE-2.0 13 // All modifications and new work: CC0 1.0 Universal Public Domain Dedication. 14 // No rights reserved. No permission required. No warranty. No refunds. 15 // 16 // https://creativecommons.org/publicdomain/zero/1.0/ 17 // SPDX-License-Identifier: CC0-1.0 18 19 use crate::{string_parser::is_char_supported, ParserResult}; 20 21 use nom::{ 22 branch::alt, 23 bytes::complete::tag, 24 character::complete::{anychar, char, line_ending, multispace1}, 25 combinator::{cut, map, recognize, value, verify}, 26 error::{ErrorKind, VerboseError, VerboseErrorKind}, 27 multi::fold_many0, 28 sequence::{preceded, terminated}, 29 }; 30 31 pub struct Sanitizer; 32 33 impl Sanitizer { 34 /// Removes all leading whitespaces and comments from the given input, returning the sanitized input. 35 pub fn parse(string: &str) -> ParserResult<'_, &str> { 36 preceded(Self::parse_whitespaces, Self::parse_comments)(string) 37 } 38 39 /// Removes leading whitespaces from the given input. 40 pub fn parse_whitespaces(string: &str) -> ParserResult<'_, &str> { 41 recognize(Self::many0_(alt((multispace1, tag("\\\n")))))(string) 42 } 43 44 /// Removes multiple leading comments from the given input. 45 pub fn parse_comments(string: &str) -> ParserResult<'_, &str> { 46 recognize(Self::many0_(terminated(Self::parse_comment, Self::parse_whitespaces)))(string) 47 } 48 49 /// Removes the first leading comment from the given input. 50 pub fn parse_comment(string: &str) -> ParserResult<'_, &str> { 51 preceded( 52 char('/'), 53 alt((preceded(char('/'), cut(Self::str_till_eol)), preceded(char('*'), cut(Self::str_till_star_slash)))), 54 )(string) 55 } 56 57 /// Parse a safe character (in the sense explained in [is_char_supported]). 58 /// Returns an error if no character is found or a non-safe character is found. 59 /// The character is returned, along with the remaining input. 60 /// 61 /// This is used for otherwise unconstrained characters 62 /// in (line and block) comments and in string literals. 63 /// 64 /// Note also that the `nom` documentation for `anychar` says that 65 /// it matches one byte as a character. 66 /// However, simple experiments show that it matches a Unicode character, 67 /// e.g. attempting to parse `"\u{4141}"` yields one CJK character and exhausts the input, 68 /// as opposed to returning `A` and leaving another `A` in the input. 69 pub fn parse_safe_char(string: &str) -> ParserResult<'_, char> { 70 fn is_safe(ch: &char) -> bool { 71 is_char_supported(*ch) 72 } 73 verify(anychar, is_safe)(string) 74 } 75 } 76 77 impl Sanitizer { 78 /// End-of-input parser. 79 /// 80 /// Yields `()` if the parser is at the end of the input; an error otherwise. 81 fn eoi(string: &str) -> ParserResult<'_, ()> { 82 match string.is_empty() { 83 true => Ok((string, ())), 84 false => { 85 Err(nom::Err::Error(VerboseError { errors: vec![(string, VerboseErrorKind::Nom(ErrorKind::Eof))] })) 86 } 87 } 88 } 89 90 /// A parser that accepts: 91 /// - A newline, either `CR LF` or just `LF`. 92 /// - The end of input. 93 fn eol(string: &str) -> ParserResult<'_, ()> { 94 alt(( 95 Self::eoi, // this one goes first because it’s very cheap 96 value((), line_ending), 97 ))(string) 98 } 99 100 /// Apply the `f` parser until `g` succeeds. Both parsers consume the input. 101 fn till<'a, A, B, F, G>(mut f: F, mut g: G) -> impl FnMut(&'a str) -> ParserResult<'a, ()> 102 where 103 F: FnMut(&'a str) -> ParserResult<'a, A>, 104 G: FnMut(&'a str) -> ParserResult<'a, B>, 105 { 106 move |mut i| loop { 107 if let Ok((i2, _)) = g(i) { 108 break Ok((i2, ())); 109 } 110 111 let (i2, _) = f(i)?; 112 i = i2; 113 } 114 } 115 116 /// Parse a string until the end of line. 117 /// 118 /// This parser accepts the multiline annotation (`\ LF`) to break the string on several lines. 119 /// 120 /// The line may end with a newline (either `CR LF` or just `LF`), or it may end with the input. 121 /// 122 /// Return the body of the comment, i.e. what is between `//` and the end of line. 123 /// If the line ends with `CR LF`, the `CR` is included in the returned body. 124 /// The `LF`, if present, is never included in the returned body. 125 fn str_till_eol(string: &str) -> ParserResult<'_, &str> { 126 // A heuristic approach is applied here in order to avoid costly parsing operations in the 127 // most common scenarios: non-parsing methods are used to verify if the string has multiple 128 // lines and if there are any unsafe characters. 129 if let Some((before, after)) = string.split_once('\n') { 130 let is_multiline = before.ends_with('\\'); // is `LF` preceded by `\`? 131 132 if !is_multiline { 133 let contains_unsafe_chars = !before.chars().all(is_char_supported); 134 135 if !contains_unsafe_chars { 136 Ok((after, before)) 137 } else { 138 // `eoi` is used here instead of `eol`, since the earlier call to `split_once` 139 // already removed the `LF`. This will fail at the first unsafe character, 140 // which is known to exist because we are under the condition contains_unsafe_chars. 141 recognize(Self::till(value((), Sanitizer::parse_safe_char), Self::eoi))(before) 142 } 143 } else { 144 map( 145 recognize(Self::till( 146 alt((value((), tag("\\\n")), value((), Sanitizer::parse_safe_char))), 147 Self::eol, 148 )), 149 |i| { 150 // Exclude the final `LF`, if any, from the comment body. 151 if i.as_bytes().last() == Some(&b'\n') { 152 &i[0..i.len() - 1] 153 } else { 154 i 155 } 156 }, 157 )(string) 158 } 159 } else if string.chars().all(is_char_supported) { 160 // There is no `LF`. We return all the characters up to the end of file. 161 Ok(("", string)) 162 } else { 163 // `eoi` is used here because we are under the condition that there is no newline. 164 // This will fail at the first unsafe character, which is known to exist because 165 // we are under the condition that not all characters are safe. 166 recognize(Self::till(value((), Sanitizer::parse_safe_char), Self::eoi))(string) 167 } 168 } 169 170 /// Parse a string until `*/` is encountered. 171 /// 172 /// This is used to parse the body of a block comment, after the opening `/*`. 173 /// 174 /// Return the body of the comment, i.e. what is between `/*` and `*/`. 175 fn str_till_star_slash(string: &str) -> ParserResult<'_, &str> { 176 map(recognize(Self::till(value((), Sanitizer::parse_safe_char), tag("*/"))), |i| { 177 &i[0..i.len() - 2] // subtract 2 to discard the closing `*/` 178 })(string) 179 } 180 181 /// A version of many0 that discards the result of the parser, preventing allocating. 182 fn many0_<'a, A, F>(mut f: F) -> impl FnMut(&'a str) -> ParserResult<'a, ()> 183 where 184 F: FnMut(&'a str) -> ParserResult<'a, A>, 185 { 186 move |string| fold_many0(&mut f, || (), |_, _| ())(string) 187 } 188 } 189 190 #[cfg(test)] 191 mod tests { 192 use super::*; 193 194 #[test] 195 fn test_parse_safe_char() { 196 // test correct acceptance of ASCII and non-ASCII: 197 assert_eq!(("", 'A'), Sanitizer::parse_safe_char("A").unwrap()); 198 assert_eq!((" and more", 'A'), Sanitizer::parse_safe_char("A and more").unwrap()); 199 assert_eq!(("", '\u{4141}'), Sanitizer::parse_safe_char("\u{4141}").unwrap()); 200 assert_eq!((" and more", '\u{4141}'), Sanitizer::parse_safe_char("\u{4141} and more").unwrap()); 201 202 // test rejection and acceptance of ASCII control characters: 203 assert!(Sanitizer::parse_safe_char("\x00").is_err()); 204 assert!(Sanitizer::parse_safe_char("\x01").is_err()); 205 assert!(Sanitizer::parse_safe_char("\x02").is_err()); 206 assert!(Sanitizer::parse_safe_char("\x03").is_err()); 207 assert!(Sanitizer::parse_safe_char("\x04").is_err()); 208 assert!(Sanitizer::parse_safe_char("\x05").is_err()); 209 assert!(Sanitizer::parse_safe_char("\x06").is_err()); 210 assert!(Sanitizer::parse_safe_char("\x07").is_err()); 211 assert!(Sanitizer::parse_safe_char("\x08").is_err()); 212 assert!(Sanitizer::parse_safe_char("\x09").is_ok()); 213 assert!(Sanitizer::parse_safe_char("\x0a").is_ok()); 214 assert!(Sanitizer::parse_safe_char("\x0b").is_err()); 215 assert!(Sanitizer::parse_safe_char("\x0c").is_err()); 216 assert!(Sanitizer::parse_safe_char("\x0d").is_ok()); 217 assert!(Sanitizer::parse_safe_char("\x0e").is_err()); 218 assert!(Sanitizer::parse_safe_char("\x0f").is_err()); 219 assert!(Sanitizer::parse_safe_char("\x10").is_err()); 220 assert!(Sanitizer::parse_safe_char("\x11").is_err()); 221 assert!(Sanitizer::parse_safe_char("\x12").is_err()); 222 assert!(Sanitizer::parse_safe_char("\x13").is_err()); 223 assert!(Sanitizer::parse_safe_char("\x14").is_err()); 224 assert!(Sanitizer::parse_safe_char("\x15").is_err()); 225 assert!(Sanitizer::parse_safe_char("\x16").is_err()); 226 assert!(Sanitizer::parse_safe_char("\x17").is_err()); 227 assert!(Sanitizer::parse_safe_char("\x18").is_err()); 228 assert!(Sanitizer::parse_safe_char("\x19").is_err()); 229 assert!(Sanitizer::parse_safe_char("\x1a").is_err()); 230 assert!(Sanitizer::parse_safe_char("\x1b").is_err()); 231 assert!(Sanitizer::parse_safe_char("\x1c").is_err()); 232 assert!(Sanitizer::parse_safe_char("\x1d").is_err()); 233 assert!(Sanitizer::parse_safe_char("\x1e").is_err()); 234 assert!(Sanitizer::parse_safe_char("\x1f").is_err()); 235 assert!(Sanitizer::parse_safe_char("\x7f").is_err()); 236 237 // test rejection of bidi characters, and acceptance of the ones just above/below: 238 assert!(Sanitizer::parse_safe_char("\u{2029}").is_ok()); 239 assert!(Sanitizer::parse_safe_char("\u{202a}").is_err()); 240 assert!(Sanitizer::parse_safe_char("\u{202b}").is_err()); 241 assert!(Sanitizer::parse_safe_char("\u{202c}").is_err()); 242 assert!(Sanitizer::parse_safe_char("\u{202d}").is_err()); 243 assert!(Sanitizer::parse_safe_char("\u{202e}").is_err()); 244 assert!(Sanitizer::parse_safe_char("\u{202f}").is_ok()); 245 assert!(Sanitizer::parse_safe_char("\u{2065}").is_ok()); 246 assert!(Sanitizer::parse_safe_char("\u{2066}").is_err()); 247 assert!(Sanitizer::parse_safe_char("\u{2067}").is_err()); 248 assert!(Sanitizer::parse_safe_char("\u{2068}").is_err()); 249 assert!(Sanitizer::parse_safe_char("\u{2069}").is_err()); 250 assert!(Sanitizer::parse_safe_char("\u{206a}").is_ok()); 251 } 252 253 #[test] 254 fn test_sanitize() { 255 // Whitespaces 256 assert_eq!(("hello world", ""), Sanitizer::parse("hello world").unwrap()); 257 assert_eq!(("hello world", ""), Sanitizer::parse(" hello world").unwrap()); 258 assert_eq!(("hello world", ""), Sanitizer::parse(" hello world").unwrap()); 259 assert_eq!(("hello world", ""), Sanitizer::parse("\nhello world").unwrap()); 260 assert_eq!(("hello world", ""), Sanitizer::parse(" \nhello world").unwrap()); 261 assert_eq!(("hello world ", ""), Sanitizer::parse("hello world ").unwrap()); 262 263 // Comments 264 assert_eq!(("hello world", "// hello\n"), Sanitizer::parse("// hello\nhello world").unwrap()); 265 assert_eq!(("hello world", "/* hello */"), Sanitizer::parse("/* hello */hello world").unwrap()); 266 assert_eq!(("hello world", "/* hello */\n"), Sanitizer::parse("/* hello */\nhello world").unwrap()); 267 assert_eq!(("hello world", "/** hello */"), Sanitizer::parse("/** hello */hello world").unwrap()); 268 assert_eq!(("hello world", "/** hello */\n"), Sanitizer::parse("/** hello */\nhello world").unwrap()); 269 assert_eq!(("/\nhello world", ""), Sanitizer::parse("/\nhello world").unwrap()); 270 271 // Whitespaces and comments 272 assert_eq!(("hello world", "// hello\n"), Sanitizer::parse(" \n// hello\nhello world").unwrap()); 273 assert_eq!(("hello world", "/* hello */\n"), Sanitizer::parse(" \n /* hello */\nhello world").unwrap()); 274 assert_eq!(("hello world", "/** hello */\n"), Sanitizer::parse(" \n\t /** hello */\nhello world").unwrap()); 275 assert_eq!(("/\nhello world", ""), Sanitizer::parse(" /\nhello world").unwrap()); 276 } 277 278 #[test] 279 fn test_whitespaces() { 280 assert_eq!(("hello world", ""), Sanitizer::parse_whitespaces("hello world").unwrap()); 281 assert_eq!(("hello world", " "), Sanitizer::parse_whitespaces(" hello world").unwrap()); 282 assert_eq!(("hello world", " "), Sanitizer::parse_whitespaces(" hello world").unwrap()); 283 assert_eq!(("hello world", "\n"), Sanitizer::parse_whitespaces("\nhello world").unwrap()); 284 assert_eq!(("hello world", " \n"), Sanitizer::parse_whitespaces(" \nhello world").unwrap()); 285 assert_eq!(("hello world", "\t"), Sanitizer::parse_whitespaces("\thello world").unwrap()); 286 assert_eq!(("hello world", " \t"), Sanitizer::parse_whitespaces(" \thello world").unwrap()); 287 assert_eq!(("hello world", " \n\t"), Sanitizer::parse_whitespaces(" \n\thello world").unwrap()); 288 assert_eq!(("hello world ", ""), Sanitizer::parse_whitespaces("hello world ").unwrap()); 289 } 290 291 #[test] 292 fn test_comments() { 293 assert_eq!(("hello world", "// hello\n"), Sanitizer::parse_comments("// hello\nhello world").unwrap()); 294 assert_eq!(("hello world", "/* hello */\n"), Sanitizer::parse_comments("/* hello */\nhello world").unwrap()); 295 assert_eq!(("hello world", "/** hello */\n"), Sanitizer::parse_comments("/** hello */\nhello world").unwrap()); 296 assert_eq!(("/\nhello world", ""), Sanitizer::parse_comments("/\nhello world").unwrap()); 297 assert_eq!( 298 ("hello world", "// hel\u{4141}lo\n"), 299 Sanitizer::parse_comments("// hel\u{4141}lo\nhello world").unwrap() 300 ); 301 assert_eq!( 302 ("hello world", "/* multi\n line comment\n*/\n"), 303 Sanitizer::parse_comments("/* multi\n line comment\n*/\nhello world").unwrap() 304 ); 305 assert_eq!( 306 ("hello world", "// multiple\n// line\n// comments\n"), 307 Sanitizer::parse_comments("// multiple\n// line\n// comments\nhello world").unwrap() 308 ); 309 assert_eq!( 310 ("hello world", "/* multi\n line comment\n*/\n/* and\n another\n one\n*/\n"), 311 Sanitizer::parse_comments("/* multi\n line comment\n*/\n/* and\n another\n one\n*/\nhello world") 312 .unwrap() 313 ); 314 assert_eq!( 315 ("hello world", "/* multi\n line comment\n*/\n// two single\n// line comments\n/* and\n another\n multi-liner\n*/\n"), 316 Sanitizer::parse_comments("/* multi\n line comment\n*/\n// two single\n// line comments\n/* and\n another\n multi-liner\n*/\nhello world").unwrap() 317 ); 318 assert!(Sanitizer::parse_comments("// hel\x08lo\nhello world").is_err()); 319 assert!(Sanitizer::parse_comments("// hel\u{2066}lo\nhello world").is_err()); 320 assert!(Sanitizer::parse_comments("/* hel\x7flo */\nhello world").is_err()); 321 assert!(Sanitizer::parse_comments("/* hel\u{202d}lo */\nhello world").is_err()); 322 assert!(Sanitizer::parse_comments("/** hel\x00lo */\nhello world").is_err()); 323 assert!(Sanitizer::parse_comments("/** hel\u{202a}lo */\nhello world").is_err()); 324 assert!(Sanitizer::parse_comments("// unsafe \u{202a} no newline").is_err()); 325 } 326 }