parse_string.rs
1 // Copyright (c) 2025-2026 ACDC Network 2 // This file is part of the alphavm library. 3 // 4 // Alpha Chain | Delta Chain Protocol 5 // International Monetary Graphite. 6 // 7 // Derived from Aleo (https://aleo.org) and ProvableHQ (https://provable.com). 8 // They built world-class ZK infrastructure. We installed the EASY button. 9 // Their cryptography: elegant. Our modifications: bureaucracy-compatible. 10 // Original brilliance: theirs. Robert's Rules: ours. Bugs: definitely ours. 11 // 12 // Original Aleo/ProvableHQ code subject to Apache 2.0 https://www.apache.org/licenses/LICENSE-2.0 13 // All modifications and new work: CC0 1.0 Universal Public Domain Dedication. 14 // No rights reserved. No permission required. No warranty. No refunds. 15 // 16 // https://creativecommons.org/publicdomain/zero/1.0/ 17 // SPDX-License-Identifier: CC0-1.0 18 19 /// From <https://github.com/Geal/nom/blob/main/examples/string.rs> 20 pub mod string_parser { 21 /// This example shows an example of how to parse an escaped string. The 22 /// rules for the string are similar to JSON and rust. A string is: 23 /// 24 /// - Enclosed by double quotes 25 /// - Can contain any raw unescaped code point besides \ and " 26 /// - Matches the following escape sequences: \b, \f, \n, \r, \t, \", \\, \/ 27 /// - Matches code points like Rust: \u{XXXX}, where XXXX can be up to 6 28 /// hex characters 29 /// - an escape followed by whitespace consumes all whitespace between the 30 /// escape and the next non-whitespace character 31 use nom::{ 32 branch::alt, 33 bytes::streaming::{is_not, take_while_m_n}, 34 character::streaming::{char, multispace1}, 35 combinator::{map, map_opt, map_res, value, verify}, 36 error::{ErrorKind, FromExternalError, ParseError}, 37 multi::fold_many0, 38 sequence::{delimited, preceded}, 39 Err::Error, 40 IResult, 41 }; 42 43 /// Checks for supported code points. 44 /// 45 /// We regard the following characters as safe: 46 /// - Horizontal tab (code 9). 47 /// - Line feed (code 10). 48 /// - Carriage return (code 13). 49 /// - Space (code 32). 50 /// - Visible ASCII (codes 33-126). 51 /// - Non-ASCII Unicode scalar values (codes 128+) except 52 /// * bidi embeddings, overrides and their termination (codes U+202A-U+202E) 53 /// * isolates (codes U+2066-U+2069) 54 /// 55 /// The Unicode bidi characters are well-known for presenting Trojan Source dangers. 56 /// The ASCII backspace (code 8) can be also used to make text look different from what it is, 57 /// and a similar danger may apply to delete (126). 58 /// Other ASCII control characters 59 /// (except for horizontal tab, space, line feed, and carriage return, which are allowed) 60 /// may or may not present dangers, but we see no good reason for allowing them. 61 /// At some point we may want to disallow additional non-ASCII characters, 62 /// if we see no good reason to allow them. 63 /// 64 /// Note that we say 'Unicode scalar values' above, 65 /// because we read UTF-8-decoded characters, 66 /// and thus we will never encounter surrogate code points, 67 /// and we do not need to explicitly exclude them in this function. 68 pub fn is_char_supported(c: char) -> bool { 69 !is_char_unsupported(c) 70 } 71 72 /// Checks for unsupported "invisible" code points. 73 fn is_char_unsupported(c: char) -> bool { 74 let code = c as u32; 75 76 // A quick early return, as anything above is supported. 77 if code > 0x2069 { 78 return false; 79 } 80 81 // A "divide and conquer" approach for greater performance; ranges are 82 // checked before single values and all the comparisons get "reused". 83 if code < 0x202a { 84 if code <= 31 { 85 !(9..14).contains(&code) || code == 11 || code == 12 86 } else { 87 code == 127 88 } 89 } else { 90 code <= 0x202e || code >= 0x2066 91 } 92 } 93 94 /// Parse a Unicode sequence, of the form u{XXXX}, where XXXX is 1 to 6 95 /// hexadecimal numerals. We will combine this later with [parse_escaped_char] 96 /// to parse sequences like \u{00AC}. 97 fn parse_unicode<'a, E>(input: &'a str) -> IResult<&'a str, char, E> 98 where 99 E: ParseError<&'a str> + FromExternalError<&'a str, std::num::ParseIntError>, 100 { 101 // `take_while_m_n` parses between `m` and `n` bytes (inclusive) that match 102 // a predicate. `parse_hex` here parses between 1 and 6 hexadecimal numerals. 103 let parse_hex = take_while_m_n(1, 6, |c: char| c.is_ascii_hexdigit()); 104 105 // `preceded` takes a prefix parser, and if it succeeds, returns the result 106 // of the body parser. In this case, it parses u{XXXX}. 107 let parse_delimited_hex = preceded( 108 char('u'), 109 // `delimited` is like `preceded`, but it parses both a prefix and a suffix. 110 // It returns the result of the middle parser. In this case, it parses 111 // {XXXX}, where XXXX is 1 to 6 hex numerals, and returns XXXX 112 delimited(char('{'), parse_hex, char('}')), 113 ); 114 115 // `map_res` takes the result of a parser and applies a function that returns 116 // a Result. In this case we take the hex bytes from parse_hex and attempt to 117 // convert them to a u32. 118 let parse_u32 = map_res(parse_delimited_hex, move |hex| u32::from_str_radix(hex, 16)); 119 120 // map_opt is like map_res, but it takes an Option instead of a Result. If 121 // the function returns None, map_opt returns an error. In this case, because 122 // not all u32 values are valid Unicode code points, we have to fallibly 123 // convert to char with from_u32. 124 map_opt(parse_u32, std::char::from_u32)(input) 125 } 126 127 /// Parse an escaped character: \n, \t, \r, \u{00AC}, etc. 128 fn parse_escaped_char<'a, E>(input: &'a str) -> IResult<&'a str, char, E> 129 where 130 E: ParseError<&'a str> + FromExternalError<&'a str, std::num::ParseIntError>, 131 { 132 preceded( 133 char('\\'), 134 // `alt` tries each parser in sequence, returning the result of 135 // the first successful match 136 alt(( 137 parse_unicode, 138 // The `value` parser returns a fixed value (the first argument) if its 139 // parser (the second argument) succeeds. In these cases, it looks for 140 // the marker characters (n, r, t, etc.) and returns the matching 141 // character (\n, \r, \t, etc.). 142 value('\n', char('n')), 143 value('\r', char('r')), 144 value('\t', char('t')), 145 value('\u{08}', char('b')), 146 value('\u{0C}', char('f')), 147 value('\\', char('\\')), 148 value('/', char('/')), 149 value('"', char('"')), 150 )), 151 )(input) 152 } 153 154 /// Parse a backslash, followed by any amount of whitespace. This is used later 155 /// to discard any escaped whitespace. 156 fn parse_escaped_whitespace<'a, E: ParseError<&'a str>>(input: &'a str) -> IResult<&'a str, &'a str, E> { 157 preceded(char('\\'), multispace1)(input) 158 } 159 160 /// Parse a non-empty block of text that doesn't include \ or " 161 fn parse_literal<'a, E: ParseError<&'a str>>(input: &'a str) -> IResult<&'a str, &'a str, E> { 162 // Return an error if the literal contains an unsupported code point. 163 if input.chars().any(is_char_unsupported) { 164 return Err(Error(E::from_error_kind("String literal contains invalid codepoint", ErrorKind::Char))); 165 } 166 167 // `is_not` parses a string of 0 or more characters that aren't one of the 168 // given characters. 169 let not_quote_slash = is_not("\"\\"); 170 171 // `verify` runs a parser, then runs a verification function on the output of 172 // the parser. The verification function accepts output only if it 173 // returns true. In this case, we want to ensure that the output of is_not 174 // is non-empty. 175 verify(not_quote_slash, |s: &str| !s.is_empty())(input) 176 } 177 178 /// A string fragment contains a fragment of a string being parsed: either 179 /// a non-empty Literal (a series of non-escaped characters), a single 180 /// parsed escaped character, or a block of escaped whitespace. 181 #[derive(Debug, Clone, Copy, PartialEq, Eq)] 182 enum StringFragment<'a> { 183 Literal(&'a str), 184 EscapedChar(char), 185 EscapedWS, 186 } 187 188 /// Combine parse_literal, parse_escaped_whitespace, and parse_escaped_char 189 /// into a StringFragment. 190 fn parse_fragment<'a, E>(input: &'a str) -> IResult<&'a str, StringFragment<'a>, E> 191 where 192 E: ParseError<&'a str> + FromExternalError<&'a str, std::num::ParseIntError>, 193 { 194 alt(( 195 // The `map` combinator runs a parser, then applies a function to the output 196 // of that parser. 197 map(parse_literal, StringFragment::Literal), 198 map(parse_escaped_char, StringFragment::EscapedChar), 199 value(StringFragment::EscapedWS, parse_escaped_whitespace), 200 ))(input) 201 } 202 203 /// Parse a string. Use a loop of parse_fragment and push all of the fragments 204 /// into an output string. 205 pub fn parse_string<'a, E>(input: &'a str) -> IResult<&'a str, String, E> 206 where 207 E: ParseError<&'a str> + FromExternalError<&'a str, std::num::ParseIntError>, 208 { 209 // fold_many0 is the equivalent of iterator::fold. It runs a parser in a loop, 210 // and for each output value, calls a folding function on each output value. 211 let build_string = fold_many0( 212 // Our parser function– parses a single string fragment 213 parse_fragment, 214 // Our init value, an empty string 215 String::new, 216 // Our folding function. For each fragment, append the fragment to the 217 // string. 218 |mut string, fragment| { 219 match fragment { 220 StringFragment::Literal(s) => string.push_str(s), 221 StringFragment::EscapedChar(c) => string.push(c), 222 StringFragment::EscapedWS => {} 223 } 224 string 225 }, 226 ); 227 228 // Finally, parse the string. Note that, if `build_string` could accept a raw 229 // " character, the closing delimiter " would never match. When using 230 // `delimited` with a looping parser (like fold_many0), be sure that the 231 // loop won't accidentally match your closing delimiter! 232 delimited(char('"'), build_string, char('"'))(input) 233 } 234 } 235 236 #[test] 237 fn test_parse_string() { 238 // to use parse_string_wrapper instead of string_parser::parse_string::<nom::error::VerboseError<&str>> in the tests below: 239 fn parse_string_wrapper(input: &str) -> crate::ParserResult<'_, String> { 240 string_parser::parse_string(input) 241 } 242 243 // tests some correct string literals: 244 assert_eq!(("", String::from("")), parse_string_wrapper("\"\"").unwrap()); 245 assert_eq!(("", String::from("abc")), parse_string_wrapper("\"abc\"").unwrap()); 246 assert_eq!((" and more", String::from("abc")), parse_string_wrapper("\"abc\" and more").unwrap()); 247 assert_eq!(("", String::from("\r")), parse_string_wrapper("\"\r\"").unwrap()); 248 assert_eq!(("", String::from("4\u{4141}x\x09")), parse_string_wrapper("\"4\u{4141}x\x09\"").unwrap()); 249 250 // test rejection of disallowed characters: 251 assert!(parse_string_wrapper("\"hel\x08lo\"").is_err()); 252 assert!(parse_string_wrapper("\"hel\x1flo\"").is_err()); 253 assert!(parse_string_wrapper("\"hel\u{2069}lo\"").is_err()); 254 }