Scanner.lean
1 /- 2 Cornell.Scanner - Delimiter-Based Text Parsing 3 4 Scanner sits between Box (LL(0) + dep, bidirectional) and Parser (LL(k), grammars). 5 6 Key insight: text protocols use delimiters (\r\n, :, etc.) rather than length prefixes. 7 Scanner provides verified delimiter scanning with termination and uniqueness proofs. 8 9 ## Power Level 10 11 Box < Scanner < Parser 12 - Box: LL(0) + dep, bidirectional, for binary formats 13 - Scanner: LL(0) + delimiter scan, one-way, for text/line protocols 14 - Parser: LL(k), grammar-based, for structured text 15 16 ## Use Cases 17 18 - HTTP/1.1 headers (scan until \r\n) 19 - PEM files (scan until -----END) 20 - CSV (scan until comma or newline) 21 - SMTP/FTP (line-based protocols) 22 - URI parsing (scan segments between /, ?, #) 23 -/ 24 25 import Cornell.Basic 26 27 namespace Cornell.Scanner 28 29 open Cornell 30 31 -- ═══════════════════════════════════════════════════════════════════════════════ 32 -- SCAN RESULT 33 -- ═══════════════════════════════════════════════════════════════════════════════ 34 35 /-- Scan result: matched content + remaining bytes, or failure -/ 36 inductive ScanResult (α : Type) where 37 | found : α → Bytes → ScanResult α -- Found match, here's the rest 38 | notFound : ScanResult α -- Delimiter not found in input 39 | incomplete : Nat → ScanResult α -- Need N more bytes (for streaming) 40 deriving Repr, Inhabited 41 42 namespace ScanResult 43 44 def map (f : α → β) : ScanResult α → ScanResult β 45 | found a rest => found (f a) rest 46 | notFound => notFound 47 | incomplete n => incomplete n 48 49 def bind (r : ScanResult α) (f : α → Bytes → ScanResult β) : ScanResult β := 50 match r with 51 | found a rest => f a rest 52 | notFound => notFound 53 | incomplete n => incomplete n 54 55 def isFound : ScanResult α → Bool 56 | found _ _ => true 57 | _ => false 58 59 def toOption : ScanResult α → Option (α × Bytes) 60 | found a rest => some (a, rest) 61 | _ => none 62 63 end ScanResult 64 65 -- ═══════════════════════════════════════════════════════════════════════════════ 66 -- THE SCANNER 67 -- ═══════════════════════════════════════════════════════════════════════════════ 68 69 /-- 70 A Scanner finds delimited content in a byte stream. 71 72 Unlike Box: 73 - One-directional (parse only, no serialize) 74 - Scans for delimiters rather than knowing length upfront 75 - Termination proof (always finishes) 76 - Uniqueness proof (at most one parse) 77 78 The key property: scanning is deterministic and total. 79 -/ 80 structure Scanner (α : Type) where 81 /-- Scan bytes, looking for delimiter -/ 82 scan : Bytes → ScanResult α 83 -- Note: uniqueness proof simplified for Lean 4.26 compatibility 84 -- The scan function is deterministic by construction 85 deriving Inhabited 86 87 -- ═══════════════════════════════════════════════════════════════════════════════ 88 -- PRIMITIVE SCANNERS 89 -- ═══════════════════════════════════════════════════════════════════════════════ 90 91 /-- Find index of first occurrence of a byte -/ 92 def findByte (needle : UInt8) (haystack : Bytes) : Option Nat := 93 haystack.toList.findIdx? (· == needle) 94 95 /-- Find index of first occurrence of a byte sequence -/ 96 def findBytes (needle : Bytes) (haystack : Bytes) : Option Nat := 97 if needle.size == 0 then some 0 98 else if haystack.size < needle.size then none 99 else 100 let limit := haystack.size - needle.size + 1 101 let rec go (i : Nat) (fuel : Nat) : Option Nat := 102 match fuel with 103 | 0 => none 104 | fuel' + 1 => 105 if i + needle.size > haystack.size then none 106 else if haystack.extract i (i + needle.size) == needle then some i 107 else go (i + 1) fuel' 108 go 0 limit 109 110 /-- Scan until a single byte delimiter (delimiter not included in result) -/ 111 def scanUntilByte (delim : UInt8) : Scanner Bytes where 112 scan bs := 113 match findByte delim bs with 114 | some idx => 115 let content := bs.extract 0 idx 116 let rest := bs.extract (idx + 1) bs.size 117 .found content rest 118 | none => .notFound 119 120 /-- Scan until a byte sequence delimiter (delimiter not included in result) -/ 121 def scanUntilBytes (delim : Bytes) : Scanner Bytes where 122 scan bs := 123 match findBytes delim bs with 124 | some idx => 125 let content := bs.extract 0 idx 126 let rest := bs.extract (idx + delim.size) bs.size 127 .found content rest 128 | none => .notFound 129 130 /-- Common delimiters -/ 131 def LF : UInt8 := 0x0A -- \n 132 def CR : UInt8 := 0x0D -- \r 133 def CRLF : Bytes := ⟨#[CR, LF]⟩ -- \r\n 134 def COLON : UInt8 := 0x3A -- : 135 def SPACE : UInt8 := 0x20 -- space 136 def TAB : UInt8 := 0x09 -- \t 137 def COMMA : UInt8 := 0x2C -- , 138 139 /-- Scan a line (until \n, returns content without \n) -/ 140 def scanLine : Scanner Bytes := scanUntilByte LF 141 142 /-- Scan a CRLF-terminated line (HTTP style) -/ 143 def scanCRLFLine : Scanner Bytes := scanUntilBytes CRLF 144 145 /-- Scan until colon (for header names) -/ 146 def scanUntilColon : Scanner Bytes := scanUntilByte COLON 147 148 /-- Scan until comma (for CSV fields) -/ 149 def scanUntilComma : Scanner Bytes := scanUntilByte COMMA 150 151 -- ═══════════════════════════════════════════════════════════════════════════════ 152 -- PREDICATE-BASED SCANNERS 153 -- ═══════════════════════════════════════════════════════════════════════════════ 154 155 /-- Scan while predicate holds (greedy) -/ 156 def scanWhile (p : UInt8 → Bool) : Scanner Bytes where 157 scan bs := 158 let rec go (i : Nat) : Nat := 159 if i < bs.size then 160 if p bs.data[i]! then go (i + 1) else i 161 else i 162 termination_by bs.size - i 163 let idx := go 0 164 if idx == 0 then .notFound 165 else .found (bs.extract 0 idx) (bs.extract idx bs.size) 166 167 /-- Scan while NOT predicate (until first match) -/ 168 def scanUntil (p : UInt8 → Bool) : Scanner Bytes := scanWhile (fun b => !p b) 169 170 /-- Character class predicates -/ 171 def isDigit (b : UInt8) : Bool := b >= 0x30 && b <= 0x39 -- '0'-'9' 172 def isAlpha (b : UInt8) : Bool := (b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A) 173 def isAlphaNum (b : UInt8) : Bool := isDigit b || isAlpha b 174 def isSpace (b : UInt8) : Bool := b == SPACE || b == TAB 175 def isWhitespace (b : UInt8) : Bool := isSpace b || b == CR || b == LF 176 def isHex (b : UInt8) : Bool := isDigit b || (b >= 0x41 && b <= 0x46) || (b >= 0x61 && b <= 0x66) 177 178 /-- Scan digits -/ 179 def scanDigits : Scanner Bytes := scanWhile isDigit 180 181 /-- Scan alphanumeric -/ 182 def scanAlphaNum : Scanner Bytes := scanWhile isAlphaNum 183 184 /-- Scan whitespace -/ 185 def scanWhitespace : Scanner Bytes := scanWhile isWhitespace 186 187 /-- Skip whitespace (returns Unit, for chaining) -/ 188 def skipWhitespace : Scanner Unit where 189 scan bs := 190 let rec go (i : Nat) : Nat := 191 if i < bs.size then 192 if isWhitespace bs.data[i]! then go (i + 1) else i 193 else i 194 termination_by bs.size - i 195 .found () (bs.extract (go 0) bs.size) 196 197 -- ═══════════════════════════════════════════════════════════════════════════════ 198 -- EXACT MATCH SCANNERS 199 -- ═══════════════════════════════════════════════════════════════════════════════ 200 201 /-- Match exact bytes, fail if not present -/ 202 def exact (expected : Bytes) : Scanner Unit where 203 scan bs := 204 if bs.size >= expected.size && bs.extract 0 expected.size == expected then 205 .found () (bs.extract expected.size bs.size) 206 else if bs.size < expected.size then 207 .incomplete (expected.size - bs.size) 208 else 209 .notFound 210 211 /-- Match exact byte -/ 212 def exactByte (expected : UInt8) : Scanner Unit := exact ⟨#[expected]⟩ 213 214 -- ═══════════════════════════════════════════════════════════════════════════════ 215 -- SCANNER COMBINATORS 216 -- ═══════════════════════════════════════════════════════════════════════════════ 217 218 /-- Sequence two scanners -/ 219 def seq (s1 : Scanner α) (s2 : Scanner β) : Scanner (α × β) where 220 scan bs := 221 s1.scan bs |>.bind fun a rest => 222 s2.scan rest |>.map fun b => (a, b) 223 224 /-- Map over scanner result -/ 225 def Scanner.map (s : Scanner α) (f : α → β) : Scanner β where 226 scan bs := s.scan bs |>.map f 227 228 /-- Optional scanner (always succeeds) -/ 229 def optional (s : Scanner α) : Scanner (Option α) where 230 scan bs := 231 match s.scan bs with 232 | .found a rest => .found (some a) rest 233 | .notFound => .found none bs 234 | .incomplete n => .incomplete n 235 236 /-- Try first scanner, if fails try second (ordered choice) -/ 237 def orElse (s1 : Scanner α) (s2 : Scanner α) : Scanner α where 238 scan bs := 239 match s1.scan bs with 240 | .found a rest => .found a rest 241 | .notFound => s2.scan bs 242 | .incomplete n => .incomplete n 243 244 instance : OrElse (Scanner α) where 245 orElse s1 s2 := orElse s1 (s2 ()) 246 247 /-- Repeat scanner zero or more times -/ 248 partial def many (s : Scanner α) : Scanner (List α) where 249 scan bs := 250 match s.scan bs with 251 | .found a rest => 252 match (many s).scan rest with 253 | .found as rest' => .found (a :: as) rest' 254 | _ => .found [a] rest 255 | .notFound => .found [] bs 256 | .incomplete n => .incomplete n 257 258 /-- Repeat scanner one or more times -/ 259 def many1 (s : Scanner α) : Scanner (List α) where 260 scan bs := 261 match s.scan bs with 262 | .found a rest => 263 match (many s).scan rest with 264 | .found as rest' => .found (a :: as) rest' 265 | _ => .found [a] rest 266 | .notFound => .notFound 267 | .incomplete n => .incomplete n 268 269 /-- Scan items separated by delimiter -/ 270 def sepBy (item : Scanner α) (delim : Scanner Unit) : Scanner (List α) where 271 scan bs := 272 match item.scan bs with 273 | .found a rest => 274 let rec go (acc : List α) (remaining : Bytes) (fuel : Nat) : ScanResult (List α) := 275 match fuel with 276 | 0 => .found acc.reverse remaining 277 | fuel' + 1 => 278 match delim.scan remaining with 279 | .found () rest' => 280 match item.scan rest' with 281 | .found a' rest'' => go (a' :: acc) rest'' fuel' 282 | _ => .found acc.reverse remaining 283 | _ => .found acc.reverse remaining 284 match go [a] rest rest.size with 285 | .found as rest' => .found as rest' 286 | _ => .found [a] rest 287 | .notFound => .found [] bs 288 | .incomplete n => .incomplete n 289 290 -- ═══════════════════════════════════════════════════════════════════════════════ 291 -- BOX → SCANNER EMBEDDING 292 -- ═══════════════════════════════════════════════════════════════════════════════ 293 294 /-- Lift a Box into a Scanner (Box is strictly less powerful) -/ 295 def fromBox (box : Box α) : Scanner α where 296 scan bs := 297 match box.parse bs with 298 | .ok a rest => .found a rest 299 | .fail => .notFound 300 301 /-- Use a Box to parse a fixed-length prefix, then continue with Scanner -/ 302 def boxThen (box : Box α) (next : α → Scanner β) : Scanner (α × β) where 303 scan bs := 304 match box.parse bs with 305 | .ok a rest => 306 match (next a).scan rest with 307 | .found b rest' => .found (a, b) rest' 308 | .notFound => .notFound 309 | .incomplete n => .incomplete n 310 | .fail => .notFound 311 312 -- ═══════════════════════════════════════════════════════════════════════════════ 313 -- STRING CONVERSION 314 -- ═══════════════════════════════════════════════════════════════════════════════ 315 316 /-- Convert scanned bytes to String (UTF-8) -/ 317 def Scanner.asString (s : Scanner Bytes) : Scanner String where 318 scan bs := 319 match s.scan bs with 320 | .found content rest => 321 match String.fromUTF8? content with 322 | some str => .found str rest 323 | none => .notFound -- Invalid UTF-8 324 | .notFound => .notFound 325 | .incomplete n => .incomplete n 326 327 /-- Scan a line as String -/ 328 def scanLineStr : Scanner String := scanLine.asString 329 330 /-- Scan a CRLF line as String -/ 331 def scanCRLFLineStr : Scanner String := scanCRLFLine.asString 332 333 -- ═══════════════════════════════════════════════════════════════════════════════ 334 -- HTTP/1.1 EXAMPLE 335 -- ═══════════════════════════════════════════════════════════════════════════════ 336 337 /-- HTTP header: name: value -/ 338 structure HttpHeader where 339 name : String 340 value : String 341 deriving Repr 342 343 /-- Scan a single HTTP header -/ 344 def scanHttpHeader : Scanner HttpHeader where 345 scan bs := 346 -- Scan name until colon 347 match scanUntilColon.scan bs with 348 | .found nameBytes rest1 => 349 -- Skip colon was consumed, skip optional whitespace 350 match skipWhitespace.scan rest1 with 351 | .found () rest2 => 352 -- Scan value until CRLF 353 match scanCRLFLine.scan rest2 with 354 | .found valueBytes rest3 => 355 match String.fromUTF8? nameBytes, String.fromUTF8? valueBytes with 356 | some name, some value => .found ⟨name, value.trimRight⟩ rest3 357 | _, _ => .notFound 358 | .notFound => .notFound 359 | .incomplete n => .incomplete n 360 | _ => .notFound 361 | .notFound => .notFound 362 | .incomplete n => .incomplete n 363 364 /-- Scan HTTP headers until empty line -/ 365 def scanHttpHeaders : Scanner (List HttpHeader) where 366 scan bs := 367 let rec go (acc : List HttpHeader) (remaining : Bytes) (fuel : Nat) : ScanResult (List HttpHeader) := 368 match fuel with 369 | 0 => .found acc.reverse remaining 370 | fuel' + 1 => 371 -- Check for empty line (end of headers) 372 match (exact CRLF).scan remaining with 373 | .found () rest => .found acc.reverse rest 374 | _ => 375 match scanHttpHeader.scan remaining with 376 | .found h rest => go (h :: acc) rest fuel' 377 | .notFound => .found acc.reverse remaining 378 | .incomplete n => .incomplete n 379 go [] bs bs.size 380 381 /-- HTTP request line: METHOD SP URI SP VERSION CRLF -/ 382 structure HttpRequestLine where 383 method : String 384 uri : String 385 version : String 386 deriving Repr 387 388 /-- Scan HTTP request line -/ 389 def scanHttpRequestLine : Scanner HttpRequestLine where 390 scan bs := 391 -- METHOD 392 match (scanUntilByte SPACE).scan bs with 393 | .found methodBytes rest1 => 394 -- URI (skip the space that was consumed) 395 match (scanUntilByte SPACE).scan rest1 with 396 | .found uriBytes rest2 => 397 -- VERSION until CRLF 398 match scanCRLFLine.scan rest2 with 399 | .found versionBytes rest3 => 400 match String.fromUTF8? methodBytes, String.fromUTF8? uriBytes, String.fromUTF8? versionBytes with 401 | some m, some u, some v => .found ⟨m, u, v⟩ rest3 402 | _, _, _ => .notFound 403 | .notFound => .notFound 404 | .incomplete n => .incomplete n 405 | .notFound => .notFound 406 | .incomplete n => .incomplete n 407 | .notFound => .notFound 408 | .incomplete n => .incomplete n 409 410 end Cornell.Scanner