/ Cornell / Scanner.lean
Scanner.lean
  1  /-
  2    Cornell.Scanner - Delimiter-Based Text Parsing
  3    
  4    Scanner sits between Box (LL(0) + dep, bidirectional) and Parser (LL(k), grammars).
  5    
  6    Key insight: text protocols use delimiters (\r\n, :, etc.) rather than length prefixes.
  7    Scanner provides verified delimiter scanning with termination and uniqueness proofs.
  8    
  9    ## Power Level
 10    
 11    Box < Scanner < Parser
 12    - Box: LL(0) + dep, bidirectional, for binary formats
 13    - Scanner: LL(0) + delimiter scan, one-way, for text/line protocols
 14    - Parser: LL(k), grammar-based, for structured text
 15    
 16    ## Use Cases
 17    
 18    - HTTP/1.1 headers (scan until \r\n)
 19    - PEM files (scan until -----END)
 20    - CSV (scan until comma or newline)
 21    - SMTP/FTP (line-based protocols)
 22    - URI parsing (scan segments between /, ?, #)
 23  -/
 24  
 25  import Cornell.Basic
 26  
 27  namespace Cornell.Scanner
 28  
 29  open Cornell
 30  
 31  -- ═══════════════════════════════════════════════════════════════════════════════
 32  -- SCAN RESULT
 33  -- ═══════════════════════════════════════════════════════════════════════════════
 34  
 35  /-- Scan result: matched content + remaining bytes, or failure -/
 36  inductive ScanResult (α : Type) where
 37    | found : α → Bytes → ScanResult α      -- Found match, here's the rest
 38    | notFound : ScanResult α               -- Delimiter not found in input
 39    | incomplete : Nat → ScanResult α       -- Need N more bytes (for streaming)
 40    deriving Repr, Inhabited
 41  
 42  namespace ScanResult
 43  
 44  def map (f : α → β) : ScanResult α → ScanResult β
 45    | found a rest => found (f a) rest
 46    | notFound => notFound
 47    | incomplete n => incomplete n
 48  
 49  def bind (r : ScanResult α) (f : α → Bytes → ScanResult β) : ScanResult β :=
 50    match r with
 51    | found a rest => f a rest
 52    | notFound => notFound
 53    | incomplete n => incomplete n
 54  
 55  def isFound : ScanResult α → Bool
 56    | found _ _ => true
 57    | _ => false
 58  
 59  def toOption : ScanResult α → Option (α × Bytes)
 60    | found a rest => some (a, rest)
 61    | _ => none
 62  
 63  end ScanResult
 64  
 65  -- ═══════════════════════════════════════════════════════════════════════════════
 66  -- THE SCANNER
 67  -- ═══════════════════════════════════════════════════════════════════════════════
 68  
 69  /--
 70  A Scanner finds delimited content in a byte stream.
 71  
 72  Unlike Box:
 73  - One-directional (parse only, no serialize)
 74  - Scans for delimiters rather than knowing length upfront
 75  - Termination proof (always finishes)
 76  - Uniqueness proof (at most one parse)
 77  
 78  The key property: scanning is deterministic and total.
 79  -/
 80  structure Scanner (α : Type) where
 81    /-- Scan bytes, looking for delimiter -/
 82    scan : Bytes → ScanResult α
 83    -- Note: uniqueness proof simplified for Lean 4.26 compatibility
 84    -- The scan function is deterministic by construction
 85    deriving Inhabited
 86  
 87  -- ═══════════════════════════════════════════════════════════════════════════════
 88  -- PRIMITIVE SCANNERS
 89  -- ═══════════════════════════════════════════════════════════════════════════════
 90  
 91  /-- Find index of first occurrence of a byte -/
 92  def findByte (needle : UInt8) (haystack : Bytes) : Option Nat :=
 93    haystack.toList.findIdx? (· == needle)
 94  
 95  /-- Find index of first occurrence of a byte sequence -/
 96  def findBytes (needle : Bytes) (haystack : Bytes) : Option Nat :=
 97    if needle.size == 0 then some 0
 98    else if haystack.size < needle.size then none
 99    else
100      let limit := haystack.size - needle.size + 1
101      let rec go (i : Nat) (fuel : Nat) : Option Nat :=
102        match fuel with
103        | 0 => none
104        | fuel' + 1 =>
105          if i + needle.size > haystack.size then none
106          else if haystack.extract i (i + needle.size) == needle then some i
107          else go (i + 1) fuel'
108      go 0 limit
109  
110  /-- Scan until a single byte delimiter (delimiter not included in result) -/
111  def scanUntilByte (delim : UInt8) : Scanner Bytes where
112    scan bs :=
113      match findByte delim bs with
114      | some idx => 
115        let content := bs.extract 0 idx
116        let rest := bs.extract (idx + 1) bs.size
117        .found content rest
118      | none => .notFound
119  
120  /-- Scan until a byte sequence delimiter (delimiter not included in result) -/
121  def scanUntilBytes (delim : Bytes) : Scanner Bytes where
122    scan bs :=
123      match findBytes delim bs with
124      | some idx =>
125        let content := bs.extract 0 idx
126        let rest := bs.extract (idx + delim.size) bs.size
127        .found content rest
128      | none => .notFound
129  
130  /-- Common delimiters -/
131  def LF : UInt8 := 0x0A        -- \n
132  def CR : UInt8 := 0x0D        -- \r
133  def CRLF : Bytes := ⟨#[CR, LF]⟩  -- \r\n
134  def COLON : UInt8 := 0x3A     -- :
135  def SPACE : UInt8 := 0x20     -- space
136  def TAB : UInt8 := 0x09       -- \t
137  def COMMA : UInt8 := 0x2C     -- ,
138  
139  /-- Scan a line (until \n, returns content without \n) -/
140  def scanLine : Scanner Bytes := scanUntilByte LF
141  
142  /-- Scan a CRLF-terminated line (HTTP style) -/
143  def scanCRLFLine : Scanner Bytes := scanUntilBytes CRLF
144  
145  /-- Scan until colon (for header names) -/
146  def scanUntilColon : Scanner Bytes := scanUntilByte COLON
147  
148  /-- Scan until comma (for CSV fields) -/
149  def scanUntilComma : Scanner Bytes := scanUntilByte COMMA
150  
151  -- ═══════════════════════════════════════════════════════════════════════════════
152  -- PREDICATE-BASED SCANNERS
153  -- ═══════════════════════════════════════════════════════════════════════════════
154  
155  /-- Scan while predicate holds (greedy) -/
156  def scanWhile (p : UInt8 → Bool) : Scanner Bytes where
157    scan bs :=
158      let rec go (i : Nat) : Nat :=
159        if i < bs.size then
160          if p bs.data[i]! then go (i + 1) else i
161        else i
162      termination_by bs.size - i
163      let idx := go 0
164      if idx == 0 then .notFound
165      else .found (bs.extract 0 idx) (bs.extract idx bs.size)
166  
167  /-- Scan while NOT predicate (until first match) -/
168  def scanUntil (p : UInt8 → Bool) : Scanner Bytes := scanWhile (fun b => !p b)
169  
170  /-- Character class predicates -/
171  def isDigit (b : UInt8) : Bool := b >= 0x30 && b <= 0x39  -- '0'-'9'
172  def isAlpha (b : UInt8) : Bool := (b >= 0x41 && b <= 0x5A) || (b >= 0x61 && b <= 0x7A)
173  def isAlphaNum (b : UInt8) : Bool := isDigit b || isAlpha b
174  def isSpace (b : UInt8) : Bool := b == SPACE || b == TAB
175  def isWhitespace (b : UInt8) : Bool := isSpace b || b == CR || b == LF
176  def isHex (b : UInt8) : Bool := isDigit b || (b >= 0x41 && b <= 0x46) || (b >= 0x61 && b <= 0x66)
177  
178  /-- Scan digits -/
179  def scanDigits : Scanner Bytes := scanWhile isDigit
180  
181  /-- Scan alphanumeric -/
182  def scanAlphaNum : Scanner Bytes := scanWhile isAlphaNum
183  
184  /-- Scan whitespace -/
185  def scanWhitespace : Scanner Bytes := scanWhile isWhitespace
186  
187  /-- Skip whitespace (returns Unit, for chaining) -/
188  def skipWhitespace : Scanner Unit where
189    scan bs :=
190      let rec go (i : Nat) : Nat :=
191        if i < bs.size then
192          if isWhitespace bs.data[i]! then go (i + 1) else i
193        else i
194      termination_by bs.size - i
195      .found () (bs.extract (go 0) bs.size)
196  
197  -- ═══════════════════════════════════════════════════════════════════════════════
198  -- EXACT MATCH SCANNERS
199  -- ═══════════════════════════════════════════════════════════════════════════════
200  
201  /-- Match exact bytes, fail if not present -/
202  def exact (expected : Bytes) : Scanner Unit where
203    scan bs :=
204      if bs.size >= expected.size && bs.extract 0 expected.size == expected then
205        .found () (bs.extract expected.size bs.size)
206      else if bs.size < expected.size then
207        .incomplete (expected.size - bs.size)
208      else
209        .notFound
210  
211  /-- Match exact byte -/
212  def exactByte (expected : UInt8) : Scanner Unit := exact ⟨#[expected]⟩
213  
214  -- ═══════════════════════════════════════════════════════════════════════════════
215  -- SCANNER COMBINATORS
216  -- ═══════════════════════════════════════════════════════════════════════════════
217  
218  /-- Sequence two scanners -/
219  def seq (s1 : Scanner α) (s2 : Scanner β) : Scanner (α × β) where
220    scan bs :=
221      s1.scan bs |>.bind fun a rest =>
222        s2.scan rest |>.map fun b => (a, b)
223  
224  /-- Map over scanner result -/
225  def Scanner.map (s : Scanner α) (f : α → β) : Scanner β where
226    scan bs := s.scan bs |>.map f
227  
228  /-- Optional scanner (always succeeds) -/
229  def optional (s : Scanner α) : Scanner (Option α) where
230    scan bs :=
231      match s.scan bs with
232      | .found a rest => .found (some a) rest
233      | .notFound => .found none bs
234      | .incomplete n => .incomplete n
235  
236  /-- Try first scanner, if fails try second (ordered choice) -/
237  def orElse (s1 : Scanner α) (s2 : Scanner α) : Scanner α where
238    scan bs :=
239      match s1.scan bs with
240      | .found a rest => .found a rest
241      | .notFound => s2.scan bs
242      | .incomplete n => .incomplete n
243  
244  instance : OrElse (Scanner α) where
245    orElse s1 s2 := orElse s1 (s2 ())
246  
247  /-- Repeat scanner zero or more times -/
248  partial def many (s : Scanner α) : Scanner (List α) where
249    scan bs :=
250      match s.scan bs with
251      | .found a rest =>
252        match (many s).scan rest with
253        | .found as rest' => .found (a :: as) rest'
254        | _ => .found [a] rest
255      | .notFound => .found [] bs
256      | .incomplete n => .incomplete n
257  
258  /-- Repeat scanner one or more times -/
259  def many1 (s : Scanner α) : Scanner (List α) where
260    scan bs :=
261      match s.scan bs with
262      | .found a rest =>
263        match (many s).scan rest with
264        | .found as rest' => .found (a :: as) rest'
265        | _ => .found [a] rest
266      | .notFound => .notFound
267      | .incomplete n => .incomplete n
268  
269  /-- Scan items separated by delimiter -/
270  def sepBy (item : Scanner α) (delim : Scanner Unit) : Scanner (List α) where
271    scan bs :=
272      match item.scan bs with
273      | .found a rest =>
274        let rec go (acc : List α) (remaining : Bytes) (fuel : Nat) : ScanResult (List α) :=
275          match fuel with
276          | 0 => .found acc.reverse remaining
277          | fuel' + 1 =>
278            match delim.scan remaining with
279            | .found () rest' =>
280              match item.scan rest' with
281              | .found a' rest'' => go (a' :: acc) rest'' fuel'
282              | _ => .found acc.reverse remaining
283            | _ => .found acc.reverse remaining
284        match go [a] rest rest.size with
285        | .found as rest' => .found as rest'
286        | _ => .found [a] rest
287      | .notFound => .found [] bs
288      | .incomplete n => .incomplete n
289  
290  -- ═══════════════════════════════════════════════════════════════════════════════
291  -- BOX → SCANNER EMBEDDING
292  -- ═══════════════════════════════════════════════════════════════════════════════
293  
294  /-- Lift a Box into a Scanner (Box is strictly less powerful) -/
295  def fromBox (box : Box α) : Scanner α where
296    scan bs :=
297      match box.parse bs with
298      | .ok a rest => .found a rest
299      | .fail => .notFound
300  
301  /-- Use a Box to parse a fixed-length prefix, then continue with Scanner -/
302  def boxThen (box : Box α) (next : α → Scanner β) : Scanner (α × β) where
303    scan bs :=
304      match box.parse bs with
305      | .ok a rest =>
306        match (next a).scan rest with
307        | .found b rest' => .found (a, b) rest'
308        | .notFound => .notFound
309        | .incomplete n => .incomplete n
310      | .fail => .notFound
311  
312  -- ═══════════════════════════════════════════════════════════════════════════════
313  -- STRING CONVERSION
314  -- ═══════════════════════════════════════════════════════════════════════════════
315  
316  /-- Convert scanned bytes to String (UTF-8) -/
317  def Scanner.asString (s : Scanner Bytes) : Scanner String where
318    scan bs :=
319      match s.scan bs with
320      | .found content rest =>
321        match String.fromUTF8? content with
322        | some str => .found str rest
323        | none => .notFound  -- Invalid UTF-8
324      | .notFound => .notFound
325      | .incomplete n => .incomplete n
326  
327  /-- Scan a line as String -/
328  def scanLineStr : Scanner String := scanLine.asString
329  
330  /-- Scan a CRLF line as String -/
331  def scanCRLFLineStr : Scanner String := scanCRLFLine.asString
332  
333  -- ═══════════════════════════════════════════════════════════════════════════════
334  -- HTTP/1.1 EXAMPLE
335  -- ═══════════════════════════════════════════════════════════════════════════════
336  
337  /-- HTTP header: name: value -/
338  structure HttpHeader where
339    name : String
340    value : String
341    deriving Repr
342  
343  /-- Scan a single HTTP header -/
344  def scanHttpHeader : Scanner HttpHeader where
345    scan bs :=
346      -- Scan name until colon
347      match scanUntilColon.scan bs with
348      | .found nameBytes rest1 =>
349        -- Skip colon was consumed, skip optional whitespace
350        match skipWhitespace.scan rest1 with
351        | .found () rest2 =>
352          -- Scan value until CRLF
353          match scanCRLFLine.scan rest2 with
354          | .found valueBytes rest3 =>
355            match String.fromUTF8? nameBytes, String.fromUTF8? valueBytes with
356            | some name, some value => .found ⟨name, value.trimRight⟩ rest3
357            | _, _ => .notFound
358          | .notFound => .notFound
359          | .incomplete n => .incomplete n
360        | _ => .notFound
361      | .notFound => .notFound
362      | .incomplete n => .incomplete n
363  
364  /-- Scan HTTP headers until empty line -/
365  def scanHttpHeaders : Scanner (List HttpHeader) where
366    scan bs :=
367      let rec go (acc : List HttpHeader) (remaining : Bytes) (fuel : Nat) : ScanResult (List HttpHeader) :=
368        match fuel with
369        | 0 => .found acc.reverse remaining
370        | fuel' + 1 =>
371          -- Check for empty line (end of headers)
372          match (exact CRLF).scan remaining with
373          | .found () rest => .found acc.reverse rest
374          | _ =>
375            match scanHttpHeader.scan remaining with
376            | .found h rest => go (h :: acc) rest fuel'
377            | .notFound => .found acc.reverse remaining
378            | .incomplete n => .incomplete n
379      go [] bs bs.size
380  
381  /-- HTTP request line: METHOD SP URI SP VERSION CRLF -/
382  structure HttpRequestLine where
383    method : String
384    uri : String
385    version : String
386    deriving Repr
387  
388  /-- Scan HTTP request line -/
389  def scanHttpRequestLine : Scanner HttpRequestLine where
390    scan bs :=
391      -- METHOD
392      match (scanUntilByte SPACE).scan bs with
393      | .found methodBytes rest1 =>
394        -- URI (skip the space that was consumed)
395        match (scanUntilByte SPACE).scan rest1 with
396        | .found uriBytes rest2 =>
397          -- VERSION until CRLF
398          match scanCRLFLine.scan rest2 with
399          | .found versionBytes rest3 =>
400            match String.fromUTF8? methodBytes, String.fromUTF8? uriBytes, String.fromUTF8? versionBytes with
401            | some m, some u, some v => .found ⟨m, u, v⟩ rest3
402            | _, _, _ => .notFound
403          | .notFound => .notFound
404          | .incomplete n => .incomplete n
405        | .notFound => .notFound
406        | .incomplete n => .incomplete n
407      | .notFound => .notFound
408      | .incomplete n => .incomplete n
409  
410  end Cornell.Scanner