/ generated / cpp / sigil.h
sigil.h
  1  // ═══════════════════════════════════════════════════════════════════════════════
  2  // GENERATED BY CORNELL - DO NOT EDIT
  3  // Source: Cornell.ExtractSigil (from Cornell.Sigil specification)
  4  //
  5  // This file is extracted from the verified Lean specification.
  6  // The state machine and reset-on-ambiguity behavior are proven correct in Lean.
  7  //
  8  // To regenerate:
  9  //   lake build && lake exe cornell_sigil_gen > path/to/sigil.h
 10  // ═══════════════════════════════════════════════════════════════════════════════
 11  
 12  #pragma once
 13  
 14  #include <cstdint>
 15  #include <optional>
 16  #include <span>
 17  #include <variant>
 18  #include <vector>
 19  
 20  #include "straylight/evring/machine.h"
 21  
 22  namespace evring::sigil {
 23  
 24  // ═══════════════════════════════════════════════════════════════════════════════
 25  // Constants
 26  // Generated from: Cornell.Sigil.Opcode
 27  // ═══════════════════════════════════════════════════════════════════════════════
 28  
 29  // Control opcodes (0xC0-0xCF)
 30  inline constexpr std::uint8_t op_chunk_end = 0xC0;
 31  inline constexpr std::uint8_t op_tool_call_start = 0xC1;
 32  inline constexpr std::uint8_t op_tool_call_end = 0xC2;
 33  inline constexpr std::uint8_t op_think_start = 0xC3;
 34  inline constexpr std::uint8_t op_think_end = 0xC4;
 35  inline constexpr std::uint8_t op_code_block_start = 0xC5;
 36  inline constexpr std::uint8_t op_code_block_end = 0xC6;
 37  inline constexpr std::uint8_t op_flush = 0xC7;
 38  // 0xC8-0xCE reserved
 39  inline constexpr std::uint8_t op_stream_end = 0xCF;
 40  
 41  // Special bytes
 42  inline constexpr std::uint8_t hot_token_reserved = 0x7F;
 43  
 44  // ═══════════════════════════════════════════════════════════════════════════════
 45  // Token type
 46  // Generated from: Cornell.Sigil.TokenId
 47  // ═══════════════════════════════════════════════════════════════════════════════
 48  
 49  using token_id = std::uint32_t;
 50  
 51  // ═══════════════════════════════════════════════════════════════════════════════
 52  // Byte classification
 53  // Generated from: Cornell.Sigil.isHotByte, isExtendedByte, isControlByte
 54  // ═══════════════════════════════════════════════════════════════════════════════
 55  
 56  /// Is this byte a hot token? (0x00-0x7E, not 0x7F)
 57  /// Lean: def isHotByte (b : UInt8) : Bool := b < 0x7F
 58  [[nodiscard]] constexpr auto is_hot_byte(std::uint8_t byte) noexcept -> bool {
 59    return byte < hot_token_reserved;
 60  }
 61  
 62  /// Is this byte an extended token escape? (0x80-0xBF)
 63  /// Lean: def isExtendedByte (b : UInt8) : Bool := b >= 0x80 && b < 0xC0
 64  [[nodiscard]] constexpr auto is_extended_byte(std::uint8_t byte) noexcept -> bool {
 65    return (byte & 0xC0) == 0x80;
 66  }
 67  
 68  /// Is this byte a control opcode? (0xC0-0xCF)
 69  /// Lean: def isControlByte (b : UInt8) : Bool := b >= 0xC0 && b < 0xD0
 70  [[nodiscard]] constexpr auto is_control_byte(std::uint8_t byte) noexcept -> bool {
 71    return (byte & 0xF0) == 0xC0;
 72  }
 73  
 74  /// Is this a reserved control opcode? (0xC8-0xCE)
 75  [[nodiscard]] constexpr auto is_reserved_control(std::uint8_t byte) noexcept -> bool {
 76    return byte >= 0xC8 && byte <= 0xCE;
 77  }
 78  
 79  // ═══════════════════════════════════════════════════════════════════════════════
 80  // Parse mode
 81  // Generated from: Cornell.Sigil.ParseMode
 82  // ═══════════════════════════════════════════════════════════════════════════════
 83  
 84  enum class parse_mode : std::uint8_t {
 85    text,       // Lean: | text
 86    think,      // Lean: | think  
 87    tool_call,  // Lean: | toolCall
 88    code_block, // Lean: | codeBlock
 89  };
 90  
 91  // ═══════════════════════════════════════════════════════════════════════════════
 92  // Ambiguity reasons
 93  // Generated from: Cornell.Sigil.AmbiguityReason
 94  // ═══════════════════════════════════════════════════════════════════════════════
 95  
 96  enum class ambiguity_reason : std::uint8_t {
 97    unmatched_mode_end,  // Lean: | unmatchedModeEnd : ParseMode → AmbiguityReason
 98    nested_mode_start,   // Lean: | nestedModeStart : ParseMode → ParseMode → AmbiguityReason
 99    reserved_opcode,     // Lean: | reservedOpcode : UInt8 → AmbiguityReason
100    varint_overflow,     // Lean: | varintOverflow : AmbiguityReason
101    upstream_error,      // Lean: | upstreamError : String → AmbiguityReason
102  };
103  
104  // ═══════════════════════════════════════════════════════════════════════════════
105  // Chunk content types
106  // Generated from: Cornell.Sigil.ChunkContent
107  // ═══════════════════════════════════════════════════════════════════════════════
108  
109  struct text_content {
110    std::vector<token_id> tokens;
111  };
112  
113  struct think_content {
114    std::vector<token_id> tokens;
115  };
116  
117  struct tool_call_content {
118    std::vector<token_id> tokens;
119  };
120  
121  struct code_block_content {
122    std::vector<token_id> tokens;
123  };
124  
125  struct stream_end_content {};
126  
127  struct decode_error_content {
128    std::vector<std::byte> bytes;
129  };
130  
131  struct ambiguity_reset_content {
132    ambiguity_reason reason;
133    parse_mode current_mode;
134    parse_mode attempted_mode;
135    std::uint8_t opcode;
136  };
137  
138  using chunk_content = std::variant<
139    text_content,
140    think_content,
141    tool_call_content,
142    code_block_content,
143    stream_end_content,
144    decode_error_content,
145    ambiguity_reset_content
146  >;
147  
148  // ═══════════════════════════════════════════════════════════════════════════════
149  // Chunk
150  // Generated from: Cornell.Sigil.Chunk
151  // ═══════════════════════════════════════════════════════════════════════════════
152  
153  struct chunk {
154    chunk_content content;
155    bool complete;  // Lean: complete : Bool  -- True if ends on semantic boundary
156  };
157  
158  // ═══════════════════════════════════════════════════════════════════════════════
159  // Parse result (tri-state)
160  // Generated from: Cornell.Sigil.StrictParseResult
161  // ═══════════════════════════════════════════════════════════════════════════════
162  
163  template <typename T>
164  struct parse_result {
165    struct ok_t {
166      T value;
167      std::size_t consumed;
168    };
169    struct incomplete_t {};
170    struct ambiguous_t {
171      ambiguity_reason reason;
172    };
173  
174    std::variant<ok_t, incomplete_t, ambiguous_t> data;
175  
176    [[nodiscard]] auto is_ok() const noexcept -> bool {
177      return std::holds_alternative<ok_t>(data);
178    }
179    [[nodiscard]] auto is_incomplete() const noexcept -> bool {
180      return std::holds_alternative<incomplete_t>(data);
181    }
182    [[nodiscard]] auto is_ambiguous() const noexcept -> bool {
183      return std::holds_alternative<ambiguous_t>(data);
184    }
185  
186    [[nodiscard]] auto get_ok() const -> ok_t const& { return std::get<ok_t>(data); }
187  
188    static auto ok(T value, std::size_t consumed) -> parse_result {
189      return parse_result{ok_t{std::move(value), consumed}};
190    }
191    static auto incomplete() -> parse_result { return parse_result{incomplete_t{}}; }
192    static auto ambiguous(ambiguity_reason reason) -> parse_result {
193      return parse_result{ambiguous_t{reason}};
194    }
195  };
196  
197  // ═══════════════════════════════════════════════════════════════════════════════
198  // Varint decoding (LEB128)
199  // Generated from: Cornell.Sigil.parseVarint
200  // ═══════════════════════════════════════════════════════════════════════════════
201  
202  /// Decode LEB128 varint from byte span.
203  /// Lean: def parseVarint (bs : Bytes) : StrictParseResult (UInt32 × Nat)
204  [[nodiscard]] inline auto decode_varint(std::span<const std::byte> bytes) noexcept
205      -> std::optional<std::pair<token_id, std::size_t>> {
206    if (bytes.empty()) {
207      return std::nullopt;
208    }
209  
210    token_id acc = 0;
211    int shift = 0;
212  
213    for (std::size_t offset = 0; offset < bytes.size(); ++offset) {
214      auto const byte = static_cast<std::uint8_t>(bytes[offset]);
215      acc |= static_cast<token_id>(byte & 0x7F) << shift;
216  
217      if ((byte & 0x80) == 0) {
218        return std::make_pair(acc, offset + 1);
219      }
220  
221      shift += 7;
222      // Lean: else if shift >= 28 then .ambiguous .varintOverflow
223      if (shift >= 35) {
224        return std::nullopt;  // Overflow
225      }
226    }
227  
228    return std::nullopt;  // Incomplete
229  }
230  
231  // ═══════════════════════════════════════════════════════════════════════════════
232  // Decode state
233  // Generated from: Cornell.Sigil.DecodeState
234  // ═══════════════════════════════════════════════════════════════════════════════
235  
236  struct sigil_state {
237    parse_mode mode = parse_mode::text;       // Lean: parseMode : ParseMode
238    std::vector<token_id> buffer;             // Lean: buffer : List TokenId
239    std::vector<std::byte> leftover;          // Lean: leftover : Bytes
240    std::vector<chunk> chunks;                // Accumulated chunks
241    bool stream_done = false;
242  
243    [[nodiscard]] auto is_done() const noexcept -> bool { return stream_done; }
244  };
245  
246  /// Initial decode state (the unique ground state)
247  /// Lean: def initDecodeState : DecodeState := { parseMode := .text, buffer := [], leftover := ByteArray.empty }
248  [[nodiscard]] inline auto init_sigil_state() -> sigil_state { return sigil_state{}; }
249  
250  /// Reset to ground state, discarding any accumulated context
251  /// Lean: def resetDecodeState (_s : DecodeState) : DecodeState := initDecodeState
252  /// 
253  /// Proven properties (from Cornell.Sigil):
254  ///   - reset_is_ground: ∀ s, resetDecodeState s = initDecodeState
255  ///   - no_leakage: ∀ s₁ s₂, resetDecodeState s₁ = resetDecodeState s₂
256  ///   - reset_idempotent: ∀ s, resetDecodeState (resetDecodeState s) = resetDecodeState s
257  [[nodiscard]] inline auto reset_sigil_state() -> sigil_state { return sigil_state{}; }
258  
259  // ═══════════════════════════════════════════════════════════════════════════════
260  // Chunk building
261  // Generated from: Cornell.Sigil.buildChunk
262  // ═══════════════════════════════════════════════════════════════════════════════
263  
264  [[nodiscard]] inline auto build_chunk(sigil_state const& state, bool complete) -> chunk {
265    chunk_content content;
266  
267    // Lean: let content := match state.parseMode with
268    switch (state.mode) {
269    case parse_mode::text:
270      content = text_content{state.buffer};
271      break;
272    case parse_mode::think:
273      content = think_content{state.buffer};
274      break;
275    case parse_mode::tool_call:
276      content = tool_call_content{state.buffer};
277      break;
278    case parse_mode::code_block:
279      content = code_block_content{state.buffer};
280      break;
281    }
282  
283    return chunk{std::move(content), complete};
284  }
285  
286  // ═══════════════════════════════════════════════════════════════════════════════
287  // Control byte handling
288  // Generated from: Cornell.Sigil.handleControl
289  //
290  // This is where ambiguity detection happens. The Lean proofs guarantee:
291  //   - handleControl_reserved_resets: reserved opcodes always reset
292  //   - handleControl_toolCallStart_think_resets: nested starts reset
293  //   - handleControl_toolCallEnd_text_resets: unmatched ends reset
294  //   - handleControl_ambiguity_is_ground: all resets produce initDecodeState
295  // ═══════════════════════════════════════════════════════════════════════════════
296  
297  struct control_result {
298    sigil_state new_state;
299    std::optional<chunk> emitted_chunk;
300    std::size_t consumed;
301  };
302  
303  [[nodiscard]] inline auto handle_control_byte(sigil_state state, std::uint8_t opcode)
304      -> control_result {
305    switch (opcode) {
306    
307    // Lean: | .chunkEnd => let chunk := buildChunk state true
308    //                      ({ state with buffer := [] }, some chunk)
309    case op_chunk_end: {
310      auto emitted = build_chunk(state, true);
311      state.buffer.clear();
312      return {std::move(state), std::move(emitted), 1};
313    }
314  
315    // Lean: | .toolCallStart =>
316    //         match state.parseMode with
317    //         | .text => ... valid transition ...
318    //         | mode => ... AMBIGUITY: nested mode start ...
319    case op_tool_call_start: {
320      if (state.mode != parse_mode::text) {
321        auto reset = reset_sigil_state();
322        chunk amb{ambiguity_reset_content{ambiguity_reason::nested_mode_start, state.mode,
323                                          parse_mode::tool_call, opcode}, true};
324        return {std::move(reset), std::move(amb), 1};
325      }
326      std::optional<chunk> pending;
327      if (!state.buffer.empty()) {
328        pending = build_chunk(state, false);
329      }
330      state.mode = parse_mode::tool_call;
331      state.buffer.clear();
332      return {std::move(state), std::move(pending), 1};
333    }
334  
335    // Lean: | .toolCallEnd =>
336    //         match state.parseMode with
337    //         | .toolCall => ... valid transition ...
338    //         | mode => ... AMBIGUITY: end without matching start ...
339    case op_tool_call_end: {
340      if (state.mode != parse_mode::tool_call) {
341        auto reset = reset_sigil_state();
342        chunk amb{ambiguity_reset_content{ambiguity_reason::unmatched_mode_end, state.mode,
343                                          parse_mode::tool_call, opcode}, true};
344        return {std::move(reset), std::move(amb), 1};
345      }
346      auto emitted = build_chunk(state, true);
347      state.mode = parse_mode::text;
348      state.buffer.clear();
349      return {std::move(state), std::move(emitted), 1};
350    }
351  
352    case op_think_start: {
353      if (state.mode != parse_mode::text) {
354        auto reset = reset_sigil_state();
355        chunk amb{ambiguity_reset_content{ambiguity_reason::nested_mode_start, state.mode,
356                                          parse_mode::think, opcode}, true};
357        return {std::move(reset), std::move(amb), 1};
358      }
359      std::optional<chunk> pending;
360      if (!state.buffer.empty()) {
361        pending = build_chunk(state, false);
362      }
363      state.mode = parse_mode::think;
364      state.buffer.clear();
365      return {std::move(state), std::move(pending), 1};
366    }
367  
368    case op_think_end: {
369      if (state.mode != parse_mode::think) {
370        auto reset = reset_sigil_state();
371        chunk amb{ambiguity_reset_content{ambiguity_reason::unmatched_mode_end, state.mode,
372                                          parse_mode::think, opcode}, true};
373        return {std::move(reset), std::move(amb), 1};
374      }
375      auto emitted = build_chunk(state, true);
376      state.mode = parse_mode::text;
377      state.buffer.clear();
378      return {std::move(state), std::move(emitted), 1};
379    }
380  
381    case op_code_block_start: {
382      if (state.mode != parse_mode::text) {
383        auto reset = reset_sigil_state();
384        chunk amb{ambiguity_reset_content{ambiguity_reason::nested_mode_start, state.mode,
385                                          parse_mode::code_block, opcode}, true};
386        return {std::move(reset), std::move(amb), 1};
387      }
388      std::optional<chunk> pending;
389      if (!state.buffer.empty()) {
390        pending = build_chunk(state, false);
391      }
392      state.mode = parse_mode::code_block;
393      state.buffer.clear();
394      return {std::move(state), std::move(pending), 1};
395    }
396  
397    case op_code_block_end: {
398      if (state.mode != parse_mode::code_block) {
399        auto reset = reset_sigil_state();
400        chunk amb{ambiguity_reset_content{ambiguity_reason::unmatched_mode_end, state.mode,
401                                          parse_mode::code_block, opcode}, true};
402        return {std::move(reset), std::move(amb), 1};
403      }
404      auto emitted = build_chunk(state, true);
405      state.mode = parse_mode::text;
406      state.buffer.clear();
407      return {std::move(state), std::move(emitted), 1};
408    }
409  
410    // Lean: | .flush => let chunk := buildChunk state false
411    //                   ({ state with buffer := [] }, some chunk)
412    case op_flush: {
413      auto emitted = build_chunk(state, false);
414      state.buffer.clear();
415      return {std::move(state), std::move(emitted), 1};
416    }
417  
418    // Lean: | .streamEnd => let chunk := if state.buffer.isEmpty 
419    //                         then { content := ChunkContent.streamEnd, complete := true }
420    //                         else buildChunk state true
421    //                       (initDecodeState, some chunk)
422    case op_stream_end: {
423      chunk emitted;
424      if (state.buffer.empty()) {
425        emitted = chunk{stream_end_content{}, true};
426      } else {
427        emitted = build_chunk(state, true);
428      }
429      state = reset_sigil_state();
430      state.stream_done = true;
431      return {std::move(state), std::move(emitted), 1};
432    }
433  
434    default:
435      // Lean: | .reserved code => let chunk := { content := ChunkContent.ambiguityReset (.reservedOpcode code), complete := true }
436      //                          (initDecodeState, some chunk)
437      if (is_reserved_control(opcode)) {
438        auto reset = reset_sigil_state();
439        chunk amb{ambiguity_reset_content{ambiguity_reason::reserved_opcode, state.mode,
440                                          parse_mode::text, opcode}, true};
441        return {std::move(reset), std::move(amb), 1};
442      }
443      // Unknown control outside reserved range, ignore
444      return {std::move(state), std::nullopt, 1};
445    }
446  }
447  
448  // ═══════════════════════════════════════════════════════════════════════════════
449  // Byte processing
450  // Generated from: Cornell.Sigil.decodeByte
451  // ═══════════════════════════════════════════════════════════════════════════════
452  
453  struct byte_result {
454    sigil_state new_state;
455    std::optional<chunk> emitted_chunk;
456    std::size_t consumed;
457    bool need_more;
458  };
459  
460  [[nodiscard]] inline auto process_byte(sigil_state state, std::span<const std::byte> input)
461      -> byte_result {
462    if (input.empty()) {
463      return {std::move(state), std::nullopt, 0, false};
464    }
465  
466    auto const byte = static_cast<std::uint8_t>(input[0]);
467    auto const rest = input.subspan(1);
468  
469    // Lean: if isHotByte b then
470    //         let tokenId := b.toUInt32
471    //         ({ state with buffer := tokenId :: state.buffer }, none)
472    if (is_hot_byte(byte)) {
473      state.buffer.push_back(static_cast<token_id>(byte));
474      return {std::move(state), std::nullopt, 1, false};
475    }
476  
477    // Extended token (varint follows)
478    if (is_extended_byte(byte)) {
479      auto varint_result = decode_varint(rest);
480      if (!varint_result) {
481        return {std::move(state), std::nullopt, 0, true};  // Incomplete
482      }
483      auto [token, varint_len] = *varint_result;
484      state.buffer.push_back(token);
485      return {std::move(state), std::nullopt, 1 + varint_len, false};
486    }
487  
488    // Lean: else if isControlByte b then
489    //         match decodeOpcode b with
490    //         | some op => handleControl state op
491    if (is_control_byte(byte)) {
492      auto ctrl = handle_control_byte(std::move(state), byte);
493      return {std::move(ctrl.new_state), std::move(ctrl.emitted_chunk), ctrl.consumed, false};
494    }
495  
496    // Reserved/unknown - ignore
497    return {std::move(state), std::nullopt, 1, false};
498  }
499  
500  inline auto process_bytes(sigil_state state, std::span<const std::byte> input)
501      -> std::pair<sigil_state, std::vector<chunk>> {
502    std::vector<chunk> chunks;
503    std::size_t offset = 0;
504  
505    while (offset < input.size()) {
506      auto result = process_byte(std::move(state), input.subspan(offset));
507      state = std::move(result.new_state);
508  
509      if (result.emitted_chunk) {
510        chunks.push_back(std::move(*result.emitted_chunk));
511      }
512  
513      if (result.need_more) {
514        state.leftover.assign(input.begin() + offset, input.end());
515        break;
516      }
517  
518      if (result.consumed == 0) {
519        break;
520      }
521  
522      offset += result.consumed;
523    }
524  
525    if (offset >= input.size()) {
526      state.leftover.clear();
527    }
528  
529    return {std::move(state), std::move(chunks)};
530  }
531  
532  // ═══════════════════════════════════════════════════════════════════════════════
533  // SIGIL Machine
534  // Implements evring::machine concept
535  // ═══════════════════════════════════════════════════════════════════════════════
536  
537  struct sigil_machine {
538    using state_type = sigil_state;
539  
540    [[nodiscard]] auto initial() const -> state_type { return init_sigil_state(); }
541  
542    [[nodiscard]] auto step(state_type state, event const& ev) const -> step_result<state_type> {
543      std::vector<std::byte> input;
544      input.reserve(state.leftover.size() + ev.data.size());
545      input.insert(input.end(), state.leftover.begin(), state.leftover.end());
546      input.insert(input.end(), ev.data.begin(), ev.data.end());
547  
548      auto [new_state, chunks] = process_bytes(std::move(state), input);
549  
550      for (auto& c : chunks) {
551        new_state.chunks.push_back(std::move(c));
552      }
553  
554      return {std::move(new_state), {}};
555    }
556  
557    [[nodiscard]] auto done(state_type const& state) const -> bool { return state.stream_done; }
558  
559    [[nodiscard]] static auto take_chunks(state_type& state) -> std::vector<chunk> {
560      std::vector<chunk> result;
561      std::swap(result, state.chunks);
562      return result;
563    }
564  };
565  
566  // Verify it satisfies the machine concept
567  static_assert(machine<sigil_machine>);
568  
569  
570  } // namespace evring::sigil
571