sigil.h
1 // ═══════════════════════════════════════════════════════════════════════════════ 2 // GENERATED BY CORNELL - DO NOT EDIT 3 // Source: Cornell.ExtractSigil (from Cornell.Sigil specification) 4 // 5 // This file is extracted from the verified Lean specification. 6 // The state machine and reset-on-ambiguity behavior are proven correct in Lean. 7 // 8 // To regenerate: 9 // lake build && lake exe cornell_sigil_gen > path/to/sigil.h 10 // ═══════════════════════════════════════════════════════════════════════════════ 11 12 #pragma once 13 14 #include <cstdint> 15 #include <optional> 16 #include <span> 17 #include <variant> 18 #include <vector> 19 20 #include "straylight/evring/machine.h" 21 22 namespace evring::sigil { 23 24 // ═══════════════════════════════════════════════════════════════════════════════ 25 // Constants 26 // Generated from: Cornell.Sigil.Opcode 27 // ═══════════════════════════════════════════════════════════════════════════════ 28 29 // Control opcodes (0xC0-0xCF) 30 inline constexpr std::uint8_t op_chunk_end = 0xC0; 31 inline constexpr std::uint8_t op_tool_call_start = 0xC1; 32 inline constexpr std::uint8_t op_tool_call_end = 0xC2; 33 inline constexpr std::uint8_t op_think_start = 0xC3; 34 inline constexpr std::uint8_t op_think_end = 0xC4; 35 inline constexpr std::uint8_t op_code_block_start = 0xC5; 36 inline constexpr std::uint8_t op_code_block_end = 0xC6; 37 inline constexpr std::uint8_t op_flush = 0xC7; 38 // 0xC8-0xCE reserved 39 inline constexpr std::uint8_t op_stream_end = 0xCF; 40 41 // Special bytes 42 inline constexpr std::uint8_t hot_token_reserved = 0x7F; 43 44 // ═══════════════════════════════════════════════════════════════════════════════ 45 // Token type 46 // Generated from: Cornell.Sigil.TokenId 47 // ═══════════════════════════════════════════════════════════════════════════════ 48 49 using token_id = std::uint32_t; 50 51 // ═══════════════════════════════════════════════════════════════════════════════ 52 // Byte classification 53 // Generated from: Cornell.Sigil.isHotByte, isExtendedByte, isControlByte 54 // ═══════════════════════════════════════════════════════════════════════════════ 55 56 /// Is this byte a hot token? (0x00-0x7E, not 0x7F) 57 /// Lean: def isHotByte (b : UInt8) : Bool := b < 0x7F 58 [[nodiscard]] constexpr auto is_hot_byte(std::uint8_t byte) noexcept -> bool { 59 return byte < hot_token_reserved; 60 } 61 62 /// Is this byte an extended token escape? (0x80-0xBF) 63 /// Lean: def isExtendedByte (b : UInt8) : Bool := b >= 0x80 && b < 0xC0 64 [[nodiscard]] constexpr auto is_extended_byte(std::uint8_t byte) noexcept -> bool { 65 return (byte & 0xC0) == 0x80; 66 } 67 68 /// Is this byte a control opcode? (0xC0-0xCF) 69 /// Lean: def isControlByte (b : UInt8) : Bool := b >= 0xC0 && b < 0xD0 70 [[nodiscard]] constexpr auto is_control_byte(std::uint8_t byte) noexcept -> bool { 71 return (byte & 0xF0) == 0xC0; 72 } 73 74 /// Is this a reserved control opcode? (0xC8-0xCE) 75 [[nodiscard]] constexpr auto is_reserved_control(std::uint8_t byte) noexcept -> bool { 76 return byte >= 0xC8 && byte <= 0xCE; 77 } 78 79 // ═══════════════════════════════════════════════════════════════════════════════ 80 // Parse mode 81 // Generated from: Cornell.Sigil.ParseMode 82 // ═══════════════════════════════════════════════════════════════════════════════ 83 84 enum class parse_mode : std::uint8_t { 85 text, // Lean: | text 86 think, // Lean: | think 87 tool_call, // Lean: | toolCall 88 code_block, // Lean: | codeBlock 89 }; 90 91 // ═══════════════════════════════════════════════════════════════════════════════ 92 // Ambiguity reasons 93 // Generated from: Cornell.Sigil.AmbiguityReason 94 // ═══════════════════════════════════════════════════════════════════════════════ 95 96 enum class ambiguity_reason : std::uint8_t { 97 unmatched_mode_end, // Lean: | unmatchedModeEnd : ParseMode → AmbiguityReason 98 nested_mode_start, // Lean: | nestedModeStart : ParseMode → ParseMode → AmbiguityReason 99 reserved_opcode, // Lean: | reservedOpcode : UInt8 → AmbiguityReason 100 varint_overflow, // Lean: | varintOverflow : AmbiguityReason 101 upstream_error, // Lean: | upstreamError : String → AmbiguityReason 102 }; 103 104 // ═══════════════════════════════════════════════════════════════════════════════ 105 // Chunk content types 106 // Generated from: Cornell.Sigil.ChunkContent 107 // ═══════════════════════════════════════════════════════════════════════════════ 108 109 struct text_content { 110 std::vector<token_id> tokens; 111 }; 112 113 struct think_content { 114 std::vector<token_id> tokens; 115 }; 116 117 struct tool_call_content { 118 std::vector<token_id> tokens; 119 }; 120 121 struct code_block_content { 122 std::vector<token_id> tokens; 123 }; 124 125 struct stream_end_content {}; 126 127 struct decode_error_content { 128 std::vector<std::byte> bytes; 129 }; 130 131 struct ambiguity_reset_content { 132 ambiguity_reason reason; 133 parse_mode current_mode; 134 parse_mode attempted_mode; 135 std::uint8_t opcode; 136 }; 137 138 using chunk_content = std::variant< 139 text_content, 140 think_content, 141 tool_call_content, 142 code_block_content, 143 stream_end_content, 144 decode_error_content, 145 ambiguity_reset_content 146 >; 147 148 // ═══════════════════════════════════════════════════════════════════════════════ 149 // Chunk 150 // Generated from: Cornell.Sigil.Chunk 151 // ═══════════════════════════════════════════════════════════════════════════════ 152 153 struct chunk { 154 chunk_content content; 155 bool complete; // Lean: complete : Bool -- True if ends on semantic boundary 156 }; 157 158 // ═══════════════════════════════════════════════════════════════════════════════ 159 // Parse result (tri-state) 160 // Generated from: Cornell.Sigil.StrictParseResult 161 // ═══════════════════════════════════════════════════════════════════════════════ 162 163 template <typename T> 164 struct parse_result { 165 struct ok_t { 166 T value; 167 std::size_t consumed; 168 }; 169 struct incomplete_t {}; 170 struct ambiguous_t { 171 ambiguity_reason reason; 172 }; 173 174 std::variant<ok_t, incomplete_t, ambiguous_t> data; 175 176 [[nodiscard]] auto is_ok() const noexcept -> bool { 177 return std::holds_alternative<ok_t>(data); 178 } 179 [[nodiscard]] auto is_incomplete() const noexcept -> bool { 180 return std::holds_alternative<incomplete_t>(data); 181 } 182 [[nodiscard]] auto is_ambiguous() const noexcept -> bool { 183 return std::holds_alternative<ambiguous_t>(data); 184 } 185 186 [[nodiscard]] auto get_ok() const -> ok_t const& { return std::get<ok_t>(data); } 187 188 static auto ok(T value, std::size_t consumed) -> parse_result { 189 return parse_result{ok_t{std::move(value), consumed}}; 190 } 191 static auto incomplete() -> parse_result { return parse_result{incomplete_t{}}; } 192 static auto ambiguous(ambiguity_reason reason) -> parse_result { 193 return parse_result{ambiguous_t{reason}}; 194 } 195 }; 196 197 // ═══════════════════════════════════════════════════════════════════════════════ 198 // Varint decoding (LEB128) 199 // Generated from: Cornell.Sigil.parseVarint 200 // ═══════════════════════════════════════════════════════════════════════════════ 201 202 /// Decode LEB128 varint from byte span. 203 /// Lean: def parseVarint (bs : Bytes) : StrictParseResult (UInt32 × Nat) 204 [[nodiscard]] inline auto decode_varint(std::span<const std::byte> bytes) noexcept 205 -> std::optional<std::pair<token_id, std::size_t>> { 206 if (bytes.empty()) { 207 return std::nullopt; 208 } 209 210 token_id acc = 0; 211 int shift = 0; 212 213 for (std::size_t offset = 0; offset < bytes.size(); ++offset) { 214 auto const byte = static_cast<std::uint8_t>(bytes[offset]); 215 acc |= static_cast<token_id>(byte & 0x7F) << shift; 216 217 if ((byte & 0x80) == 0) { 218 return std::make_pair(acc, offset + 1); 219 } 220 221 shift += 7; 222 // Lean: else if shift >= 28 then .ambiguous .varintOverflow 223 if (shift >= 35) { 224 return std::nullopt; // Overflow 225 } 226 } 227 228 return std::nullopt; // Incomplete 229 } 230 231 // ═══════════════════════════════════════════════════════════════════════════════ 232 // Decode state 233 // Generated from: Cornell.Sigil.DecodeState 234 // ═══════════════════════════════════════════════════════════════════════════════ 235 236 struct sigil_state { 237 parse_mode mode = parse_mode::text; // Lean: parseMode : ParseMode 238 std::vector<token_id> buffer; // Lean: buffer : List TokenId 239 std::vector<std::byte> leftover; // Lean: leftover : Bytes 240 std::vector<chunk> chunks; // Accumulated chunks 241 bool stream_done = false; 242 243 [[nodiscard]] auto is_done() const noexcept -> bool { return stream_done; } 244 }; 245 246 /// Initial decode state (the unique ground state) 247 /// Lean: def initDecodeState : DecodeState := { parseMode := .text, buffer := [], leftover := ByteArray.empty } 248 [[nodiscard]] inline auto init_sigil_state() -> sigil_state { return sigil_state{}; } 249 250 /// Reset to ground state, discarding any accumulated context 251 /// Lean: def resetDecodeState (_s : DecodeState) : DecodeState := initDecodeState 252 /// 253 /// Proven properties (from Cornell.Sigil): 254 /// - reset_is_ground: ∀ s, resetDecodeState s = initDecodeState 255 /// - no_leakage: ∀ s₁ s₂, resetDecodeState s₁ = resetDecodeState s₂ 256 /// - reset_idempotent: ∀ s, resetDecodeState (resetDecodeState s) = resetDecodeState s 257 [[nodiscard]] inline auto reset_sigil_state() -> sigil_state { return sigil_state{}; } 258 259 // ═══════════════════════════════════════════════════════════════════════════════ 260 // Chunk building 261 // Generated from: Cornell.Sigil.buildChunk 262 // ═══════════════════════════════════════════════════════════════════════════════ 263 264 [[nodiscard]] inline auto build_chunk(sigil_state const& state, bool complete) -> chunk { 265 chunk_content content; 266 267 // Lean: let content := match state.parseMode with 268 switch (state.mode) { 269 case parse_mode::text: 270 content = text_content{state.buffer}; 271 break; 272 case parse_mode::think: 273 content = think_content{state.buffer}; 274 break; 275 case parse_mode::tool_call: 276 content = tool_call_content{state.buffer}; 277 break; 278 case parse_mode::code_block: 279 content = code_block_content{state.buffer}; 280 break; 281 } 282 283 return chunk{std::move(content), complete}; 284 } 285 286 // ═══════════════════════════════════════════════════════════════════════════════ 287 // Control byte handling 288 // Generated from: Cornell.Sigil.handleControl 289 // 290 // This is where ambiguity detection happens. The Lean proofs guarantee: 291 // - handleControl_reserved_resets: reserved opcodes always reset 292 // - handleControl_toolCallStart_think_resets: nested starts reset 293 // - handleControl_toolCallEnd_text_resets: unmatched ends reset 294 // - handleControl_ambiguity_is_ground: all resets produce initDecodeState 295 // ═══════════════════════════════════════════════════════════════════════════════ 296 297 struct control_result { 298 sigil_state new_state; 299 std::optional<chunk> emitted_chunk; 300 std::size_t consumed; 301 }; 302 303 [[nodiscard]] inline auto handle_control_byte(sigil_state state, std::uint8_t opcode) 304 -> control_result { 305 switch (opcode) { 306 307 // Lean: | .chunkEnd => let chunk := buildChunk state true 308 // ({ state with buffer := [] }, some chunk) 309 case op_chunk_end: { 310 auto emitted = build_chunk(state, true); 311 state.buffer.clear(); 312 return {std::move(state), std::move(emitted), 1}; 313 } 314 315 // Lean: | .toolCallStart => 316 // match state.parseMode with 317 // | .text => ... valid transition ... 318 // | mode => ... AMBIGUITY: nested mode start ... 319 case op_tool_call_start: { 320 if (state.mode != parse_mode::text) { 321 auto reset = reset_sigil_state(); 322 chunk amb{ambiguity_reset_content{ambiguity_reason::nested_mode_start, state.mode, 323 parse_mode::tool_call, opcode}, true}; 324 return {std::move(reset), std::move(amb), 1}; 325 } 326 std::optional<chunk> pending; 327 if (!state.buffer.empty()) { 328 pending = build_chunk(state, false); 329 } 330 state.mode = parse_mode::tool_call; 331 state.buffer.clear(); 332 return {std::move(state), std::move(pending), 1}; 333 } 334 335 // Lean: | .toolCallEnd => 336 // match state.parseMode with 337 // | .toolCall => ... valid transition ... 338 // | mode => ... AMBIGUITY: end without matching start ... 339 case op_tool_call_end: { 340 if (state.mode != parse_mode::tool_call) { 341 auto reset = reset_sigil_state(); 342 chunk amb{ambiguity_reset_content{ambiguity_reason::unmatched_mode_end, state.mode, 343 parse_mode::tool_call, opcode}, true}; 344 return {std::move(reset), std::move(amb), 1}; 345 } 346 auto emitted = build_chunk(state, true); 347 state.mode = parse_mode::text; 348 state.buffer.clear(); 349 return {std::move(state), std::move(emitted), 1}; 350 } 351 352 case op_think_start: { 353 if (state.mode != parse_mode::text) { 354 auto reset = reset_sigil_state(); 355 chunk amb{ambiguity_reset_content{ambiguity_reason::nested_mode_start, state.mode, 356 parse_mode::think, opcode}, true}; 357 return {std::move(reset), std::move(amb), 1}; 358 } 359 std::optional<chunk> pending; 360 if (!state.buffer.empty()) { 361 pending = build_chunk(state, false); 362 } 363 state.mode = parse_mode::think; 364 state.buffer.clear(); 365 return {std::move(state), std::move(pending), 1}; 366 } 367 368 case op_think_end: { 369 if (state.mode != parse_mode::think) { 370 auto reset = reset_sigil_state(); 371 chunk amb{ambiguity_reset_content{ambiguity_reason::unmatched_mode_end, state.mode, 372 parse_mode::think, opcode}, true}; 373 return {std::move(reset), std::move(amb), 1}; 374 } 375 auto emitted = build_chunk(state, true); 376 state.mode = parse_mode::text; 377 state.buffer.clear(); 378 return {std::move(state), std::move(emitted), 1}; 379 } 380 381 case op_code_block_start: { 382 if (state.mode != parse_mode::text) { 383 auto reset = reset_sigil_state(); 384 chunk amb{ambiguity_reset_content{ambiguity_reason::nested_mode_start, state.mode, 385 parse_mode::code_block, opcode}, true}; 386 return {std::move(reset), std::move(amb), 1}; 387 } 388 std::optional<chunk> pending; 389 if (!state.buffer.empty()) { 390 pending = build_chunk(state, false); 391 } 392 state.mode = parse_mode::code_block; 393 state.buffer.clear(); 394 return {std::move(state), std::move(pending), 1}; 395 } 396 397 case op_code_block_end: { 398 if (state.mode != parse_mode::code_block) { 399 auto reset = reset_sigil_state(); 400 chunk amb{ambiguity_reset_content{ambiguity_reason::unmatched_mode_end, state.mode, 401 parse_mode::code_block, opcode}, true}; 402 return {std::move(reset), std::move(amb), 1}; 403 } 404 auto emitted = build_chunk(state, true); 405 state.mode = parse_mode::text; 406 state.buffer.clear(); 407 return {std::move(state), std::move(emitted), 1}; 408 } 409 410 // Lean: | .flush => let chunk := buildChunk state false 411 // ({ state with buffer := [] }, some chunk) 412 case op_flush: { 413 auto emitted = build_chunk(state, false); 414 state.buffer.clear(); 415 return {std::move(state), std::move(emitted), 1}; 416 } 417 418 // Lean: | .streamEnd => let chunk := if state.buffer.isEmpty 419 // then { content := ChunkContent.streamEnd, complete := true } 420 // else buildChunk state true 421 // (initDecodeState, some chunk) 422 case op_stream_end: { 423 chunk emitted; 424 if (state.buffer.empty()) { 425 emitted = chunk{stream_end_content{}, true}; 426 } else { 427 emitted = build_chunk(state, true); 428 } 429 state = reset_sigil_state(); 430 state.stream_done = true; 431 return {std::move(state), std::move(emitted), 1}; 432 } 433 434 default: 435 // Lean: | .reserved code => let chunk := { content := ChunkContent.ambiguityReset (.reservedOpcode code), complete := true } 436 // (initDecodeState, some chunk) 437 if (is_reserved_control(opcode)) { 438 auto reset = reset_sigil_state(); 439 chunk amb{ambiguity_reset_content{ambiguity_reason::reserved_opcode, state.mode, 440 parse_mode::text, opcode}, true}; 441 return {std::move(reset), std::move(amb), 1}; 442 } 443 // Unknown control outside reserved range, ignore 444 return {std::move(state), std::nullopt, 1}; 445 } 446 } 447 448 // ═══════════════════════════════════════════════════════════════════════════════ 449 // Byte processing 450 // Generated from: Cornell.Sigil.decodeByte 451 // ═══════════════════════════════════════════════════════════════════════════════ 452 453 struct byte_result { 454 sigil_state new_state; 455 std::optional<chunk> emitted_chunk; 456 std::size_t consumed; 457 bool need_more; 458 }; 459 460 [[nodiscard]] inline auto process_byte(sigil_state state, std::span<const std::byte> input) 461 -> byte_result { 462 if (input.empty()) { 463 return {std::move(state), std::nullopt, 0, false}; 464 } 465 466 auto const byte = static_cast<std::uint8_t>(input[0]); 467 auto const rest = input.subspan(1); 468 469 // Lean: if isHotByte b then 470 // let tokenId := b.toUInt32 471 // ({ state with buffer := tokenId :: state.buffer }, none) 472 if (is_hot_byte(byte)) { 473 state.buffer.push_back(static_cast<token_id>(byte)); 474 return {std::move(state), std::nullopt, 1, false}; 475 } 476 477 // Extended token (varint follows) 478 if (is_extended_byte(byte)) { 479 auto varint_result = decode_varint(rest); 480 if (!varint_result) { 481 return {std::move(state), std::nullopt, 0, true}; // Incomplete 482 } 483 auto [token, varint_len] = *varint_result; 484 state.buffer.push_back(token); 485 return {std::move(state), std::nullopt, 1 + varint_len, false}; 486 } 487 488 // Lean: else if isControlByte b then 489 // match decodeOpcode b with 490 // | some op => handleControl state op 491 if (is_control_byte(byte)) { 492 auto ctrl = handle_control_byte(std::move(state), byte); 493 return {std::move(ctrl.new_state), std::move(ctrl.emitted_chunk), ctrl.consumed, false}; 494 } 495 496 // Reserved/unknown - ignore 497 return {std::move(state), std::nullopt, 1, false}; 498 } 499 500 inline auto process_bytes(sigil_state state, std::span<const std::byte> input) 501 -> std::pair<sigil_state, std::vector<chunk>> { 502 std::vector<chunk> chunks; 503 std::size_t offset = 0; 504 505 while (offset < input.size()) { 506 auto result = process_byte(std::move(state), input.subspan(offset)); 507 state = std::move(result.new_state); 508 509 if (result.emitted_chunk) { 510 chunks.push_back(std::move(*result.emitted_chunk)); 511 } 512 513 if (result.need_more) { 514 state.leftover.assign(input.begin() + offset, input.end()); 515 break; 516 } 517 518 if (result.consumed == 0) { 519 break; 520 } 521 522 offset += result.consumed; 523 } 524 525 if (offset >= input.size()) { 526 state.leftover.clear(); 527 } 528 529 return {std::move(state), std::move(chunks)}; 530 } 531 532 // ═══════════════════════════════════════════════════════════════════════════════ 533 // SIGIL Machine 534 // Implements evring::machine concept 535 // ═══════════════════════════════════════════════════════════════════════════════ 536 537 struct sigil_machine { 538 using state_type = sigil_state; 539 540 [[nodiscard]] auto initial() const -> state_type { return init_sigil_state(); } 541 542 [[nodiscard]] auto step(state_type state, event const& ev) const -> step_result<state_type> { 543 std::vector<std::byte> input; 544 input.reserve(state.leftover.size() + ev.data.size()); 545 input.insert(input.end(), state.leftover.begin(), state.leftover.end()); 546 input.insert(input.end(), ev.data.begin(), ev.data.end()); 547 548 auto [new_state, chunks] = process_bytes(std::move(state), input); 549 550 for (auto& c : chunks) { 551 new_state.chunks.push_back(std::move(c)); 552 } 553 554 return {std::move(new_state), {}}; 555 } 556 557 [[nodiscard]] auto done(state_type const& state) const -> bool { return state.stream_done; } 558 559 [[nodiscard]] static auto take_chunks(state_type& state) -> std::vector<chunk> { 560 std::vector<chunk> result; 561 std::swap(result, state.chunks); 562 return result; 563 } 564 }; 565 566 // Verify it satisfies the machine concept 567 static_assert(machine<sigil_machine>); 568 569 570 } // namespace evring::sigil 571