/ ink / termio / tokenize.ts
tokenize.ts
  1  /**
  2   * Input Tokenizer - Escape sequence boundary detection
  3   *
  4   * Splits terminal input into tokens: text chunks and raw escape sequences.
  5   * Unlike the Parser which interprets sequences semantically, this just
  6   * identifies boundaries for use by keyboard input parsing.
  7   */
  8  
  9  import { C0, ESC_TYPE, isEscFinal } from './ansi.js'
 10  import { isCSIFinal, isCSIIntermediate, isCSIParam } from './csi.js'
 11  
/**
 * One unit of tokenizer output. In both variants `value` is the raw,
 * uninterpreted slice of the input that the token covers.
 */
export type Token =
  // Characters outside any escape sequence. Also used when a started
  // sequence turns out to be invalid and is handed back as plain text.
  | { type: 'text'; value: string }
  // A raw escape sequence starting at its ESC character. May be incomplete
  // when it was forced out by a flush.
  | { type: 'sequence'; value: string }
 15  
/**
 * Scanner states used by `tokenize`. Everything except 'ground' means an
 * escape sequence has been started and not yet finished.
 */
type State =
  // Not inside an escape sequence; characters accumulate as text.
  | 'ground'
  // ESC has been seen; the next character selects the sequence kind.
  | 'escape'
  // ESC plus one or more intermediate bytes; waiting for the final byte.
  | 'escapeIntermediate'
  // Inside a CSI sequence; ends at a CSI final byte.
  | 'csi'
  // After ESC O; exactly one final byte is expected.
  | 'ss3'
  // String-style sequences; they run until BEL or ESC followed by the
  // ST byte (ESC_TYPE.ST).
  | 'osc'
  | 'dcs'
  | 'apc'
 25  
/**
 * Stateful streaming tokenizer handle returned by `createTokenizer`.
 * An escape sequence that is split across `feed` calls is held back until
 * it completes or is flushed.
 */
export type Tokenizer = {
  /**
   * Feed input and get the resulting tokens. A trailing sequence that is
   * still incomplete is kept in the internal buffer instead of being returned.
   */
  feed(input: string): Token[]
  /**
   * Flush any buffered incomplete sequence: it is returned as a 'sequence'
   * token as-is and the tokenizer goes back to its ground state.
   */
  flush(): Token[]
  /** Reset tokenizer state, discarding anything buffered */
  reset(): void
  /** Get the currently buffered incomplete sequence ('' when there is none) */
  buffer(): string
}
 36  
/** Optional behaviour switches accepted by `createTokenizer`. */
type TokenizerOptions = {
  /**
   * Treat `CSI M` as an X10 mouse event prefix and consume 3 payload bytes.
   * Only enable for stdin input — `\x1b[M` is also CSI DL (Delete Lines) in
   * output streams, and enabling this there swallows display text.
   * Defaults to false when omitted.
   */
  x10Mouse?: boolean
}
 45  
 46  /**
 47   * Create a streaming tokenizer for terminal input.
 48   *
 49   * Usage:
 50   * ```typescript
 51   * const tokenizer = createTokenizer()
 52   * const tokens1 = tokenizer.feed('hello\x1b[')
 53   * const tokens2 = tokenizer.feed('A')  // completes the escape sequence
 54   * const remaining = tokenizer.flush()  // force output incomplete sequences
 55   * ```
 56   */
 57  export function createTokenizer(options?: TokenizerOptions): Tokenizer {
 58    let currentState: State = 'ground'
 59    let currentBuffer = ''
 60    const x10Mouse = options?.x10Mouse ?? false
 61  
 62    return {
 63      feed(input: string): Token[] {
 64        const result = tokenize(
 65          input,
 66          currentState,
 67          currentBuffer,
 68          false,
 69          x10Mouse,
 70        )
 71        currentState = result.state.state
 72        currentBuffer = result.state.buffer
 73        return result.tokens
 74      },
 75  
 76      flush(): Token[] {
 77        const result = tokenize('', currentState, currentBuffer, true, x10Mouse)
 78        currentState = result.state.state
 79        currentBuffer = result.state.buffer
 80        return result.tokens
 81      },
 82  
 83      reset(): void {
 84        currentState = 'ground'
 85        currentBuffer = ''
 86      },
 87  
 88      buffer(): string {
 89        return currentBuffer
 90      },
 91    }
 92  }
 93  
/** Scanner state carried from one `tokenize` call to the next. */
type InternalState = {
  // State the scanner was in when the input ran out.
  state: State
  // Text of the unfinished sequence ('' when nothing is pending); it is
  // prepended to the input of the next call.
  buffer: string
}
 98  
 99  function tokenize(
100    input: string,
101    initialState: State,
102    initialBuffer: string,
103    flush: boolean,
104    x10Mouse: boolean,
105  ): { tokens: Token[]; state: InternalState } {
106    const tokens: Token[] = []
107    const result: InternalState = {
108      state: initialState,
109      buffer: '',
110    }
111  
112    const data = initialBuffer + input
113    let i = 0
114    let textStart = 0
115    let seqStart = 0
116  
117    const flushText = (): void => {
118      if (i > textStart) {
119        const text = data.slice(textStart, i)
120        if (text) {
121          tokens.push({ type: 'text', value: text })
122        }
123      }
124      textStart = i
125    }
126  
127    const emitSequence = (seq: string): void => {
128      if (seq) {
129        tokens.push({ type: 'sequence', value: seq })
130      }
131      result.state = 'ground'
132      textStart = i
133    }
134  
135    while (i < data.length) {
136      const code = data.charCodeAt(i)
137  
138      switch (result.state) {
139        case 'ground':
140          if (code === C0.ESC) {
141            flushText()
142            seqStart = i
143            result.state = 'escape'
144            i++
145          } else {
146            i++
147          }
148          break
149  
150        case 'escape':
151          if (code === ESC_TYPE.CSI) {
152            result.state = 'csi'
153            i++
154          } else if (code === ESC_TYPE.OSC) {
155            result.state = 'osc'
156            i++
157          } else if (code === ESC_TYPE.DCS) {
158            result.state = 'dcs'
159            i++
160          } else if (code === ESC_TYPE.APC) {
161            result.state = 'apc'
162            i++
163          } else if (code === 0x4f) {
164            // 'O' - SS3
165            result.state = 'ss3'
166            i++
167          } else if (isCSIIntermediate(code)) {
168            // Intermediate byte (e.g., ESC ( for charset) - continue buffering
169            result.state = 'escapeIntermediate'
170            i++
171          } else if (isEscFinal(code)) {
172            // Two-character escape sequence
173            i++
174            emitSequence(data.slice(seqStart, i))
175          } else if (code === C0.ESC) {
176            // Double escape - emit first, start new
177            emitSequence(data.slice(seqStart, i))
178            seqStart = i
179            result.state = 'escape'
180            i++
181          } else {
182            // Invalid - treat ESC as text
183            result.state = 'ground'
184            textStart = seqStart
185          }
186          break
187  
188        case 'escapeIntermediate':
189          // After intermediate byte(s), wait for final byte
190          if (isCSIIntermediate(code)) {
191            // More intermediate bytes
192            i++
193          } else if (isEscFinal(code)) {
194            // Final byte - complete the sequence
195            i++
196            emitSequence(data.slice(seqStart, i))
197          } else {
198            // Invalid - treat as text
199            result.state = 'ground'
200            textStart = seqStart
201          }
202          break
203  
204        case 'csi':
205          // X10 mouse: CSI M + 3 raw payload bytes (Cb+32, Cx+32, Cy+32).
206          // M immediately after [ (offset 2) means no params — SGR mouse
207          // (CSI < … M) has a `<` param byte first and reaches M at offset > 2.
208          // Terminals that ignore DECSET 1006 but honor 1000/1002 emit this
209          // legacy encoding; without this branch the 3 payload bytes leak
210          // through as text (`` `rK `` / `arK` garbage in the prompt).
211          //
212          // Gated on x10Mouse — `\x1b[M` is also CSI DL (Delete Lines) and
213          // blindly consuming 3 chars corrupts output rendering (Parser/Ansi)
214          // and fragments bracketed-paste PASTE_END. Only stdin enables this.
215          // The ≥0x20 check on each payload slot is belt-and-suspenders: X10
216          // guarantees Cb≥32, Cx≥33, Cy≥33, so a control byte (ESC=0x1B) in
217          // any slot means this is CSI DL adjacent to another sequence, not a
218          // mouse event. Checking all three slots prevents PASTE_END's ESC
219          // from being consumed when paste content ends in `\x1b[M`+0-2 chars.
220          //
221          // Known limitation: this counts JS string chars, but X10 is byte-
222          // oriented and stdin uses utf8 encoding (App.tsx). At col 162-191 ×
223          // row 96-159 the two coord bytes (0xC2-0xDF, 0x80-0xBF) form a valid
224          // UTF-8 2-byte sequence and collapse to one char — the length check
225          // fails and the event buffers until the next keypress absorbs it.
226          // Fixing this requires latin1 stdin; X10's 223-coord cap is exactly
227          // why SGR was invented, and no-SGR terminals at 162+ cols are rare.
228          if (
229            x10Mouse &&
230            code === 0x4d /* M */ &&
231            i - seqStart === 2 &&
232            (i + 1 >= data.length || data.charCodeAt(i + 1) >= 0x20) &&
233            (i + 2 >= data.length || data.charCodeAt(i + 2) >= 0x20) &&
234            (i + 3 >= data.length || data.charCodeAt(i + 3) >= 0x20)
235          ) {
236            if (i + 4 <= data.length) {
237              i += 4
238              emitSequence(data.slice(seqStart, i))
239            } else {
240              // Incomplete — exit loop; end-of-input buffers from seqStart.
241              // Re-entry re-tokenizes from ground via the invalid-CSI fallthrough.
242              i = data.length
243            }
244            break
245          }
246          if (isCSIFinal(code)) {
247            i++
248            emitSequence(data.slice(seqStart, i))
249          } else if (isCSIParam(code) || isCSIIntermediate(code)) {
250            i++
251          } else {
252            // Invalid CSI - abort, treat as text
253            result.state = 'ground'
254            textStart = seqStart
255          }
256          break
257  
258        case 'ss3':
259          // SS3 sequences: ESC O followed by a single final byte
260          if (code >= 0x40 && code <= 0x7e) {
261            i++
262            emitSequence(data.slice(seqStart, i))
263          } else {
264            // Invalid - treat as text
265            result.state = 'ground'
266            textStart = seqStart
267          }
268          break
269  
270        case 'osc':
271          if (code === C0.BEL) {
272            i++
273            emitSequence(data.slice(seqStart, i))
274          } else if (
275            code === C0.ESC &&
276            i + 1 < data.length &&
277            data.charCodeAt(i + 1) === ESC_TYPE.ST
278          ) {
279            i += 2
280            emitSequence(data.slice(seqStart, i))
281          } else {
282            i++
283          }
284          break
285  
286        case 'dcs':
287        case 'apc':
288          if (code === C0.BEL) {
289            i++
290            emitSequence(data.slice(seqStart, i))
291          } else if (
292            code === C0.ESC &&
293            i + 1 < data.length &&
294            data.charCodeAt(i + 1) === ESC_TYPE.ST
295          ) {
296            i += 2
297            emitSequence(data.slice(seqStart, i))
298          } else {
299            i++
300          }
301          break
302      }
303    }
304  
305    // Handle end of input
306    if (result.state === 'ground') {
307      flushText()
308    } else if (flush) {
309      // Force output incomplete sequence
310      const remaining = data.slice(seqStart)
311      if (remaining) tokens.push({ type: 'sequence', value: remaining })
312      result.state = 'ground'
313    } else {
314      // Buffer incomplete sequence for next call
315      result.buffer = data.slice(seqStart)
316    }
317  
318    return { tokens, state: result }
319  }