Cradicle Explorer

/ utils / bash / treeSitterAnalysis.ts
treeSitterAnalysis.ts
  1  /**
  2   * Tree-sitter AST analysis utilities for bash command security validation.
  3   *
  4   * These functions extract security-relevant information from tree-sitter
  5   * parse trees, providing more accurate analysis than regex/shell-quote
  6   * parsing. Each function takes a root node and command string, and returns
  7   * structured data that can be used by security validators.
  8   *
  9   * The native NAPI parser returns plain JS objects — no cleanup needed.
 10   */
 11  
 12  type TreeSitterNode = {
 13    type: string
 14    text: string
 15    startIndex: number
 16    endIndex: number
 17    children: TreeSitterNode[]
 18    childCount: number
 19  }
 20  
 21  export type QuoteContext = {
 22    /** Command text with single-quoted content removed (double-quoted content preserved) */
 23    withDoubleQuotes: string
 24    /** Command text with all quoted content removed */
 25    fullyUnquoted: string
 26    /** Like fullyUnquoted but preserves quote characters (', ") */
 27    unquotedKeepQuoteChars: string
 28  }
 29  
 30  export type CompoundStructure = {
 31    /** Whether the command has compound operators (&&, ||, ;) at the top level */
 32    hasCompoundOperators: boolean
 33    /** Whether the command has pipelines */
 34    hasPipeline: boolean
 35    /** Whether the command has subshells */
 36    hasSubshell: boolean
 37    /** Whether the command has command groups ({...}) */
 38    hasCommandGroup: boolean
 39    /** Top-level compound operator types found */
 40    operators: string[]
 41    /** Individual command segments split by compound operators */
 42    segments: string[]
 43  }
 44  
 45  export type DangerousPatterns = {
 46    /** Has $() or backtick command substitution (outside quotes that would make it safe) */
 47    hasCommandSubstitution: boolean
 48    /** Has <() or >() process substitution */
 49    hasProcessSubstitution: boolean
 50    /** Has ${...} parameter expansion */
 51    hasParameterExpansion: boolean
 52    /** Has heredoc */
 53    hasHeredoc: boolean
 54    /** Has comment */
 55    hasComment: boolean
 56  }
 57  
 58  export type TreeSitterAnalysis = {
 59    quoteContext: QuoteContext
 60    compoundStructure: CompoundStructure
 61    /** Whether actual operator nodes (;, &&, ||) exist — if false, \; is just a word argument */
 62    hasActualOperatorNodes: boolean
 63    dangerousPatterns: DangerousPatterns
 64  }
 65  
 66  type QuoteSpans = {
 67    raw: Array<[number, number]> // raw_string (single-quoted)
 68    ansiC: Array<[number, number]> // ansi_c_string ($'...')
 69    double: Array<[number, number]> // string (double-quoted)
 70    heredoc: Array<[number, number]> // quoted heredoc_redirect
 71  }
 72  
 73  /**
 74   * Single-pass collection of all quote-related spans.
 75   * Previously this was 5 separate tree walks (one per type-set plus
 76   * allQuoteTypes plus heredoc); fusing cuts tree-traversal ~5x.
 77   *
 78   * Replicates the per-type walk semantics: each original walk stopped at
 79   * its own type. So the raw_string walk would recurse THROUGH a string
 80   * node (not its type) to reach nested raw_string inside $(...), but the
 81   * string walk would stop at the outer string. We track `inDouble` to
 82   * collect the *outermost* string span per path, while still descending
 83   * into $()/${} bodies to pick up inner raw_string/ansi_c_string.
 84   *
 85   * raw_string / ansi_c_string / quoted-heredoc bodies are literal text
 86   * in bash (no expansion), so no nested quote nodes exist — return early.
 87   */
 88  function collectQuoteSpans(
 89    node: TreeSitterNode,
 90    out: QuoteSpans,
 91    inDouble: boolean,
 92  ): void {
 93    switch (node.type) {
 94      case 'raw_string':
 95        out.raw.push([node.startIndex, node.endIndex])
 96        return // literal body, no nested quotes possible
 97      case 'ansi_c_string':
 98        out.ansiC.push([node.startIndex, node.endIndex])
 99        return // literal body
100      case 'string':
101        // Only collect the outermost string (matches old per-type walk
102        // which stops at first match). Recurse regardless — a nested
103        // $(cmd 'x') inside "..." has a real inner raw_string.
104        if (!inDouble) out.double.push([node.startIndex, node.endIndex])
105        for (const child of node.children) {
106          if (child) collectQuoteSpans(child, out, true)
107        }
108        return
109      case 'heredoc_redirect': {
110        // Quoted heredocs (<<'EOF', <<"EOF", <<\EOF): literal body.
111        // Unquoted (<<EOF) expands $()/${} — the body can contain
112        // $(cmd 'x') whose inner '...' IS a real raw_string node.
113        // Detection: heredoc_start text starts with '/"/\\
114        // Matches sync path's extractHeredocs({ quotedOnly: true }).
115        let isQuoted = false
116        for (const child of node.children) {
117          if (child && child.type === 'heredoc_start') {
118            const first = child.text[0]
119            isQuoted = first === "'" || first === '"' || first === '\\'
120            break
121          }
122        }
123        if (isQuoted) {
124          out.heredoc.push([node.startIndex, node.endIndex])
125          return // literal body, no nested quote nodes
126        }
127        // Unquoted: recurse into heredoc_body → command_substitution →
128        // inner quote nodes. The original per-type walks did NOT stop at
129        // heredoc_redirect (not in their type sets), so they recursed here.
130        break
131      }
132    }
133  
134    for (const child of node.children) {
135      if (child) collectQuoteSpans(child, out, inDouble)
136    }
137  }
138  
139  /**
140   * Builds a Set of all character positions covered by the given spans.
141   */
142  function buildPositionSet(spans: Array<[number, number]>): Set<number> {
143    const set = new Set<number>()
144    for (const [start, end] of spans) {
145      for (let i = start; i < end; i++) {
146        set.add(i)
147      }
148    }
149    return set
150  }
151  
152  /**
153   * Drops spans that are fully contained within another span, keeping only the
154   * outermost. Nested quotes (e.g., `"$(echo 'hi')"`) yield overlapping spans
155   * — the inner raw_string is found by recursing into the outer string node.
156   * Processing overlapping spans corrupts indices since removing/replacing the
157   * outer span shifts the inner span's start/end into stale positions.
158   */
159  function dropContainedSpans<T extends readonly [number, number, ...unknown[]]>(
160    spans: T[],
161  ): T[] {
162    return spans.filter(
163      (s, i) =>
164        !spans.some(
165          (other, j) =>
166            j !== i &&
167            other[0] <= s[0] &&
168            other[1] >= s[1] &&
169            (other[0] < s[0] || other[1] > s[1]),
170        ),
171    )
172  }
173  
174  /**
175   * Removes spans from a string, returning the string with those character
176   * ranges removed.
177   */
178  function removeSpans(command: string, spans: Array<[number, number]>): string {
179    if (spans.length === 0) return command
180  
181    // Drop inner spans that are fully contained in an outer one, then sort by
182    // start index descending so we can splice without offset shifts.
183    const sorted = dropContainedSpans(spans).sort((a, b) => b[0] - a[0])
184    let result = command
185    for (const [start, end] of sorted) {
186      result = result.slice(0, start) + result.slice(end)
187    }
188    return result
189  }
190  
191  /**
192   * Replaces spans with just the quote delimiters (preserving ' and " characters).
193   */
194  function replaceSpansKeepQuotes(
195    command: string,
196    spans: Array<[number, number, string, string]>,
197  ): string {
198    if (spans.length === 0) return command
199  
200    const sorted = dropContainedSpans(spans).sort((a, b) => b[0] - a[0])
201    let result = command
202    for (const [start, end, open, close] of sorted) {
203      // Replace content but keep the quote delimiters
204      result = result.slice(0, start) + open + close + result.slice(end)
205    }
206    return result
207  }
208  
209  /**
210   * Extract quote context from the tree-sitter AST.
211   * Replaces the manual character-by-character extractQuotedContent() function.
212   *
213   * Tree-sitter node types:
214   * - raw_string: single-quoted ('...')
215   * - string: double-quoted ("...")
216   * - ansi_c_string: ANSI-C quoting ($'...') — span includes the leading $
217   * - heredoc_redirect: QUOTED heredocs only (<<'EOF', <<"EOF", <<\EOF) —
218   *   the full redirect span (<<, delimiters, body, newlines) is stripped
219   *   since the body is literal text in bash (no expansion). UNQUOTED
220   *   heredocs (<<EOF) are left in place since bash expands $(...)/${...}
221   *   inside them, and validators need to see those patterns. Matches the
222   *   sync path's extractHeredocs({ quotedOnly: true }).
223   */
224  export function extractQuoteContext(
225    rootNode: unknown,
226    command: string,
227  ): QuoteContext {
228    // Single walk collects all quote span types at once.
229    const spans: QuoteSpans = { raw: [], ansiC: [], double: [], heredoc: [] }
230    collectQuoteSpans(rootNode as TreeSitterNode, spans, false)
231    const singleQuoteSpans = spans.raw
232    const ansiCSpans = spans.ansiC
233    const doubleQuoteSpans = spans.double
234    const quotedHeredocSpans = spans.heredoc
235    const allQuoteSpans = [
236      ...singleQuoteSpans,
237      ...ansiCSpans,
238      ...doubleQuoteSpans,
239      ...quotedHeredocSpans,
240    ]
241  
242    // Build a set of positions that should be excluded for each output variant.
243    // For withDoubleQuotes: remove single-quoted spans entirely, plus the
244    // opening/closing `"` delimiters of double-quoted spans (but keep the
245    // content between them). This matches the regex extractQuotedContent()
246    // semantics where `"` toggles quote state but content is still emitted.
247    const singleQuoteSet = buildPositionSet([
248      ...singleQuoteSpans,
249      ...ansiCSpans,
250      ...quotedHeredocSpans,
251    ])
252    const doubleQuoteDelimSet = new Set<number>()
253    for (const [start, end] of doubleQuoteSpans) {
254      doubleQuoteDelimSet.add(start) // opening "
255      doubleQuoteDelimSet.add(end - 1) // closing "
256    }
257    let withDoubleQuotes = ''
258    for (let i = 0; i < command.length; i++) {
259      if (singleQuoteSet.has(i)) continue
260      if (doubleQuoteDelimSet.has(i)) continue
261      withDoubleQuotes += command[i]
262    }
263  
264    // fullyUnquoted: remove all quoted content
265    const fullyUnquoted = removeSpans(command, allQuoteSpans)
266  
267    // unquotedKeepQuoteChars: remove content but keep delimiter chars
268    const spansWithQuoteChars: Array<[number, number, string, string]> = []
269    for (const [start, end] of singleQuoteSpans) {
270      spansWithQuoteChars.push([start, end, "'", "'"])
271    }
272    for (const [start, end] of ansiCSpans) {
273      // ansi_c_string spans include the leading $; preserve it so this
274      // matches the regex path, which treats $ as unquoted preceding '.
275      spansWithQuoteChars.push([start, end, "$'", "'"])
276    }
277    for (const [start, end] of doubleQuoteSpans) {
278      spansWithQuoteChars.push([start, end, '"', '"'])
279    }
280    for (const [start, end] of quotedHeredocSpans) {
281      // Heredoc redirect spans have no inline quote delimiters — strip entirely.
282      spansWithQuoteChars.push([start, end, '', ''])
283    }
284    const unquotedKeepQuoteChars = replaceSpansKeepQuotes(
285      command,
286      spansWithQuoteChars,
287    )
288  
289    return { withDoubleQuotes, fullyUnquoted, unquotedKeepQuoteChars }
290  }
291  
292  /**
293   * Extract compound command structure from the AST.
294   * Replaces isUnsafeCompoundCommand() and splitCommand() for tree-sitter path.
295   */
296  export function extractCompoundStructure(
297    rootNode: unknown,
298    command: string,
299  ): CompoundStructure {
300    const n = rootNode as TreeSitterNode
301    const operators: string[] = []
302    const segments: string[] = []
303    let hasSubshell = false
304    let hasCommandGroup = false
305    let hasPipeline = false
306  
307    // Walk top-level children of the program node
308    function walkTopLevel(node: TreeSitterNode): void {
309      for (const child of node.children) {
310        if (!child) continue
311  
312        if (child.type === 'list') {
313          // list nodes contain && and || operators
314          for (const listChild of child.children) {
315            if (!listChild) continue
316            if (listChild.type === '&&' || listChild.type === '||') {
317              operators.push(listChild.type)
318            } else if (
319              listChild.type === 'list' ||
320              listChild.type === 'redirected_statement'
321            ) {
322              // Nested list, or redirected_statement wrapping a list/pipeline —
323              // recurse so inner operators/pipelines are detected. For
324              // `cmd1 && cmd2 2>/dev/null && cmd3`, the redirected_statement
325              // wraps `list(cmd1 && cmd2)` — the inner `&&` would be missed
326              // without recursion.
327              walkTopLevel({ ...node, children: [listChild] } as TreeSitterNode)
328            } else if (listChild.type === 'pipeline') {
329              hasPipeline = true
330              segments.push(listChild.text)
331            } else if (listChild.type === 'subshell') {
332              hasSubshell = true
333              segments.push(listChild.text)
334            } else if (listChild.type === 'compound_statement') {
335              hasCommandGroup = true
336              segments.push(listChild.text)
337            } else {
338              segments.push(listChild.text)
339            }
340          }
341        } else if (child.type === ';') {
342          operators.push(';')
343        } else if (child.type === 'pipeline') {
344          hasPipeline = true
345          segments.push(child.text)
346        } else if (child.type === 'subshell') {
347          hasSubshell = true
348          segments.push(child.text)
349        } else if (child.type === 'compound_statement') {
350          hasCommandGroup = true
351          segments.push(child.text)
352        } else if (
353          child.type === 'command' ||
354          child.type === 'declaration_command' ||
355          child.type === 'variable_assignment'
356        ) {
357          segments.push(child.text)
358        } else if (child.type === 'redirected_statement') {
359          // `cd ~/src && find path 2>/dev/null` — tree-sitter wraps the ENTIRE
360          // compound in a redirected_statement: program → redirected_statement →
361          // (list → cmd1, &&, cmd2) + file_redirect. Same for `cmd1 | cmd2 > out`
362          // (wraps pipeline) and `(cmd) > out` (wraps subshell). Recurse to
363          // detect the inner structure; skip file_redirect children (redirects
364          // don't affect compound/pipeline classification).
365          let foundInner = false
366          for (const inner of child.children) {
367            if (!inner || inner.type === 'file_redirect') continue
368            foundInner = true
369            walkTopLevel({ ...child, children: [inner] } as TreeSitterNode)
370          }
371          if (!foundInner) {
372            // Standalone redirect with no body (shouldn't happen, but fail-safe)
373            segments.push(child.text)
374          }
375        } else if (child.type === 'negated_command') {
376          // `! cmd` — recurse into the inner command so its structure is
377          // classified (pipeline/subshell/etc.), but also record the full
378          // negated text as a segment so segments.length stays meaningful.
379          segments.push(child.text)
380          walkTopLevel(child)
381        } else if (
382          child.type === 'if_statement' ||
383          child.type === 'while_statement' ||
384          child.type === 'for_statement' ||
385          child.type === 'case_statement' ||
386          child.type === 'function_definition'
387        ) {
388          // Control-flow constructs: the construct itself is one segment,
389          // but recurse so inner pipelines/subshells/operators are detected.
390          segments.push(child.text)
391          walkTopLevel(child)
392        }
393      }
394    }
395  
396    walkTopLevel(n)
397  
398    // If no segments found, the whole command is one segment
399    if (segments.length === 0) {
400      segments.push(command)
401    }
402  
403    return {
404      hasCompoundOperators: operators.length > 0,
405      hasPipeline,
406      hasSubshell,
407      hasCommandGroup,
408      operators,
409      segments,
410    }
411  }
412  
413  /**
414   * Check whether the AST contains actual operator nodes (;, &&, ||).
415   *
416   * This is the key function for eliminating the `find -exec \;` false positive.
417   * Tree-sitter parses `\;` as part of a `word` node (an argument to find),
418   * NOT as a `;` operator. So if no actual `;` operator nodes exist in the AST,
419   * there are no compound operators and hasBackslashEscapedOperator() can be skipped.
420   */
421  export function hasActualOperatorNodes(rootNode: unknown): boolean {
422    const n = rootNode as TreeSitterNode
423  
424    function walk(node: TreeSitterNode): boolean {
425      // Check for operator types that indicate compound commands
426      if (node.type === ';' || node.type === '&&' || node.type === '||') {
427        // Verify this is a child of a list or program, not inside a command
428        return true
429      }
430  
431      if (node.type === 'list') {
432        // A list node means there are compound operators
433        return true
434      }
435  
436      for (const child of node.children) {
437        if (child && walk(child)) return true
438      }
439      return false
440    }
441  
442    return walk(n)
443  }
444  
445  /**
446   * Extract dangerous pattern information from the AST.
447   */
448  export function extractDangerousPatterns(rootNode: unknown): DangerousPatterns {
449    const n = rootNode as TreeSitterNode
450    let hasCommandSubstitution = false
451    let hasProcessSubstitution = false
452    let hasParameterExpansion = false
453    let hasHeredoc = false
454    let hasComment = false
455  
456    function walk(node: TreeSitterNode): void {
457      switch (node.type) {
458        case 'command_substitution':
459          hasCommandSubstitution = true
460          break
461        case 'process_substitution':
462          hasProcessSubstitution = true
463          break
464        case 'expansion':
465          hasParameterExpansion = true
466          break
467        case 'heredoc_redirect':
468          hasHeredoc = true
469          break
470        case 'comment':
471          hasComment = true
472          break
473      }
474  
475      for (const child of node.children) {
476        if (child) walk(child)
477      }
478    }
479  
480    walk(n)
481  
482    return {
483      hasCommandSubstitution,
484      hasProcessSubstitution,
485      hasParameterExpansion,
486      hasHeredoc,
487      hasComment,
488    }
489  }
490  
491  /**
492   * Perform complete tree-sitter analysis of a command.
493   * Extracts all security-relevant data from the AST in one pass.
494   * This data must be extracted before tree.delete() is called.
495   */
496  export function analyzeCommand(
497    rootNode: unknown,
498    command: string,
499  ): TreeSitterAnalysis {
500    return {
501      quoteContext: extractQuoteContext(rootNode, command),
502      compoundStructure: extractCompoundStructure(rootNode, command),
503      hasActualOperatorNodes: hasActualOperatorNodes(rootNode),
504      dangerousPatterns: extractDangerousPatterns(rootNode),
505    }
506  }