/ utils / bash / parser.ts
parser.ts
  1  import { feature } from 'bun:bundle'
  2  import { logEvent } from '../../services/analytics/index.js'
  3  import { logForDebugging } from '../debug.js'
  4  import {
  5    ensureParserInitialized,
  6    getParserModule,
  7    type TsNode,
  8  } from './bashParser.js'
  9  
/**
 * Syntax-tree node type used throughout this module. It is a plain alias of
 * `TsNode` from `./bashParser.js`, re-exported under a shorter name.
 */
export type Node = TsNode
 11  
/**
 * Result produced by `parseCommand` for a successfully parsed command.
 */
export interface ParsedCommandData {
  /** Root of the tree returned by the parser module's `parse()` call. */
  rootNode: Node
  /**
   * Text of the `variable_assignment` children that precede the command
   * name inside `commandNode` (empty when there is no such prefix or when
   * `commandNode` is not a plain `command` node).
   */
  envVars: string[]
  /**
   * First `command` / `declaration_command` node located under `rootNode`,
   * or `null` when the tree contains none.
   */
  commandNode: Node | null
  /** The exact string that was passed to `parseCommand`. */
  originalCommand: string
}
 18  
 19  const MAX_COMMAND_LENGTH = 10000
 20  const DECLARATION_COMMANDS = new Set([
 21    'export',
 22    'declare',
 23    'typeset',
 24    'readonly',
 25    'local',
 26    'unset',
 27    'unsetenv',
 28  ])
 29  const ARGUMENT_TYPES = new Set(['word', 'string', 'raw_string', 'number'])
 30  const SUBSTITUTION_TYPES = new Set([
 31    'command_substitution',
 32    'process_substitution',
 33  ])
 34  const COMMAND_TYPES = new Set(['command', 'declaration_command'])
 35  
 36  let logged = false
 37  function logLoadOnce(success: boolean): void {
 38    if (logged) return
 39    logged = true
 40    logForDebugging(
 41      success ? 'tree-sitter: native module loaded' : 'tree-sitter: unavailable',
 42    )
 43    logEvent('tengu_tree_sitter_load', { success })
 44  }
 45  
/**
 * Awaits WASM init (Parser.init + Language.load). Must be called before
 * parseCommand/parseCommandRaw for the parser to be available. Idempotent.
 *
 * Only does work when the `TREE_SITTER_BASH` or `TREE_SITTER_BASH_SHADOW`
 * feature is enabled; with both off it resolves without touching the parser.
 * The initialization itself is delegated to `ensureParserInitialized` from
 * `./bashParser.js`.
 *
 * @returns A promise that settles once `ensureParserInitialized` has
 *   settled (or immediately when neither feature is enabled).
 */
export async function ensureInitialized(): Promise<void> {
  // Keep `feature()` directly in the `if` condition: parseCommand's gate
  // comment relies on this shape for Bun's dead-code elimination.
  if (feature('TREE_SITTER_BASH') || feature('TREE_SITTER_BASH_SHADOW')) {
    await ensureParserInitialized()
  }
}
 55  
 56  export async function parseCommand(
 57    command: string,
 58  ): Promise<ParsedCommandData | null> {
 59    if (!command || command.length > MAX_COMMAND_LENGTH) return null
 60  
 61    // Gate: ant-only until pentest. External builds fall back to legacy
 62    // regex/shell-quote path. Guarding the whole body inside the positive
 63    // branch lets Bun DCE the NAPI import AND keeps telemetry honest — we
 64    // only fire tengu_tree_sitter_load when a load was genuinely attempted.
 65    if (feature('TREE_SITTER_BASH')) {
 66      await ensureParserInitialized()
 67      const mod = getParserModule()
 68      logLoadOnce(mod !== null)
 69      if (!mod) return null
 70  
 71      try {
 72        const rootNode = mod.parse(command)
 73        if (!rootNode) return null
 74  
 75        const commandNode = findCommandNode(rootNode, null)
 76        const envVars = extractEnvVars(commandNode)
 77  
 78        return { rootNode, envVars, commandNode, originalCommand: command }
 79      } catch {
 80        return null
 81      }
 82    }
 83    return null
 84  }
 85  
/**
 * SECURITY: Sentinel for "parser was loaded and attempted, but aborted"
 * (timeout / node budget / Rust panic). Distinct from `null` (module not
 * loaded). Adversarial input can trigger abort under MAX_COMMAND_LENGTH:
 * `(( a[0][0]... ))` with ~2800 subscripts hits PARSE_TIMEOUT_MICROS.
 * Callers MUST treat this as fail-closed (too-complex), NOT route to legacy.
 *
 * Produced by parseCommandRaw when the loaded module's `parse()` returns
 * `null` or throws.
 *
 * NOTE(review): the comment inside parseCommandRaw names the timeout
 * constant PARSE_TIMEOUT_MS rather than PARSE_TIMEOUT_MICROS — confirm the
 * actual name/unit in bashParser.ts.
 */
export const PARSE_ABORTED = Symbol('parse-aborted')
 94  
 95  /**
 96   * Raw parse — skips findCommandNode/extractEnvVars which the security
 97   * walker in ast.ts doesn't use. Saves one tree walk per bash command.
 98   *
 99   * Returns:
100   *   - Node: parse succeeded
101   *   - null: module not loaded / feature off / empty / over-length
102   *   - PARSE_ABORTED: module loaded but parse failed (timeout/panic)
103   */
104  export async function parseCommandRaw(
105    command: string,
106  ): Promise<Node | null | typeof PARSE_ABORTED> {
107    if (!command || command.length > MAX_COMMAND_LENGTH) return null
108    if (feature('TREE_SITTER_BASH') || feature('TREE_SITTER_BASH_SHADOW')) {
109      await ensureParserInitialized()
110      const mod = getParserModule()
111      logLoadOnce(mod !== null)
112      if (!mod) return null
113      try {
114        const result = mod.parse(command)
115        // SECURITY: Module loaded; null here = timeout/node-budget abort in
116        // bashParser.ts (PARSE_TIMEOUT_MS=50, MAX_NODES=50_000).
117        // Previously collapsed into `return null` → parse-unavailable → legacy
118        // path, which lacks EVAL_LIKE_BUILTINS — `trap`, `enable`, `hash` leaked.
119        if (result === null) {
120          logEvent('tengu_tree_sitter_parse_abort', {
121            cmdLength: command.length,
122            panic: false,
123          })
124          return PARSE_ABORTED
125        }
126        return result
127      } catch {
128        logEvent('tengu_tree_sitter_parse_abort', {
129          cmdLength: command.length,
130          panic: true,
131        })
132        return PARSE_ABORTED
133      }
134    }
135    return null
136  }
137  
/**
 * Depth-first search for the first node whose type is in COMMAND_TYPES.
 *
 * @param node - Node at which to start searching.
 * @param parent - Parent of `node`, or `null` at the root. Only consulted
 *   when `node` is a `variable_assignment`.
 * @returns The first matching node, or `null` when none is found.
 */
function findCommandNode(node: Node, parent: Node | null): Node | null {
  const kind = node.type
  const kids = node.children

  if (COMMAND_TYPES.has(kind)) {
    return node
  }

  // Assignment followed by a command: pick the first command-typed sibling
  // that starts after this assignment. No descent into the assignment.
  if (kind === 'variable_assignment' && parent) {
    const laterCommand = parent.children.find(
      sibling =>
        COMMAND_TYPES.has(sibling.type) && sibling.startIndex > node.startIndex,
    )
    return laterCommand ?? null
  }

  // Redirected statement: only its direct children are inspected.
  if (kind === 'redirected_statement') {
    const direct = kids.find(kid => COMMAND_TYPES.has(kid.type))
    return direct ?? null
  }

  // Pipelines and every other node type: try each child in order and return
  // the first hit (a pipeline stage may itself be a redirected_statement).
  for (const kid of kids) {
    const hit = findCommandNode(kid, node)
    if (hit) {
      return hit
    }
  }
  return null
}
174  
/**
 * Collects the text of the `variable_assignment` children that appear before
 * the command name of a `command` node (e.g. the `A=1 B=2` in `A=1 B=2 ls`).
 *
 * @param commandNode - Node to inspect; may be `null`.
 * @returns The assignment texts in source order. Empty when `commandNode` is
 *   `null` or is not of type `command`.
 */
function extractEnvVars(commandNode: Node | null): string[] {
  if (!commandNode) return []
  if (commandNode.type !== 'command') return []

  const assignments: string[] = []
  for (const child of commandNode.children) {
    switch (child.type) {
      case 'variable_assignment':
        assignments.push(child.text)
        break
      // The command name ends the prefix; anything after it is ignored.
      case 'command_name':
      case 'word':
        return assignments
      default:
        // Other node types are skipped without ending the scan.
        break
    }
  }
  return assignments
}
188  
/**
 * Flattens a command node into `[name, ...args]`.
 *
 * - `declaration_command`: returns just the leading keyword when it is one
 *   of DECLARATION_COMMANDS, otherwise an empty list.
 * - Any other node: `variable_assignment` children are skipped; the command
 *   name (a `command_name` child, or the first `word` seen before a name) is
 *   added verbatim; children whose type is in ARGUMENT_TYPES are added with
 *   one pair of matching outer quotes removed; collection stops at the first
 *   child whose type is in SUBSTITUTION_TYPES. Other child types are ignored.
 *
 * @param commandNode - Command or declaration-command node to flatten.
 * @returns The collected name and arguments in source order.
 */
export function extractCommandArguments(commandNode: Node): string[] {
  if (commandNode.type === 'declaration_command') {
    const keyword = commandNode.children[0]
    if (keyword && DECLARATION_COMMANDS.has(keyword.text)) {
      return [keyword.text]
    }
    return []
  }

  const collected: string[] = []
  let sawName = false

  for (const child of commandNode.children) {
    const kind = child.type

    if (kind === 'variable_assignment') {
      // Env-var prefix — not part of the argument list.
      continue
    }

    if (kind === 'command_name' || (kind === 'word' && !sawName)) {
      // The command name is kept verbatim (no quote stripping).
      sawName = true
      collected.push(child.text)
    } else if (ARGUMENT_TYPES.has(kind)) {
      collected.push(stripQuotes(child.text))
    } else if (SUBSTITUTION_TYPES.has(kind)) {
      // Nothing at or after a substitution is reported.
      break
    }
  }

  return collected
}
223  
/**
 * Removes one pair of matching outer quotes from `text`.
 *
 * @param text - Raw token text.
 * @returns `text` without its first and last character when it is at least
 *   two characters long and both ends are `"` or both are `'`; otherwise
 *   `text` unchanged. Inner content is not unescaped.
 */
function stripQuotes(text: string): string {
  // A lone quote character is not a quoted string.
  if (text.length < 2) return text

  for (const quote of ['"', "'"]) {
    if (text.startsWith(quote) && text.endsWith(quote)) {
      return text.slice(1, -1)
    }
  }
  return text
}