Cradicle Explorer

ast.ts
   1  /**
   2   * AST-based bash command analysis using tree-sitter.
   3   *
   4   * This module replaces the shell-quote + hand-rolled char-walker approach in
   5   * bashSecurity.ts / commands.ts. Instead of detecting parser differentials
   6   * one-by-one, we parse with tree-sitter-bash and walk the tree with an
   7   * EXPLICIT allowlist of node types. Any node type not in the allowlist causes
   8   * the entire command to be classified as 'too-complex', which means it goes
   9   * through the normal permission prompt flow.
  10   *
  11   * The key design property is FAIL-CLOSED: we never interpret structure we
  12   * don't understand. If tree-sitter produces a node we haven't explicitly
  13   * allowlisted, we refuse to extract argv and the caller must ask the user.
  14   *
  15   * This is NOT a sandbox. It does not prevent dangerous commands from running.
  16   * It answers exactly one question: "Can we produce a trustworthy argv[] for
  17   * each simple command in this string?" If yes, downstream code can match
  18   * argv[0] against permission rules and flag allowlists. If no, ask the user.
  19   */
  20  
  21  import { SHELL_KEYWORDS } from './bashParser.js'
  22  import type { Node } from './parser.js'
  23  import { PARSE_ABORTED, parseCommandRaw } from './parser.js'
  24  
/**
 * A single I/O redirection attached to a command.
 */
export type Redirect = {
  /** Redirect operator token, e.g. `>`, `>>`, `<<<`. */
  op: '>' | '>>' | '<' | '<<' | '>&' | '>|' | '<&' | '&>' | '&>>' | '<<<'
  /**
   * Redirect target as a string.
   * NOTE(review): presumably the resolved word following the operator —
   * confirm against the redirect walker.
   */
  target: string
  /**
   * Optional numeric file descriptor.
   * NOTE(review): presumably the explicit fd prefix (the `2` in `2>file`) —
   * confirm against the redirect walker.
   */
  fd?: number
}
  30  
/**
 * One simple command extracted from the parsed input, with its argument
 * vector, leading environment assignments and redirects separated out.
 */
export type SimpleCommand = {
  /** argv[0] is the command name, rest are arguments with quotes already resolved */
  argv: string[]
  /** Leading VAR=val assignments */
  envVars: { name: string; value: string }[]
  /** Output/input redirects */
  redirects: Redirect[]
  /** Original source span for this command (for UI display) */
  text: string
}
  41  
/**
 * Outcome of the security parse.
 *
 *  - `simple`: every command was reduced to a trustworthy argv; `commands`
 *    lists them (it is empty for an empty or whitespace-only input).
 *  - `too-complex`: the input uses something that cannot be statically
 *    analyzed, so the caller must ask the user. `reason` is a human-readable
 *    explanation; `nodeType`, when present, tags what triggered the
 *    rejection (e.g. 'declaration_command', or 'PARSE_ABORT' for an aborted
 *    parse).
 *  - `parse-unavailable`: no parse tree was available (tree-sitter WASM not
 *    loaded); the caller should fall back to conservative behavior.
 */
export type ParseForSecurityResult =
  | { kind: 'simple'; commands: SimpleCommand[] }
  | { kind: 'too-complex'; reason: string; nodeType?: string }
  | { kind: 'parse-unavailable' }
  46  
/**
 * Structural node types that represent composition of commands. We recurse
 * through these to find the leaf `command` nodes. `program` is the root;
 * `list` is `a && b || c`; `pipeline` is `a | b`; `redirected_statement`
 * wraps a command with its redirects. Semicolon-separated commands appear
 * as direct siblings under `program` (no wrapper node).
 *
 * Note: collectCommands dispatches `redirected_statement` to
 * walkRedirectedStatement BEFORE it consults this set, so within that
 * function the generic child-recursion path only runs for `program`,
 * `list` and `pipeline`.
 */
const STRUCTURAL_TYPES = new Set([
  'program',
  'list',
  'pipeline',
  'redirected_statement',
])
  60  
/**
 * Operator tokens that separate commands. These are leaf nodes that appear
 * between commands in `list`/`pipeline`/`program` and carry no payload.
 *
 * collectCommands skips every token in this set; for `||`, `|`, `|&` and
 * `&` it additionally resets the tracked-variable scope, while `&&`, `;`
 * and newline keep the scope flowing to the next command.
 */
const SEPARATOR_TYPES = new Set(['&&', '||', '|', ';', '&', '|&', '\n'])
  66  
  67  /**
  68   * Placeholder string used in outer argv when a $() is recursively extracted.
  69   * The actual $() output is runtime-determined; the inner command(s) are
  70   * checked against permission rules separately. Using a placeholder keeps
  71   * the outer argv clean (no multi-line heredoc bodies polluting path
  72   * extraction or triggering newline checks).
  73   */
  74  const CMDSUB_PLACEHOLDER = '__CMDSUB_OUTPUT__'
  75  
  76  /**
  77   * Placeholder for simple_expansion ($VAR) references to variables set earlier
  78   * in the same command via variable_assignment. Since we tracked the assignment,
  79   * we know the var exists and its value is either a static string or
  80   * __CMDSUB_OUTPUT__ (if set via $()). Either way, safe to substitute.
  81   */
  82  const VAR_PLACEHOLDER = '__TRACKED_VAR__'
  83  
  84  /**
  85   * All placeholder strings. Used for defense-in-depth: if a varScope value
  86   * contains ANY placeholder (exact or embedded), the value is NOT a pure
  87   * literal and cannot be trusted as a bare argument. Covers composites like
  88   * `VAR="prefix$(cmd)"` → `"prefix__CMDSUB_OUTPUT__"` — the substring check
  89   * catches these where exact-match Set.has() would miss.
  90   *
  91   * Also catches user-typed literals that collide with placeholder strings:
  92   * `VAR=__TRACKED_VAR__ && rm $VAR` — treated as non-literal (conservative).
  93   */
  94  function containsAnyPlaceholder(value: string): boolean {
  95    return value.includes(CMDSUB_PLACEHOLDER) || value.includes(VAR_PLACEHOLDER)
  96  }
  97  
/**
 * Unquoted $VAR in bash undergoes word-splitting (on $IFS: space/tab/NL)
 * and pathname expansion (glob matching on * ? [). Our argv stores a
 * single string — but at runtime bash may produce MULTIPLE args, or paths
 * matched by a glob. A value containing these metacharacters cannot be
 * trusted as a bare arg: `VAR="-rf /" && rm $VAR` → bash runs `rm -rf /`
 * (two args) but our argv would have `['rm', '-rf /']` (one arg). Similarly
 * `VAR="/etc/*" && cat $VAR` → bash expands to all /etc files.
 *
 * Inside double-quotes ("$VAR"), neither splitting nor globbing applies —
 * the value IS a single literal argument.
 *
 * The pattern matches a value containing any one of: space, tab, newline,
 * `*`, `?` or `[`.
 */
const BARE_VAR_UNSAFE_RE = /[ \t\n*?[]/
 111  
// stdbuf flag forms — hoisted from the wrapper-stripping while-loop
// (compiled once at module load instead of on every iteration).
// Exactly `-i`, `-o` or `-e` with nothing after it.
// NOTE(review): presumably the mode then follows as a separate arg — confirm.
const STDBUF_SHORT_SEP_RE = /^-[ioe]$/
// `-i`, `-o` or `-e` immediately followed by at least one more character.
const STDBUF_SHORT_FUSED_RE = /^-[ioe]./
// Long form beginning with `--input=`, `--output=` or `--error=`.
const STDBUF_LONG_RE = /^--(input|output|error)=/
 116  
/**
 * Known-safe environment variables that bash sets automatically. Their values
 * are controlled by the shell/OS, not arbitrary user input. Referencing these
 * via $VAR is safe — the expansion is deterministic and doesn't introduce
 * injection risk. Covers `$HOME`, `$PWD`, `$USER`, `$PATH`, `$SHELL`, etc.
 * Intentionally small: only vars that are set by bash/login and whose
 * values are paths, names or shell-generated numbers (not arbitrary content).
 */
const SAFE_ENV_VARS = new Set([
  'HOME', // user's home directory
  'PWD', // current working directory (bash maintains)
  'OLDPWD', // previous directory
  'USER', // current username
  'LOGNAME', // login name
  'SHELL', // user's login shell
  'PATH', // executable search path
  'HOSTNAME', // machine hostname
  'UID', // user id
  'EUID', // effective user id
  'PPID', // parent process id
  'RANDOM', // random number (bash builtin)
  'SECONDS', // seconds since shell start
  'LINENO', // current line number
  'TMPDIR', // temp directory
  // Special bash variables — always set, values are shell-controlled:
  'BASH_VERSION', // bash version string
  'BASHPID', // current bash process id
  'SHLVL', // shell nesting level
  'HISTFILE', // history file path
  'IFS', // field separator (NOTE: only safe INSIDE strings; as bare arg
  //       $IFS is the classic injection primitive and the insideString
  //       gate in resolveSimpleExpansion correctly blocks it)
])
 150  
/**
 * Special shell variables ($?, $$, $!, $#, $0, $-). tree-sitter uses
 * `special_variable_name` for these (not `variable_name`). Values are
 * shell-controlled: exit status, PIDs, parameter count, script name, option
 * flags. Safe to resolve ONLY inside strings (same rationale as
 * SAFE_ENV_VARS — as bare args their value IS the argument).
 *
 * Of the positional parameters only `0` is listed; `1`-`9` are not members
 * of this set.
 *
 * SECURITY: '@' and '*' are NOT in this set. Inside "...", they expand to
 * the positional params — which are EMPTY in a fresh BashTool shell (how we
 * always spawn). Returning VAR_PLACEHOLDER would lie: `git "push$*"` gives
 * argv ['git','push__TRACKED_VAR__'] while bash passes ['git','push']. Deny
 * rule Bash(git push:*) fails on both .text (raw `$*`) AND rebuilt argv
 * (placeholder). With them removed, resolveSimpleExpansion falls through to
 * tooComplex for `$*` / `$@`. `echo "args: $*"` becomes too-complex —
 * acceptable (rare in BashTool usage; `"$@"` even rarer).
 */
const SPECIAL_VAR_NAMES = new Set([
  '?', // exit status of last command
  '$', // current shell PID
  '!', // last background PID
  '#', // number of positional params
  '0', // script name
  '-', // shell option flags
])
 175  
 176  /**
 177   * Node types that mean "this command cannot be statically analyzed." These
 178   * either execute arbitrary code (substitutions, subshells, control flow) or
 179   * expand to values we can't determine statically (parameter/arithmetic
 180   * expansion, brace expressions).
 181   *
 182   * This set is not exhaustive — it documents KNOWN dangerous types. The real
 183   * safety property is the allowlist in walkArgument/walkCommand: any type NOT
 184   * explicitly handled there also triggers too-complex.
 185   */
 186  const DANGEROUS_TYPES = new Set([
 187    'command_substitution',
 188    'process_substitution',
 189    'expansion',
 190    'simple_expansion',
 191    'brace_expression',
 192    'subshell',
 193    'compound_statement',
 194    'for_statement',
 195    'while_statement',
 196    'until_statement',
 197    'if_statement',
 198    'case_statement',
 199    'function_definition',
 200    'test_command',
 201    'ansi_c_string',
 202    'translated_string',
 203    'herestring_redirect',
 204    'heredoc_redirect',
 205  ])
 206  
 207  /**
 208   * Numeric IDs for analytics (logEvent doesn't accept strings). Index into
 209   * DANGEROUS_TYPES. Append new entries at the end to keep IDs stable.
 210   * 0 = unknown/other, -1 = ERROR (parse failure), -2 = pre-check.
 211   */
 212  const DANGEROUS_TYPE_IDS = [...DANGEROUS_TYPES]
 213  export function nodeTypeId(nodeType: string | undefined): number {
 214    if (!nodeType) return -2
 215    if (nodeType === 'ERROR') return -1
 216    const i = DANGEROUS_TYPE_IDS.indexOf(nodeType)
 217    return i >= 0 ? i + 1 : 0
 218  }
 219  
/**
 * Redirect operator tokens → canonical operator. tree-sitter produces these
 * as child nodes of `file_redirect`.
 *
 * Every entry maps a token to itself. `<<` is part of Redirect['op'] but has
 * no entry here.
 * NOTE(review): presumably because heredocs are not `file_redirect` nodes —
 * confirm against the redirect walker.
 */
const REDIRECT_OPS: Record<string, Redirect['op']> = {
  '>': '>',
  '>>': '>>',
  '<': '<',
  '>&': '>&',
  '<&': '<&',
  '>|': '>|',
  '&>': '&>',
  '&>>': '&>>',
  '<<<': '<<<',
}
 235  
/**
 * Brace expansion pattern: {a,b} or {a..b}. Must have , or .. inside
 * braces. We deliberately do NOT try to determine whether the opening brace
 * is backslash-escaped: tree-sitter doesn't unescape backslashes, so
 * distinguishing `\{a,b}` (escaped, literal) from `\\{a,b}` (literal
 * backslash + expansion) would require reimplementing bash quote removal.
 * Reject both — the escaped-brace case is rare and trivially rewritten
 * with single quotes.
 *
 * The text between the braces may not contain whitespace or nested braces,
 * so `{ a, b }` and `{a,{b,c}}` as a whole do not match (the inner `{b,c}`
 * still does).
 */
const BRACE_EXPANSION_RE = /\{[^{}\s]*(,|\.\.)[^{}\s]*\}/
 246  
/**
 * Control characters that bash silently drops but confuse static analysis.
 * Includes CR (0x0D): tree-sitter treats CR as a word separator but bash's
 * default IFS does not include CR, so tree-sitter and bash disagree on
 * word boundaries.
 *
 * Matches 0x00-0x08, 0x0B-0x1F and DEL (0x7F). Tab (0x09) and newline
 * (0x0A) are deliberately outside the ranges.
 */
// eslint-disable-next-line no-control-regex
const CONTROL_CHAR_RE = /[\x00-\x08\x0B-\x1F\x7F]/
 255  
/**
 * Unicode whitespace beyond ASCII. These render invisibly (or as regular
 * spaces) in terminals so a user reviewing the command can't see them, but
 * bash treats them as literal word characters. Blocks NBSP, zero-width
 * spaces, line/paragraph separators, BOM.
 *
 * Code points matched: U+00A0, U+1680, U+2000 through U+200B, U+2028,
 * U+2029, U+202F, U+205F, U+3000 and U+FEFF.
 */
const UNICODE_WHITESPACE_RE =
  /[\u00A0\u1680\u2000-\u200B\u2028\u2029\u202F\u205F\u3000\uFEFF]/
 264  
/**
 * Backslash immediately before whitespace. bash treats `\ ` as a literal
 * space inside the current word, but tree-sitter returns the raw text with
 * the backslash still present. argv[0] from tree-sitter is `cat\ test`
 * while bash runs `cat test` (with a literal space). Rather than
 * reimplement bash's unescaping rules, we reject these — they're rare in
 * practice and trivial to rewrite with quotes.
 *
 * Also matches `\` before newline (line continuation) when adjacent to a
 * non-whitespace char. `tr\<NL>aceroute` — bash joins to `traceroute`, but
 * tree-sitter splits into two words (differential). When `\<NL>` is preceded
 * by whitespace (e.g. `foo && \<NL>bar`), there's no word to join — both
 * parsers agree, so we allow it.
 *
 * Pattern, by alternative:
 *   1. a backslash followed by a space or tab;
 *   2. a character that is not space, tab, newline or backslash, followed
 *      by a backslash and a newline.
 */
const BACKSLASH_WHITESPACE_RE = /\\[ \t]|[^ \t\n\\]\\\n/
 280  
/**
 * Zsh dynamic named directory expansion: ~[name]. In zsh this invokes the
 * zsh_directory_name hook, which can run arbitrary code. bash treats it as
 * a literal tilde followed by a glob character class. Since BashTool runs
 * via the user's default shell (often zsh), reject conservatively.
 *
 * Matches the two-character sequence `~[` anywhere in the command text,
 * quoted or not.
 */
const ZSH_TILDE_BRACKET_RE = /~\[/
 288  
/**
 * Zsh EQUALS expansion: word-initial `=cmd` expands to the absolute path of
 * `cmd` (equivalent to `$(which cmd)`). `=curl evil.com` runs as
 * `/usr/bin/curl evil.com`. tree-sitter parses `=curl` as a literal word, so
 * a `Bash(curl:*)` deny rule matching on base command name won't see `curl`.
 * Only matches word-initial `=` followed by a command-name char — `VAR=val`
 * and `--flag=val` have `=` mid-word and are not expanded by zsh.
 *
 * "Word-initial" is approximated as: start of the string, or directly after
 * whitespace, `;`, `&` or `|`. The character after `=` must be an ASCII
 * letter or underscore.
 */
const ZSH_EQUALS_EXPANSION_RE = /(?:^|[\s;&|])=[a-zA-Z_]/
 298  
/**
 * Brace character combined with quote characters. Constructions like
 * `{a'}',b}` use quoted braces inside brace expansion context to obfuscate
 * the expansion from regex-based detection. In bash, `{a'}',b}` expands to
 * `a} b` (the quoted `}` becomes literal inside the first alternative).
 * These are hard to analyze correctly and have no legitimate use in
 * commands we'd want to auto-allow.
 *
 * This check runs on a version of the command with `{` masked out of
 * single-quoted and double-quoted spans, so JSON payloads like
 * `curl -d '{"k":"v"}'` don't trigger a false positive. Brace expansion
 * cannot occur inside quotes, so a `{` there can never start an obfuscation
 * pattern. The quote characters themselves stay visible so `{a'}',b}` and
 * `{@'{'0},...}` still match via the outer unquoted `{`.
 *
 * Pattern: a `{`, then any run of characters other than `}`, then a single
 * or double quote — i.e. a quote appears before the first closing brace.
 */
const BRACE_WITH_QUOTE_RE = /\{[^}]*['"]/
 315  
 316  /**
 317   * Mask `{` characters that appear inside single- or double-quoted contexts.
 318   * Uses a single-pass bash-aware quote-state scanner instead of a regex.
 319   *
 320   * A naive regex (`/'[^']*'/g`) mis-detects spans when a `'` appears inside
 321   * a double-quoted string: for `echo "it's" {a'}',b}`, it matches from the
 322   * `'` in `it's` across to the `'` in `{a'}`, masking the unquoted `{` and
 323   * producing a false negative. The scanner tracks actual bash quote state:
 324   * `'` toggles single-quote only in unquoted context; `"` toggles
 325   * double-quote only outside single quotes; `\` escapes the next char in
 326   * unquoted context and escapes `"` / `\\` inside double quotes.
 327   *
 328   * Brace expansion is impossible in both quote contexts, so masking `{` in
 329   * either is safe. Secondary defense: BRACE_EXPANSION_RE in walkArgument.
 330   */
 331  function maskBracesInQuotedContexts(cmd: string): string {
 332    // Fast path: no `{` → nothing to mask. Skips the char-by-char scan for
 333    // the >90% of commands with no braces (`ls -la`, `git status`, etc).
 334    if (!cmd.includes('{')) return cmd
 335    const out: string[] = []
 336    let inSingle = false
 337    let inDouble = false
 338    let i = 0
 339    while (i < cmd.length) {
 340      const c = cmd[i]!
 341      if (inSingle) {
 342        // Bash single quotes: no escapes, `'` always terminates.
 343        if (c === "'") inSingle = false
 344        out.push(c === '{' ? ' ' : c)
 345        i++
 346      } else if (inDouble) {
 347        // Bash double quotes: `\` escapes `"` and `\` (also `$`, backtick,
 348        // newline — but those don't affect quote state so we let them pass).
 349        if (c === '\\' && (cmd[i + 1] === '"' || cmd[i + 1] === '\\')) {
 350          out.push(c, cmd[i + 1]!)
 351          i += 2
 352        } else {
 353          if (c === '"') inDouble = false
 354          out.push(c === '{' ? ' ' : c)
 355          i++
 356        }
 357      } else {
 358        // Unquoted: `\` escapes any next char.
 359        if (c === '\\' && i + 1 < cmd.length) {
 360          out.push(c, cmd[i + 1]!)
 361          i += 2
 362        } else {
 363          if (c === "'") inSingle = true
 364          else if (c === '"') inDouble = true
 365          out.push(c)
 366          i++
 367        }
 368      }
 369    }
 370    return out.join('')
 371  }
 372  
/**
 * The `$` character (char code 0x24), built at runtime instead of being
 * written as a literal.
 * NOTE(review): presumably this keeps a bare `$` out of the source text
 * (e.g. to avoid accidental template/bundler interpolation) — confirm.
 */
const DOLLAR = String.fromCharCode(0x24)
 374  
 375  /**
 376   * Parse a bash command string and extract a flat list of simple commands.
 377   * Returns 'too-complex' if the command uses any shell feature we can't
 378   * statically analyze. Returns 'parse-unavailable' if tree-sitter WASM isn't
 379   * loaded — caller should fall back to conservative behavior.
 380   */
 381  export async function parseForSecurity(
 382    cmd: string,
 383  ): Promise<ParseForSecurityResult> {
 384    // parseCommandRaw('') returns null (falsy check), so short-circuit here.
 385    // Don't use .trim() — it strips Unicode whitespace (\u00a0 etc.) which the
 386    // pre-checks in parseForSecurityFromAst need to see and reject.
 387    if (cmd === '') return { kind: 'simple', commands: [] }
 388    const root = await parseCommandRaw(cmd)
 389    return root === null
 390      ? { kind: 'parse-unavailable' }
 391      : parseForSecurityFromAst(cmd, root)
 392  }
 393  
 394  /**
 395   * Same as parseForSecurity but takes a pre-parsed AST root so callers that
 396   * need the tree for other purposes can parse once and share. Pre-checks
 397   * still run on `cmd` — they catch tree-sitter/bash differentials that a
 398   * successful parse doesn't.
 399   */
 400  export function parseForSecurityFromAst(
 401    cmd: string,
 402    root: Node | typeof PARSE_ABORTED,
 403  ): ParseForSecurityResult {
 404    // Pre-checks: characters that cause tree-sitter and bash to disagree on
 405    // word boundaries. These run before tree-sitter because they're the known
 406    // tree-sitter/bash differentials. Everything after this point trusts
 407    // tree-sitter's tokenization.
 408    if (CONTROL_CHAR_RE.test(cmd)) {
 409      return { kind: 'too-complex', reason: 'Contains control characters' }
 410    }
 411    if (UNICODE_WHITESPACE_RE.test(cmd)) {
 412      return { kind: 'too-complex', reason: 'Contains Unicode whitespace' }
 413    }
 414    if (BACKSLASH_WHITESPACE_RE.test(cmd)) {
 415      return {
 416        kind: 'too-complex',
 417        reason: 'Contains backslash-escaped whitespace',
 418      }
 419    }
 420    if (ZSH_TILDE_BRACKET_RE.test(cmd)) {
 421      return {
 422        kind: 'too-complex',
 423        reason: 'Contains zsh ~[ dynamic directory syntax',
 424      }
 425    }
 426    if (ZSH_EQUALS_EXPANSION_RE.test(cmd)) {
 427      return {
 428        kind: 'too-complex',
 429        reason: 'Contains zsh =cmd equals expansion',
 430      }
 431    }
 432    if (BRACE_WITH_QUOTE_RE.test(maskBracesInQuotedContexts(cmd))) {
 433      return {
 434        kind: 'too-complex',
 435        reason: 'Contains brace with quote character (expansion obfuscation)',
 436      }
 437    }
 438  
 439    const trimmed = cmd.trim()
 440    if (trimmed === '') {
 441      return { kind: 'simple', commands: [] }
 442    }
 443  
 444    if (root === PARSE_ABORTED) {
 445      // SECURITY: module loaded but parse aborted (timeout / node budget /
 446      // panic). Adversarially triggerable — `(( a[0][0]... ))` with ~2800
 447      // subscripts hits PARSE_TIMEOUT_MICROS under the 10K length limit.
 448      // Previously indistinguishable from module-not-loaded → routed to
 449      // legacy (parse-unavailable), which lacks EVAL_LIKE_BUILTINS — `trap`,
 450      // `enable`, `hash` leaked with Bash(*). Fail closed: too-complex → ask.
 451      return {
 452        kind: 'too-complex',
 453        reason:
 454          'Parser aborted (timeout or resource limit) — possible adversarial input',
 455        nodeType: 'PARSE_ABORT',
 456      }
 457    }
 458  
 459    return walkProgram(root)
 460  }
 461  
 462  function walkProgram(root: Node): ParseForSecurityResult {
 463    // ERROR-node check folded into collectCommands — any unhandled node type
 464    // (including ERROR) falls through to tooComplex() in the default branch.
 465    // Avoids a separate full-tree walk for error detection.
 466    const commands: SimpleCommand[] = []
 467    // Track variables assigned earlier in the same command. When a
 468    // simple_expansion ($VAR) references a tracked var, we can substitute
 469    // a placeholder instead of returning too-complex. Enables patterns like
 470    // `NOW=$(date) && jq --arg now "$NOW" ...` — $NOW is known to be the
 471    // $(date) output (already extracted as inner command).
 472    const varScope = new Map<string, string>()
 473    const err = collectCommands(root, commands, varScope)
 474    if (err) return err
 475    return { kind: 'simple', commands }
 476  }
 477  
 478  /**
 479   * Recursively collect leaf `command` nodes from a structural wrapper node.
 480   * Returns an error result on any disallowed node type, or null on success.
 481   */
 482  function collectCommands(
 483    node: Node,
 484    commands: SimpleCommand[],
 485    varScope: Map<string, string>,
 486  ): ParseForSecurityResult | null {
 487    if (node.type === 'command') {
 488      // Pass `commands` as the innerCommands accumulator — any $() extracted
 489      // during walkCommand gets appended alongside the outer command.
 490      const result = walkCommand(node, [], commands, varScope)
 491      if (result.kind !== 'simple') return result
 492      commands.push(...result.commands)
 493      return null
 494    }
 495  
 496    if (node.type === 'redirected_statement') {
 497      return walkRedirectedStatement(node, commands, varScope)
 498    }
 499  
 500    if (node.type === 'comment') {
 501      return null
 502    }
 503  
 504    if (STRUCTURAL_TYPES.has(node.type)) {
 505      // SECURITY: `||`, `|`, `|&`, `&` must NOT carry varScope linearly. In bash:
 506      //   `||` RHS runs conditionally → vars set there MAY not be set
 507      //   `|`/`|&` stages run in subshells → vars set there are NEVER visible after
 508      //   `&` LHS runs in a background subshell → same as above
 509      // Flag-omission attack: `true || FLAG=--dry-run && cmd $FLAG` — bash skips
 510      // the `||` RHS (FLAG unset → $FLAG empty), runs `cmd` WITHOUT --dry-run.
 511      // With linear scope, our argv has ['cmd','--dry-run'] → looks SAFE → bypass.
 512      //
 513      // Fix: snapshot incoming scope at entry. After these separators, reset to
 514      // the snapshot — vars set in clauses between separators don't leak. `scope`
 515      // for clauses BETWEEN `&&`/`;` chains shares state (common `VAR=x && cmd
 516      // $VAR`). `scope` crosses `||`/`|`/`&` as the pre-structure snapshot only.
 517      //
 518      // `&&` and `;` DO carry scope: `VAR=x && cmd $VAR` is sequential, VAR is set.
 519      //
 520      // NOTE: `scope` and `varScope` diverge after the first `||`/`|`/`&`. The
 521      // caller's varScope is only mutated for the `&&`/`;` prefix — this is
 522      // conservative (vars set in `A && B | C && D` leak A+B into caller, not
 523      // C+D) but safe.
 524      //
 525      // Efficiency: snapshot is only needed if we hit `||`/`|`/`|&`/`&`. For
 526      // the dominant case (`ls`, `git status` — no such separators), skip the
 527      // Map alloc via a cheap pre-scan. For `pipeline`, node.type already tells
 528      // us stages are subshells — copy once at entry, no snapshot needed (each
 529      // reset uses the entry copy pattern via varScope, which is untouched).
 530      const isPipeline = node.type === 'pipeline'
 531      let needsSnapshot = false
 532      if (!isPipeline) {
 533        for (const c of node.children) {
 534          if (c && (c.type === '||' || c.type === '&')) {
 535            needsSnapshot = true
 536            break
 537          }
 538        }
 539      }
 540      const snapshot = needsSnapshot ? new Map(varScope) : null
 541      // For `pipeline`, ALL stages run in subshells — start with a copy so
 542      // nothing mutates caller's scope. For `list`/`program`, the `&&`/`;`
 543      // chain mutates caller's scope (sequential); fork only on `||`/`&`.
 544      let scope = isPipeline ? new Map(varScope) : varScope
 545      for (const child of node.children) {
 546        if (!child) continue
 547        if (SEPARATOR_TYPES.has(child.type)) {
 548          if (
 549            child.type === '||' ||
 550            child.type === '|' ||
 551            child.type === '|&' ||
 552            child.type === '&'
 553          ) {
 554            // For pipeline: varScope is untouched (we started with a copy).
 555            // For list/program: snapshot is non-null (pre-scan set it).
 556            // `|`/`|&` only appear under `pipeline` nodes; `||`/`&` under list.
 557            scope = new Map(snapshot ?? varScope)
 558          }
 559          continue
 560        }
 561        const err = collectCommands(child, commands, scope)
 562        if (err) return err
 563      }
 564      return null
 565    }
 566  
 567    if (node.type === 'negated_command') {
 568      // `! cmd` inverts exit code only — doesn't execute code or affect
 569      // argv. Recurse into the wrapped command. Common in CI: `! grep err`,
 570      // `! test -f lock`, `! git diff --quiet`.
 571      for (const child of node.children) {
 572        if (!child) continue
 573        if (child.type === '!') continue
 574        return collectCommands(child, commands, varScope)
 575      }
 576      return null
 577    }
 578  
 579    if (node.type === 'declaration_command') {
 580      // `export`/`local`/`readonly`/`declare`/`typeset`. tree-sitter emits
 581      // these as declaration_command, not command, so they previously fell
 582      // through to tooComplex. Values are validated via walkVariableAssignment:
 583      // `$()` in the value is recursively extracted (inner command pushed to
 584      // commands[], outer argv gets CMDSUB_PLACEHOLDER); other disallowed
 585      // expansions still reject via walkArgument. argv[0] is the builtin name so
 586      // `Bash(export:*)` rules match.
 587      const argv: string[] = []
 588      for (const child of node.children) {
 589        if (!child) continue
 590        switch (child.type) {
 591          case 'export':
 592          case 'local':
 593          case 'readonly':
 594          case 'declare':
 595          case 'typeset':
 596            argv.push(child.text)
 597            break
 598          case 'word':
 599          case 'number':
 600          case 'raw_string':
 601          case 'string':
 602          case 'concatenation': {
 603            // Flags (`declare -r`), quoted names (`export "FOO=bar"`), numbers
 604            // (`declare -i 42`). Mirrors walkCommand's argv handling — before
 605            // this, `export "FOO=bar"` hit tooComplex on the `string` child.
 606            // walkArgument validates each (expansions still reject).
 607            const arg = walkArgument(child, commands, varScope)
 608            if (typeof arg !== 'string') return arg
 609            // SECURITY: declare/typeset/local flags that change assignment
 610            // semantics break our static model. -n (nameref): `declare -n X=Y`
 611            // then `$X` dereferences to $Y's VALUE — varScope stores 'Y'
 612            // (target NAME), argv[0] shows 'Y' while bash runs whatever $Y
 613            // holds. -i (integer): `declare -i X='a[$(cmd)]'` arithmetically
 614            // evaluates the RHS at assignment time, running $(cmd) even from
 615            // a single-quoted raw_string (same primitive walkArithmetic
 616            // guards in $((…))). -a/-A (array): subscript arithmetic on
 617            // assignment. -r/-x/-g/-p/-f/-F are inert. Check the resolved
 618            // arg (not child.text) so `\-n` and quoted `-n` are caught.
 619            // Scope to declare/typeset/local only: `export -n` means "remove
 620            // export attribute" (not nameref), and export/readonly don't
 621            // accept -i; readonly -a/-A rejects subscripted args as invalid
 622            // identifiers so subscript-arith doesn't fire.
 623            if (
 624              (argv[0] === 'declare' ||
 625                argv[0] === 'typeset' ||
 626                argv[0] === 'local') &&
 627              /^-[a-zA-Z]*[niaA]/.test(arg)
 628            ) {
 629              return {
 630                kind: 'too-complex',
 631                reason: `declare flag ${arg} changes assignment semantics (nameref/integer/array)`,
 632                nodeType: 'declaration_command',
 633              }
 634            }
 635            // SECURITY: bare positional assignment with a subscript also
 636            // evaluates — no -a/-i flag needed. `declare 'x[$(id)]=val'`
 637            // implicitly creates an array element, arithmetically evaluating
 638            // the subscript and running $(id). tree-sitter delivers the
 639            // single-quoted form as a raw_string leaf so walkArgument sees
 640            // only the literal text. Scoped to declare/typeset/local:
 641            // export/readonly reject `[` in identifiers before eval.
 642            if (
 643              (argv[0] === 'declare' ||
 644                argv[0] === 'typeset' ||
 645                argv[0] === 'local') &&
 646              arg[0] !== '-' &&
 647              /^[^=]*\[/.test(arg)
 648            ) {
 649              return {
 650                kind: 'too-complex',
 651                reason: `declare positional '${arg}' contains array subscript — bash evaluates $(cmd) in subscripts`,
 652                nodeType: 'declaration_command',
 653              }
 654            }
 655            argv.push(arg)
 656            break
 657          }
 658          case 'variable_assignment': {
 659            const ev = walkVariableAssignment(child, commands, varScope)
 660            if ('kind' in ev) return ev
 661            // export/declare assignments populate the scope so later $VAR refs resolve.
 662            applyVarToScope(varScope, ev)
 663            argv.push(`${ev.name}=${ev.value}`)
 664            break
 665          }
 666          case 'variable_name':
 667            // `export FOO` — bare name, no assignment.
 668            argv.push(child.text)
 669            break
 670          default:
 671            return tooComplex(child)
 672        }
 673      }
 674      commands.push({ argv, envVars: [], redirects: [], text: node.text })
 675      return null
 676    }
 677  
 678    if (node.type === 'variable_assignment') {
 679      // Bare `VAR=value` at statement level (not a command env prefix).
 680      // Sets a shell variable — no code execution, no filesystem I/O.
 681      // The value is validated via walkVariableAssignment → walkArgument,
 682      // so `VAR=$(evil)` still recursively extracts/rejects based on the
 683      // inner command. Does NOT push to commands — a bare assignment needs
 684      // no permission rule (it's inert). Common pattern: `VAR=x && cmd`
 685      // where cmd references $VAR. ~35% of too-complex in top-5k ant cmds.
 686      const ev = walkVariableAssignment(node, commands, varScope)
 687      if ('kind' in ev) return ev
 688      // Populate scope so later `$VAR` references resolve.
 689      applyVarToScope(varScope, ev)
 690      return null
 691    }
 692  
 693    if (node.type === 'for_statement') {
 694      // `for VAR in WORD...; do BODY; done` — iterate BODY once per word.
 695      // Body commands extracted once; every iteration runs the same commands.
 696      //
 697      // SECURITY: Loop var is ALWAYS treated as unknown-value (VAR_PLACEHOLDER).
 698      // Even "static" iteration words can be:
 699      //  - Absolute paths: `for i in /etc/passwd; do rm $i; done` — body argv
 700      //    would have placeholder, path validation never sees /etc/passwd.
 701      //  - Globs: `for i in /etc/*; do rm $i; done` — `/etc/*` is a static word
 702      //    at parse time but bash expands it at runtime.
 703      //  - Flags: `for i in -rf /; do rm $i; done` — flag smuggling.
 704      //
 705      // VAR_PLACEHOLDER means bare `$i` in body → too-complex. Only
 706      // string-embedding (`echo "item: $i"`) stays simple. This reverts some
 707      // of the too-complex→simple rescues in the original PR — each one was a
 708      // potential path-validation bypass.
 709      let loopVar: string | null = null
 710      let doGroup: Node | null = null
 711      for (const child of node.children) {
 712        if (!child) continue
 713        if (child.type === 'variable_name') {
 714          loopVar = child.text
 715        } else if (child.type === 'do_group') {
 716          doGroup = child
 717        } else if (
 718          child.type === 'for' ||
 719          child.type === 'in' ||
 720          child.type === 'select' ||
 721          child.type === ';'
 722        ) {
 723          continue // structural tokens
 724        } else if (child.type === 'command_substitution') {
 725          // `for i in $(seq 1 3)` — inner cmd IS extracted and rule-checked.
 726          const err = collectCommandSubstitution(child, commands, varScope)
 727          if (err) return err
 728        } else {
 729          // Iteration values — validated via walkArgument. Value discarded:
 730          // body argv gets VAR_PLACEHOLDER regardless of the iteration words,
 731          // and bare `$i` in body → too-complex (see SECURITY comment above).
 732          // We still validate to reject e.g. `for i in $(cmd); do ...; done`
 733          // where the iteration word itself is a disallowed expansion.
 734          const arg = walkArgument(child, commands, varScope)
 735          if (typeof arg !== 'string') return arg
 736        }
 737      }
 738      if (loopVar === null || doGroup === null) return tooComplex(node)
 739      // SECURITY: `for PS4 in '$(id)'; do set -x; :; done` sets PS4 directly
 740      // via varScope.set below — walkVariableAssignment's PS4/IFS checks never
 741      // fire. Trace-time RCE (PS4) or word-split bypass (IFS). No legit use.
 742      if (loopVar === 'PS4' || loopVar === 'IFS') {
 743        return {
 744          kind: 'too-complex',
 745          reason: `${loopVar} as loop variable bypasses assignment validation`,
 746          nodeType: 'for_statement',
 747        }
 748      }
 749      // SECURITY: Body uses a scope COPY — vars assigned inside the loop
 750      // body don't leak to commands after `done`. The loop var itself is
 751      // set in the REAL scope (bash semantics: $i still set after loop)
 752      // and copied into the body scope. ALWAYS VAR_PLACEHOLDER — see above.
 753      varScope.set(loopVar, VAR_PLACEHOLDER)
 754      const bodyScope = new Map(varScope)
 755      for (const c of doGroup.children) {
 756        if (!c) continue
 757        if (c.type === 'do' || c.type === 'done' || c.type === ';') continue
 758        const err = collectCommands(c, commands, bodyScope)
 759        if (err) return err
 760      }
 761      return null
 762    }
 763  
 764    if (node.type === 'if_statement' || node.type === 'while_statement') {
 765      // `if COND; then BODY; [elif...; else...;] fi`
 766      // `while COND; do BODY; done`
 767      // Extract condition command(s) + all branch/body commands. All get
 768      // checked against permission rules. `while read VAR` tracks VAR so
 769      // body can reference $VAR.
 770      //
 771      // SECURITY: Branch bodies use scope COPIES — vars assigned inside a
 772      // conditional branch (which may not execute) must not leak to commands
 773      // after fi/done. `if false; then T=safe; fi && rm $T` must reject $T.
 774      // Condition commands use the REAL varScope (they always run for the
 775      // check, so assignments there are unconditional — e.g., `while read V`
 776      // tracking must persist to the body copy).
 777      //
 778      // tree-sitter if_statement children: if, COND..., then, THEN-BODY...,
 779      // [elif_clause...], [else_clause], fi. We distinguish condition from
 780      // then-body by tracking whether we've seen the `then` token.
 781      let seenThen = false
 782      for (const child of node.children) {
 783        if (!child) continue
 784        if (
 785          child.type === 'if' ||
 786          child.type === 'fi' ||
 787          child.type === 'else' ||
 788          child.type === 'elif' ||
 789          child.type === 'while' ||
 790          child.type === 'until' ||
 791          child.type === ';'
 792        ) {
 793          continue
 794        }
 795        if (child.type === 'then') {
 796          seenThen = true
 797          continue
 798        }
 799        if (child.type === 'do_group') {
 800          // while body: recurse with scope COPY (body assignments don't leak
 801          // past done). The COPY contains any `read VAR` tracking from the
 802          // condition (already in real varScope at this point).
 803          const bodyScope = new Map(varScope)
 804          for (const c of child.children) {
 805            if (!c) continue
 806            if (c.type === 'do' || c.type === 'done' || c.type === ';') continue
 807            const err = collectCommands(c, commands, bodyScope)
 808            if (err) return err
 809          }
 810          continue
 811        }
 812        if (child.type === 'elif_clause' || child.type === 'else_clause') {
 813          // elif_clause: elif, cond, ;, then, body... / else_clause: else, body...
 814          // Scope COPY — elif/else branch assignments don't leak past fi.
 815          const branchScope = new Map(varScope)
 816          for (const c of child.children) {
 817            if (!c) continue
 818            if (
 819              c.type === 'elif' ||
 820              c.type === 'else' ||
 821              c.type === 'then' ||
 822              c.type === ';'
 823            ) {
 824              continue
 825            }
 826            const err = collectCommands(c, commands, branchScope)
 827            if (err) return err
 828          }
 829          continue
 830        }
 831        // Condition (seenThen=false) or then-body (seenThen=true).
 832        // Condition uses REAL varScope (always runs). Then-body uses a COPY.
 833        // Special-case `while read VAR`: after condition `read VAR` is
 834        // collected, track VAR in the REAL scope so the body COPY inherits it.
 835        const targetScope = seenThen ? new Map(varScope) : varScope
 836        const before = commands.length
 837        const err = collectCommands(child, commands, targetScope)
 838        if (err) return err
 839        // If condition included `read VAR...`, track vars in REAL scope.
 840        // read var value is UNKNOWN (stdin input) → use VAR_PLACEHOLDER
 841        // (unknown-value sentinel, string-only).
 842        if (!seenThen) {
 843          for (let i = before; i < commands.length; i++) {
 844            const c = commands[i]
 845            if (c?.argv[0] === 'read') {
 846              for (const a of c.argv.slice(1)) {
 847                // Skip flags (-r, -d, etc.); track bare identifier args as var names.
 848                if (!a.startsWith('-') && /^[A-Za-z_][A-Za-z0-9_]*$/.test(a)) {
 849                  // SECURITY: commands[] is a flat accumulator. `true || read
 850                  // VAR` in the condition: the list handler correctly uses a
 851                  // scope COPY for the ||-RHS (may not run), but `read VAR`
 852                  // IS still pushed to commands[] — we can't tell it was
 853                  // scope-isolated from here. Same for `echo | read VAR`
 854                  // (pipeline, subshell in bash) and `(read VAR)` (subshell).
 855                  // Overwriting a tracked literal with VAR_PLACEHOLDER hides
 856                  // path traversal: `VAR=../../etc/passwd && if true || read
 857                  // VAR; then cat "/tmp/$VAR"; fi` — parser would see
 858                  // /tmp/__TRACKED_VAR__, bash reads /etc/passwd. Fail closed
 859                  // when a tracked literal would be overwritten. Safe case
 860                  // (no prior value or already a placeholder) → proceed.
 861                  const existing = varScope.get(a)
 862                  if (
 863                    existing !== undefined &&
 864                    !containsAnyPlaceholder(existing)
 865                  ) {
 866                    return {
 867                      kind: 'too-complex',
 868                      reason: `'read ${a}' in condition may not execute (||/pipeline/subshell); cannot prove it overwrites tracked literal '${existing}'`,
 869                      nodeType: 'if_statement',
 870                    }
 871                  }
 872                  varScope.set(a, VAR_PLACEHOLDER)
 873                }
 874              }
 875            }
 876          }
 877        }
 878      }
 879      return null
 880    }
 881  
 882    if (node.type === 'subshell') {
 883      // `(cmd1; cmd2)` — run commands in a subshell. Inner commands ARE
 884      // executed, so extract them for permission checking. Subshell has
 885      // isolated scope: vars set inside don't leak out. Use a COPY of
 886      // varScope (outer vars visible, inner changes discarded).
 887      const innerScope = new Map(varScope)
 888      for (const child of node.children) {
 889        if (!child) continue
 890        if (child.type === '(' || child.type === ')') continue
 891        const err = collectCommands(child, commands, innerScope)
 892        if (err) return err
 893      }
 894      return null
 895    }
 896  
 897    if (node.type === 'test_command') {
 898      // `[[ EXPR ]]` or `[ EXPR ]` — conditional test. Evaluates to true/false
 899      // based on file tests (-f, -d), string comparisons (==, !=), etc.
 900      // No code execution (no command_substitution inside — that would be a
 901      // child and we'd recurse into it via walkArgument and reject it).
 902      // Push as a synthetic command with argv[0]='[[' so permission rules
 903      // can match — `Bash([[ :*)` would be unusual but legal.
 904      // Walk arguments to validate (no cmdsub/expansion inside operands).
 905      const argv: string[] = ['[[']
 906      for (const child of node.children) {
 907        if (!child) continue
 908        if (child.type === '[[' || child.type === ']]') continue
 909        if (child.type === '[' || child.type === ']') continue
 910        // Recurse into test expression structure: unary_expression,
 911        // binary_expression, parenthesized_expression, negated_expression.
 912        // The leaves are test_operator (-f, -d, ==) and operand words.
 913        const err = walkTestExpr(child, argv, commands, varScope)
 914        if (err) return err
 915      }
 916      commands.push({ argv, envVars: [], redirects: [], text: node.text })
 917      return null
 918    }
 919  
 920    if (node.type === 'unset_command') {
 921      // `unset FOO BAR`, `unset -f func`. Safe: only removes shell
 922      // variables/functions from the current shell — no code execution, no
 923      // filesystem I/O. tree-sitter emits a dedicated node type so it
 924      // previously fell through to tooComplex. Children: `unset` keyword,
 925      // `variable_name` for each name, `word` for flags like `-f`/`-v`.
 926      const argv: string[] = []
 927      for (const child of node.children) {
 928        if (!child) continue
 929        switch (child.type) {
 930          case 'unset':
 931            argv.push(child.text)
 932            break
 933          case 'variable_name':
 934            argv.push(child.text)
 935            // SECURITY: unset removes the var from bash's scope. Remove from
 936            // varScope so subsequent `$VAR` references correctly reject.
 937            // `VAR=safe && unset VAR && rm $VAR` must NOT resolve $VAR.
 938            varScope.delete(child.text)
 939            break
 940          case 'word': {
 941            const arg = walkArgument(child, commands, varScope)
 942            if (typeof arg !== 'string') return arg
 943            argv.push(arg)
 944            break
 945          }
 946          default:
 947            return tooComplex(child)
 948        }
 949      }
 950      commands.push({ argv, envVars: [], redirects: [], text: node.text })
 951      return null
 952    }
 953  
 954    return tooComplex(node)
 955  }
 956  
 957  /**
 958   * Recursively walk a test_command expression tree (unary/binary/negated/
 959   * parenthesized expressions). Leaves are test_operator tokens and operands
 960   * (word/string/number/etc). Operands are validated via walkArgument.
 961   */
 962  function walkTestExpr(
 963    node: Node,
 964    argv: string[],
 965    innerCommands: SimpleCommand[],
 966    varScope: Map<string, string>,
 967  ): ParseForSecurityResult | null {
 968    switch (node.type) {
 969      case 'unary_expression':
 970      case 'binary_expression':
 971      case 'negated_expression':
 972      case 'parenthesized_expression': {
 973        for (const c of node.children) {
 974          if (!c) continue
 975          const err = walkTestExpr(c, argv, innerCommands, varScope)
 976          if (err) return err
 977        }
 978        return null
 979      }
 980      case 'test_operator':
 981      case '!':
 982      case '(':
 983      case ')':
 984      case '&&':
 985      case '||':
 986      case '==':
 987      case '=':
 988      case '!=':
 989      case '<':
 990      case '>':
 991      case '=~':
 992        argv.push(node.text)
 993        return null
 994      case 'regex':
 995      case 'extglob_pattern':
 996        // RHS of =~ or ==/!= in [[ ]]. Pattern text only — no code execution.
 997        // Parser emits these as leaf nodes with no children (any $(...) or ${...}
 998        // inside the pattern is a sibling, not a child, and is walked separately).
 999        argv.push(node.text)
1000        return null
1001      default: {
1002        // Operand — word, string, number, etc. Validate via walkArgument.
1003        const arg = walkArgument(node, innerCommands, varScope)
1004        if (typeof arg !== 'string') return arg
1005        argv.push(arg)
1006        return null
1007      }
1008    }
1009  }
1010  
/**
 * A `redirected_statement` wraps a command (or pipeline / list / negated /
 * declaration / unset command) plus one or more `file_redirect` /
 * `heredoc_redirect` nodes.
 *
 * File redirects are extracted via walkFileRedirect and attached to the LAST
 * command produced by the inner statement (the one whose output is being
 * redirected). Heredoc redirects are only validated via walkHeredocRedirect;
 * they are not recorded in `redirects`.
 *
 * @param node     The `redirected_statement` node.
 * @param commands Flat accumulator that receives every extracted command.
 * @param varScope Tracked shell variables, threaded through to the walkers.
 * @returns null on success, or a failure result when any child is not
 *          allowlisted, a redirect cannot be resolved statically, or the
 *          redirects cannot be attributed to any command.
 */
function walkRedirectedStatement(
  node: Node,
  commands: SimpleCommand[],
  varScope: Map<string, string>,
): ParseForSecurityResult | null {
  const redirects: Redirect[] = []
  let innerCommand: Node | null = null

  for (const child of node.children) {
    if (!child) continue
    if (child.type === 'file_redirect') {
      // Thread `commands` so $() in redirect targets (e.g., `> $(mktemp)`)
      // extracts the inner command for permission checking.
      const r = walkFileRedirect(child, commands, varScope)
      if ('kind' in r) return r
      redirects.push(r)
    } else if (child.type === 'heredoc_redirect') {
      const r = walkHeredocRedirect(child)
      if (r) return r
    } else if (
      child.type === 'command' ||
      child.type === 'pipeline' ||
      child.type === 'list' ||
      child.type === 'negated_command' ||
      child.type === 'declaration_command' ||
      child.type === 'unset_command'
    ) {
      innerCommand = child
    } else {
      // Fail closed: any other statement shape is not interpreted.
      return tooComplex(child)
    }
  }

  if (!innerCommand) {
    // `> file` alone is valid bash (truncates file). Represent as a command
    // with empty argv so downstream sees the write.
    commands.push({ argv: [], envVars: [], redirects, text: node.text })
    return null
  }

  const before = commands.length
  const err = collectCommands(innerCommand, commands, varScope)
  if (err) return err
  if (redirects.length === 0) return null

  const last =
    commands.length > before ? commands[commands.length - 1] : undefined
  if (!last) {
    // SECURITY: the inner statement contributed no command (e.g. a list made
    // only of bare assignments, which are deliberately not pushed), yet bash
    // still performs the redirection — the target file is created or
    // truncated. Dropping the redirects here would hide that write from
    // permission checks, so refuse instead of guessing an owner for them.
    return {
      kind: 'too-complex',
      reason:
        'Redirect on a statement that yields no command; cannot attribute the file redirect',
      nodeType: 'redirected_statement',
    }
  }
  last.redirects.push(...redirects)
  return null
}
1066  
/**
 * Turn a `file_redirect` node into a `Redirect` (operator, target, optional
 * fd). The target has to be statically known: a plain word/number, a raw
 * string, a string accepted by walkString, or a concatenation accepted by
 * walkArgument. Every other child type fails closed.
 */
function walkFileRedirect(
  node: Node,
  innerCommands: SimpleCommand[],
  varScope: Map<string, string>,
): Redirect | ParseForSecurityResult {
  let op: Redirect['op'] | null = null
  let target: string | null = null
  let fd: number | undefined

  for (const part of node.children) {
    if (!part) continue
    const kind = part.type

    if (kind === 'file_descriptor') {
      fd = Number(part.text)
      continue
    }
    if (kind in REDIRECT_OPS) {
      op = REDIRECT_OPS[kind] ?? null
      continue
    }

    switch (kind) {
      case 'word':
      case 'number': {
        // SECURITY: a `number` node may carry expansion children through the
        // `NN#<expansion>` arithmetic-base grammar quirk (same concern as
        // walkArgument's number case): `> 10#$(cmd)` runs cmd at runtime.
        // A plain word/number has no children at all.
        if (part.children.length > 0) return tooComplex(part)
        // `echo foo > {a,b}` is an ambiguous redirect in bash. tree-sitter
        // reports brace targets as `concatenation`, but the raw word text is
        // checked as well for defense-in-depth, mirroring walkArgument.
        if (BRACE_EXPANSION_RE.test(part.text)) return tooComplex(part)
        // Apply bash quote removal (`\X` → `X`), as walkArgument does.
        // Otherwise `cat < /proc/self/\environ` would be stored as
        // `/proc/self/\environ` and slip past PROC_ENVIRON_RE while bash
        // reads /proc/self/environ.
        target = part.text.replace(/\\(.)/g, '$1')
        break
      }
      case 'raw_string':
        target = stripRawString(part.text)
        break
      case 'string':
      case 'concatenation': {
        // `string` goes through walkString. `concatenation` (e.g.
        // `echo > "foo"bar`, a string + word pair) goes through walkArgument,
        // which validates the parts and returns the joined text.
        const resolved =
          kind === 'string'
            ? walkString(part, innerCommands, varScope)
            : walkArgument(part, innerCommands, varScope)
        if (typeof resolved !== 'string') return resolved
        target = resolved
        break
      }
      default:
        return tooComplex(part)
    }
  }

  if (!op || target === null) {
    return {
      kind: 'too-complex',
      reason: 'Unrecognized redirect shape',
      nodeType: node.type,
    }
  }
  return { op, target, fd }
}
1129  
/**
 * Validate a heredoc redirect. Only heredocs with a quoted delimiter
 * (<<'EOF', <<"EOF", <<\EOF) are accepted, because their bodies are literal
 * text. With an unquoted delimiter (<<EOF) the body undergoes full
 * parameter/command/arithmetic expansion.
 *
 * SECURITY: tree-sitter-bash has a grammar gap. Backticks (`...`) inside an
 * unquoted heredoc body are NOT surfaced as command_substitution nodes
 * (body.children is empty; the backticks only appear in body.text), yet bash
 * DOES execute them. Inspecting body children for expansion nodes would
 * therefore miss backtick substitution, so the quoted-delimiter requirement
 * cannot be relaxed that way. All unquoted heredocs stay rejected; <<'EOF'
 * gives a literal body and is what the model already prefers.
 *
 * Returns null when the heredoc is acceptable, otherwise a failure result.
 */
function walkHeredocRedirect(node: Node): ParseForSecurityResult | null {
  let delimiter: string | null = null
  let bodyNode: Node | null = null

  for (const part of node.children) {
    if (!part) continue
    switch (part.type) {
      case 'heredoc_start':
        delimiter = part.text
        break
      case 'heredoc_body':
        bodyNode = part
        break
      case '<<':
      case '<<-':
      case 'heredoc_end':
      case 'file_descriptor':
        // Expected structural tokens. `file_descriptor` covers fd-prefixed
        // heredocs such as `cat 3<<'EOF'`; walkFileRedirect likewise treats
        // it as a benign structural token.
        break
      default:
        // SECURITY: anything that follows the delimiter on the same line
        // (pipeline / command / file_redirect / && / ...) is placed by
        // tree-sitter as a child of heredoc_redirect, e.g.
        // `ls <<'EOF' | rm x`. Skipping such children would hide the piped
        // command from permission checks, so fail closed.
        return tooComplex(part)
    }
  }

  const hasQuotedDelimiter =
    delimiter !== null &&
    (delimiter.startsWith('\\') ||
      (delimiter.startsWith("'") && delimiter.endsWith("'")) ||
      (delimiter.startsWith('"') && delimiter.endsWith('"')))

  if (!hasQuotedDelimiter) {
    return {
      kind: 'too-complex',
      reason: 'Heredoc with unquoted delimiter undergoes shell expansion',
      nodeType: 'heredoc_redirect',
    }
  }

  // A literal body may contain nothing but heredoc_content children.
  if (bodyNode) {
    const offending = bodyNode.children.find(
      part => part && part.type !== 'heredoc_content',
    )
    if (offending) return tooComplex(offending)
  }
  return null
}
1194  
/**
 * Validate a here-string redirect (`<<< content`). The content is fed to the
 * command as stdin — it is neither argv nor a path. It is acceptable when it
 * is a literal word, raw_string, or string without expansions, and rejected
 * when it contains $()/${}/$VAR, which execute code or inject runtime values.
 *
 * Validation is delegated to walkArgument, which already rejects
 * command_substitution, expansion, and (inside strings) simple_expansion
 * unless the variable is tracked/safe. The resolved string is only used for
 * one extra scan and is then dropped; what matters is that it resolves
 * statically.
 *
 * NOTE: `VAR=$(cmd) && cat <<< "$VAR"` would be safe in principle (the inner
 * cmd is extracted separately and the here-string is stdin), but it is
 * currently rejected conservatively: walkString's solo-placeholder guard
 * fires because it cannot tell here-string context from argv context.
 */
function walkHerestringRedirect(
  node: Node,
  innerCommands: SimpleCommand[],
  varScope: Map<string, string>,
): ParseForSecurityResult | null {
  for (const part of node.children) {
    // Skip holes and the `<<<` operator token itself.
    if (!part || part.type === '<<<') continue

    // Content node. walkArgument yields a string on success, or a failure
    // result (expansion found, unresolvable var) that is propagated as-is.
    const resolved = walkArgument(part, innerCommands, varScope)
    if (typeof resolved !== 'string') return resolved

    // The content never reaches argv/envVars/redirects, but it is still part
    // of .text through the raw node.text. Scan it here so that
    // checkSemantics's NEWLINE_HASH invariant (relied upon by
    // bashPermissions.ts) keeps holding.
    if (NEWLINE_HASH_RE.test(resolved)) return tooComplex(part)
  }
  return null
}
1231  
/**
 * Extract argv, env-prefix assignments and redirects from a `command` node.
 * Children arrive in this order:
 * [variable_assignment...] command_name [argument...] [file_redirect...]
 * A child whose type is not handled explicitly makes the whole command
 * too-complex.
 */
function walkCommand(
  node: Node,
  extraRedirects: Redirect[],
  innerCommands: SimpleCommand[],
  varScope: Map<string, string>,
): ParseForSecurityResult {
  const argv: string[] = []
  const envVars: { name: string; value: string }[] = []
  const redirects: Redirect[] = extraRedirects.slice()

  for (const part of node.children) {
    if (!part) continue

    switch (part.type) {
      case 'variable_assignment': {
        const assignment = walkVariableAssignment(part, innerCommands, varScope)
        if ('kind' in assignment) return assignment
        // SECURITY: an env-prefix assignment (`VAR=x cmd`) is command-local
        // in bash: only `cmd` sees VAR, later commands do not. It must NOT
        // be written to the global varScope, or `VAR=safe cmd1 && rm $VAR`
        // would resolve $VAR even though bash has it unset.
        envVars.push({ name: assignment.name, value: assignment.value })
        break
      }
      case 'command_name':
      case 'word':
      case 'number':
      case 'raw_string':
      case 'string':
      case 'concatenation':
      case 'arithmetic_expansion': {
        // `command_name` is a wrapper: validate its first child when there
        // is one, otherwise the wrapper node itself.
        const source =
          part.type === 'command_name' ? (part.children[0] ?? part) : part
        const value = walkArgument(source, innerCommands, varScope)
        if (typeof value !== 'string') return value
        argv.push(value)
        break
      }
      // NOTE: a BARE command_substitution argument (not inside a string) is
      // deliberately absent from this switch. The $() output IS the
      // argument, and for path-sensitive commands (cd, rm, chmod) a
      // placeholder would hide the real path from downstream checks:
      // `cd $(echo /etc)` has to stay too-complex so the path check cannot
      // be bypassed. $() inside strings ("Timer: $(date)") is handled by
      // walkString, where the output is embedded in a longer string (safer).
      case 'simple_expansion': {
        // Bare `$VAR` argument. A tracked static variable yields its ACTUAL
        // value (VAR=/etc → '/etc'); values containing IFS/glob characters
        // or placeholders are rejected. See resolveSimpleExpansion.
        const value = resolveSimpleExpansion(part, varScope, false)
        if (typeof value !== 'string') return value
        argv.push(value)
        break
      }
      case 'file_redirect': {
        const redirect = walkFileRedirect(part, innerCommands, varScope)
        if ('kind' in redirect) return redirect
        redirects.push(redirect)
        break
      }
      case 'herestring_redirect': {
        // `cmd <<< "content"`: the content is stdin, not argv. It is only
        // validated as literal (no expansion) and then discarded.
        const failure = walkHerestringRedirect(part, innerCommands, varScope)
        if (failure) return failure
        break
      }
      default:
        return tooComplex(part)
    }
  }

  // `.text` is normally the raw source span. Downstream
  // (bashToolCheckPermission → splitCommand_DEPRECATED) re-tokenizes it with
  // shell-quote, so it must agree with argv. There are two situations in
  // which the raw span diverges and `.text` is rebuilt from argv instead:
  //
  // 1. A `$<identifier>` occurs in node.text. Reaching this point means the
  //    simple_expansion was resolved (otherwise we would already have
  //    returned too-complex), so raw text and argv differ and RULE MATCHING
  //    on the raw text would miss deny rules.
  //    SECURITY: `SUB=push && git $SUB --force` with a `Bash(git push:*)`
  //    deny rule:
  //      argv  = ['git', 'push', '--force']  ← correct
  //      .text = 'git $SUB --force'          ← 'git push:*' does not match
  //    The check covers $VAR at any position (command_name, word, string
  //    interior, concatenation part). `$(...)` does not trigger it (paren,
  //    not an identifier start). `echo '$VAR'` does trigger it, because
  //    tree-sitter's .text keeps the quotes, but that is harmless: a
  //    single-quoted $ is literal in bash, argv holds the literal `$VAR`,
  //    and re-quoting yields `'$VAR'` again.
  //
  // 2. node.text contains a newline. Line continuations `<space>\<LF>` are
  //    collapsed by tree-sitter and so invisible in argv, yet still present
  //    in node.text. For `timeout 5 \<LF>curl evil.com` argv is correct, but
  //    on the raw text stripSafeWrappers matches `timeout 5 ` (the space
  //    before the backslash) and leaves `\<LF>curl evil.com`, which a
  //    Bash(curl:*) deny rule does not prefix-match. Joining argv with ' '
  //    produces no newlines, so stripSafeWrappers works. This also covers
  //    heredoc-body leakage.
  //
  // Rebuild format: each argument is shell-escaped by single-quote wrapping,
  // with `'\''` for embedded single quotes. The empty string,
  // metacharacters and placeholders are all quoted, so the downstream
  // shell-quote re-parse is correct.
  //
  // NOTE: redirects and envVars are NOT part of the rebuilt text.
  // walkFileRedirect rejects simple_expansion and envVars are not used for
  // rule matching; if either changes, the rebuild has to include them.
  const mustRebuild =
    /\$[A-Za-z_]/.test(node.text) || node.text.includes('\n')

  let text = node.text
  if (mustRebuild) {
    const escaped: string[] = []
    for (const arg of argv) {
      const needsQuoting =
        arg === '' || /["'\\ \t\n$`;|&<>(){}*?[\]~#]/.test(arg)
      escaped.push(needsQuoting ? `'${arg.replace(/'/g, "'\\''")}'` : arg)
    }
    text = escaped.join(' ')
  }

  return {
    kind: 'simple',
    commands: [{ argv, envVars, redirects, text }],
  }
}
1364  
/**
 * Descend into a command_substitution node and collect its inner command(s).
 * When they parse cleanly they are appended to the `innerCommands`
 * accumulator and null is returned. When an inner command is itself
 * too-complex (e.g. nested arith expansion, process sub) that failure is
 * returned instead.
 *
 * This is what makes permission checking recursive: for
 * `echo $(git rev-parse HEAD)` both the outer `echo $(git rev-parse HEAD)`
 * and the inner `git rev-parse HEAD` are extracted, and permission rules
 * have to match BOTH for the whole command to be allowed.
 */
function collectCommandSubstitution(
  csNode: Node,
  innerCommands: SimpleCommand[],
  varScope: Map<string, string>,
): ParseForSecurityResult | null {
  // Bash subshell semantics: variables set BEFORE the $() are visible
  // inside it, variables set INSIDE never leak out. Working on a copy keeps
  // inner assignments from mutating the caller's map.
  const isolatedScope = new Map(varScope)

  for (const part of csNode.children) {
    if (!part) continue
    // Delimiters of the substitution: `$(` or a backtick, and `)`.
    const isDelimiter =
      part.type === '$(' || part.type === '`' || part.type === ')'
    if (isDelimiter) continue

    const failure = collectCommands(part, innerCommands, isolatedScope)
    if (failure) return failure
  }
  return null
}
1394  
/**
 * Resolve one argument-position node to the literal string bash would pass,
 * with quoting removed. This is the allowlist for argument positions: any
 * node type without an explicit branch below is handed to tooComplex.
 *
 * @param node argument node; a null node yields a too-complex result
 * @param innerCommands accumulator forwarded to walkString (and to recursive
 *   calls for concatenation parts) for commands found inside `$()`
 * @param varScope variables tracked so far, used to resolve `$VAR`
 * @returns the resolved string, or a non-string ParseForSecurityResult that
 *   the caller must propagate
 */
function walkArgument(
  node: Node | null,
  innerCommands: SimpleCommand[],
  varScope: Map<string, string>,
): string | ParseForSecurityResult {
  if (!node) {
    return { kind: 'too-complex', reason: 'Null argument node' }
  }

  const kind = node.type

  if (kind === 'word') {
    if (BRACE_EXPANSION_RE.test(node.text)) {
      return {
        kind: 'too-complex',
        reason: 'Word contains brace expansion syntax',
        nodeType: 'word',
      }
    }
    // Quote removal for unquoted words: bash turns `\X` into `X` for any
    // character X, while tree-sitter hands us the raw text. Undoing the
    // escapes lets checkSemantics see `\eval` as `eval` and `\zmodload` as
    // `zmodload`, and gives `find -exec {} \;` an argv entry of `;` rather
    // than `\;`. Backslash followed by whitespace never gets this far — it
    // is rejected by BACKSLASH_WHITESPACE_RE.
    return node.text.replace(/\\(.)/g, '$1')
  }

  if (kind === 'number') {
    // SECURITY: `NN#<expansion>` (arithmetic base syntax) is parsed by
    // tree-sitter-bash as a `number` node with the expansion as a CHILD.
    // For `10#$(cmd)` the node text is the whole literal, yet bash runs the
    // substitution. Returning .text for a node with children would smuggle
    // that expansion past permission checks. Plain numbers (`10`, `16#ff`)
    // have no children.
    if (node.children.length > 0) {
      return {
        kind: 'too-complex',
        reason: 'Number node contains expansion (NN# arithmetic base syntax)',
        nodeType: node.children[0]?.type,
      }
    }
    return node.text
  }

  if (kind === 'raw_string') {
    return stripRawString(node.text)
  }

  if (kind === 'string') {
    return walkString(node, innerCommands, varScope)
  }

  if (kind === 'concatenation') {
    if (BRACE_EXPANSION_RE.test(node.text)) {
      return {
        kind: 'too-complex',
        reason: 'Brace expansion',
        nodeType: 'concatenation',
      }
    }
    // Resolve every piece with the same allowlist and glue the results
    // together; the first piece that fails aborts the whole argument.
    let joined = ''
    for (const piece of node.children) {
      if (!piece) continue
      const resolved = walkArgument(piece, innerCommands, varScope)
      if (typeof resolved !== 'string') return resolved
      joined += resolved
    }
    return joined
  }

  if (kind === 'arithmetic_expansion') {
    // Once walkArithmetic accepts the expression, the raw `$((...))` span
    // itself becomes the argv entry.
    const failure = walkArithmetic(node)
    return failure ?? node.text
  }

  if (kind === 'simple_expansion') {
    // `$VAR` as part of a concatenation (e.g. `prefix$VAR`). The whole
    // concatenation IS the argument, so this is resolved with
    // insideString=false, i.e. under the bare-argument rules.
    return resolveSimpleExpansion(node, varScope, false)
  }

  // NOTE: command_substitution at argument position (bare or inside a
  // concatenation) deliberately has no branch. Its output is, or becomes
  // part of, a positional argument that might be a path or a flag:
  // `rm $(foo)` or `rm $(foo)bar` would hide the real path behind a
  // placeholder. Only `$()` inside a `string` node (see walkString) is
  // extracted, because there the output is embedded in a longer string
  // instead of BEING the argument.
  return tooComplex(node)
}
1492  
/**
 * Resolve a double-quoted `string` node to the value bash would pass. The
 * node's children are the `"` delimiters, `string_content` literals and,
 * possibly, expansion nodes.
 *
 * tree-sitter quirk handled here: literal newlines inside double quotes are
 * missing from `string_content` text even though bash keeps them. A string
 * with a newline between `a` and `b` yields two `string_content` children
 * (`a`, `b`) with the newline in neither; a string that starts with a
 * newline followed by `#` yields one child (`#`) with the newline eaten.
 * Plain concatenation of the children would therefore lose newlines, so the
 * loop compares each child's `startIndex` with the previous child's
 * `endIndex` and emits one `\n` per missing index.
 *
 * @param node the `string` node
 * @param innerCommands accumulator for commands found inside `$()`
 * @param varScope variables tracked so far, used to resolve `$VAR`
 * @returns the resolved string, or a non-string ParseForSecurityResult
 */
function walkString(
  node: Node,
  innerCommands: SimpleCommand[],
  varScope: Map<string, string>,
): string | ParseForSecurityResult {
  let assembled = ''
  // End offset of the previous child; -1 until the first child was seen.
  let prevEnd = -1
  // SECURITY: remember whether the string holds runtime-unknown content
  // ($() output, unknown-value variable) and whether it holds anything
  // literal. A string that is ONLY a placeholder (`"$(cmd)"`, `"$VAR"` with
  // an unknown value) would become an argv element that IS the placeholder,
  // which downstream path validation resolves as a relative filename within
  // cwd: `cd "$(echo /etc)"` would validate and still cd into /etc at
  // runtime. A placeholder next to literal text (`"prefix: $(cmd)"`) is
  // fine — the runtime value cannot equal a bare path.
  let hasPlaceholder = false
  let hasLiteral = false

  for (const part of node.children) {
    if (!part) continue
    const kind = part.type

    // Offsets skipped since the previous child are the dropped newline(s).
    // Nothing is filled before the first child, and nothing before a `"`:
    // a gap in front of the closing quote is the whitespace-only-string
    // quirk (space/tab, not newline), which the final check below rejects
    // instead of mis-filling with `\n` and diverging from bash.
    const gap = prevEnd === -1 ? 0 : part.startIndex - prevEnd
    if (gap > 0 && kind !== '"') {
      assembled += '\n'.repeat(gap)
      hasLiteral = true
    }
    prevEnd = part.endIndex

    if (kind === '"') {
      // Delimiter: contributes no text. prevEnd now sits right after the
      // quote, so a gap between the opening quote and the first content
      // child is still detected.
      continue
    }

    if (kind === 'string_content') {
      // Double-quote escape rules, NOT the generic `\X` → `X` used for
      // unquoted words: inside "..." a backslash only escapes $ ` " \ and
      // every other sequence (such as \n) stays as written. tree-sitter
      // keeps the raw escapes, so they are resolved here.
      assembled += part.text.replace(/\\([$`"\\])/g, '$1')
      hasLiteral = true
      continue
    }

    if (kind === DOLLAR) {
      // A lone dollar sign before the closing quote or before a non-name
      // character is literal in bash; tree-sitter emits it as its own node.
      assembled += DOLLAR
      hasLiteral = true
      continue
    }

    if (kind === 'command_substitution') {
      // Carve-out for `$(cat <<'EOF' ... EOF)`: a quoted-delimiter heredoc
      // body is literal and `cat` merely prints it, so the substitution
      // result is a known static string. This is the idiomatic way to pass
      // multi-line content to tools such as `gh pr create --body`.
      const heredoc = extractSafeCatHeredoc(part)
      if (heredoc === 'DANGEROUS') return tooComplex(part)

      if (heredoc === null) {
        // General `$()` inside "...": the inner command(s) become extra
        // subcommands that permission rules must match as well, and the
        // outer argv receives a placeholder for the runtime value.
        // `echo "SHA: $(git rev-parse HEAD)"` therefore yields both the
        // `echo` command and `git rev-parse HEAD`.
        const failure = collectCommandSubstitution(
          part,
          innerCommands,
          varScope,
        )
        if (failure) return failure
        assembled += CMDSUB_PLACEHOLDER
        hasPlaceholder = true
        continue
      }

      // SECURITY: the heredoc body IS the substitution result. Dropping a
      // single-line body would turn `rm "$(cat <<'EOF'` + `/etc/passwd` +
      // `EOF)"` into argv ['rm', ''] while bash removes /etc/passwd, so a
      // single-line body MUST reach argv for path validation. Trailing
      // newlines are trimmed because `$()` strips them. A body that still
      // spans several lines is multi-line text (markdown, scripts) that
      // cannot be a valid path; it is left out of argv to avoid
      // NEWLINE_HASH_RE false positives on content like `## Summary`.
      const body = heredoc.replace(/\n+$/, '')
      if (!body.includes('\n')) {
        assembled += body
      }
      hasLiteral = true
      continue
    }

    if (kind === 'simple_expansion') {
      // `$VAR` inside "...": tracked/safe variables resolve, everything
      // else is rejected by resolveSimpleExpansion.
      const resolved = resolveSimpleExpansion(part, varScope, true)
      if (typeof resolved !== 'string') return resolved
      // VAR_PLACEHOLDER stands for a runtime-unknown value; any other
      // string is the literal value of a tracked static variable (an empty
      // value included) and counts as literal content.
      if (resolved === VAR_PLACEHOLDER) {
        hasPlaceholder = true
      } else {
        hasLiteral = true
      }
      assembled += resolved
      continue
    }

    if (kind === 'arithmetic_expansion') {
      const failure = walkArithmetic(part)
      if (failure) return failure
      // Accepted by walkArithmetic as literal-numeric, hence static.
      assembled += part.text
      hasLiteral = true
      continue
    }

    // Anything else inside "..." (for instance `${...}` expansion).
    return tooComplex(part)
  }

  // SECURITY: two rejections share the "no literal content" precondition.
  //  1. Solo placeholder: `"$(cmd)"` or `"$VAR"` with an unknown value
  //     would yield an argv element that IS the placeholder and bypass
  //     downstream path validation.
  //  2. Whitespace-only string: tree-sitter-bash emits NO string_content
  //     child for `" "` or a quoted tab; the whitespace is attributed to
  //     the closing `"` node. Nothing was appended, so "" would be returned
  //     where bash sees " ". It is recognised by a source span longer than
  //     the two characters of a genuine `""`. A `"$V"` with V="" is not
  //     affected because the expansion branch marks literal content.
  if (!hasLiteral && (hasPlaceholder || node.text.length > 2)) {
    return tooComplex(node)
  }
  return assembled
}
1653  
/**
 * Texts accepted for a leaf node inside an arithmetic expansion:
 *   - integer literals: decimal / octal digits (`42`, `017`), hex (`0x1F`)
 *     and bash `base#digits` (`16#ff`);
 *   - any run of operator / punctuation characters from
 *     `- + * / % ^ & | ~ ! < > = ? : ( ) ,` — this single alternative already
 *     covers multi-character operators such as `<<`, `>>`, `**`, `&&`, `||`,
 *     `<=`, `!=` and the closing `))`, so they are not listed separately;
 *   - the opening `$((` token (needs its own alternative because `$` is not
 *     part of the operator class).
 * Anything else at leaf position (notably a variable_name that is not a
 * numeric literal) fails the match and is rejected by walkArithmetic.
 */
const ARITH_LEAF_RE =
  /^(?:[0-9]+|0[xX][0-9a-fA-F]+|[0-9]+#[0-9a-zA-Z]+|[-+*/%^&|~!<>=?:(),]+|\$\(\()$/
1661  
/**
 * Validate an arithmetic expression node recursively. Only literal numeric
 * expressions pass — no variables, no substitutions.
 *
 * Variables are refused because bash arithmetic evaluates variable values
 * recursively: with x='a[$(cmd)]', `$((x))` executes cmd. See
 * https://www.vidarholen.net/contents/blog/?p=716 (arithmetic injection).
 *
 * On success the caller places the whole `$((…))` span into argv as a literal
 * string. bash expands it to an integer at runtime; the static string will
 * not match any sensitive path/deny patterns.
 *
 * @param node an arithmetic_expansion node or one of its nested expressions
 * @returns null when the expression is safe, a too-complex result otherwise
 */
function walkArithmetic(node: Node): ParseForSecurityResult | null {
  for (const operand of node.children) {
    if (!operand) continue

    // Leaves (no children of their own) must look like a numeric literal or
    // an operator token.
    const isLeaf = operand.children.length === 0
    if (isLeaf) {
      if (ARITH_LEAF_RE.test(operand.text)) continue
      return {
        kind: 'too-complex',
        reason: `Arithmetic expansion references variable or non-literal: ${operand.text}`,
        nodeType: 'arithmetic_expansion',
      }
    }

    // Inner nodes: only the known expression shapes are descended into.
    const isCompound =
      operand.type === 'binary_expression' ||
      operand.type === 'unary_expression' ||
      operand.type === 'ternary_expression' ||
      operand.type === 'parenthesized_expression'
    if (!isCompound) return tooComplex(operand)

    const nested = walkArithmetic(operand)
    if (nested) return nested
  }
  return null
}
1703  
/**
 * Recognise the exact shape `$(cat <<'DELIM' ... DELIM)` and hand back the
 * heredoc body.
 *
 * Expected tree-sitter structure:
 *   command_substitution
 *     $(
 *     redirected_statement
 *       command → command_name → word "cat"    (exactly one child)
 *       heredoc_redirect
 *         <<
 *         heredoc_start 'DELIM'                (quoted)
 *         heredoc_body                         (pure heredoc_content)
 *         heredoc_end
 *     )
 *
 * @param subNode the command_substitution node to inspect
 * @returns
 *   - the heredoc body text when the shape matches and the body is harmless;
 *   - `'DANGEROUS'` when the shape matches but the body matches
 *     PROC_ENVIRON_RE or contains a `system(` call — the caller must reject
 *     rather than fall back to generic handling;
 *   - `null` for any structural deviation (extra arguments to cat, a
 *     heredoc that walkHeredocRedirect rejects, additional commands, …).
 */
function extractSafeCatHeredoc(subNode: Node): string | 'DANGEROUS' | null {
  // Between `$(` and `)` there must be exactly one redirected_statement.
  let statement: Node | null = null
  for (const part of subNode.children) {
    if (!part) continue
    if (part.type === '$(' || part.type === ')') continue
    if (part.type !== 'redirected_statement' || statement !== null) {
      return null
    }
    statement = part
  }
  if (statement === null) return null

  // The statement may only contain a bare `cat` command and heredoc
  // redirect(s) that pass walkHeredocRedirect.
  let catSeen = false
  let heredocText: string | null = null
  for (const part of statement.children) {
    if (!part) continue
    switch (part.type) {
      case 'command': {
        // Bare `cat` only: a single command_name child, so no arguments and
        // no env-var prefixes.
        const pieces = part.children.filter(c => c)
        if (pieces.length !== 1) return null
        const head = pieces[0]
        if (head?.type !== 'command_name' || head.text !== 'cat') {
          return null
        }
        catSeen = true
        break
      }
      case 'heredoc_redirect': {
        // Delegate to the existing validator (quoted delimiter, body is
        // pure text); it reports success as null.
        if (walkHeredocRedirect(part) !== null) return null
        for (const inner of part.children) {
          if (inner?.type === 'heredoc_body') heredocText = inner.text
        }
        break
      }
      default:
        return null
    }
  }

  if (!catSeen || heredocText === null) return null

  // SECURITY: through the substitution the body becomes an argv value of
  // the OUTER command, so a body like `/proc/self/environ` is semantically
  // `cat /proc/self/environ`. checkSemantics never sees the body (the
  // walkString call site drops it to avoid newline+# false positives).
  // Answering `null` here would make walkString fall through to
  // collectCommandSubstitution, which extracts the inner `cat` via
  // walkHeredocRedirect without inspecting the body text — effectively
  // bypassing this check. A distinct sentinel lets the caller reject.
  if (PROC_ENVIRON_RE.test(heredocText)) return 'DANGEROUS'
  // Same reasoning for jq `system()`: checkSemantics inspects argv only.
  // The outer command is unknown here, so the check is unconditional.
  if (/\bsystem\s*\(/.test(heredocText)) return 'DANGEROUS'
  return heredocText
}
1776  
/**
 * Resolve a `variable_assignment` node (`NAME=value` / `NAME+=value`) into
 * its name, statically known value and append flag, or refuse when the
 * assignment cannot be modelled.
 *
 * @param node the variable_assignment node
 * @param innerCommands accumulator for commands found inside `$()` on the
 *   right-hand side
 * @param varScope variables tracked so far, used to resolve `$OTHER`
 * @returns `{ name, value, isAppend }` on success (value is
 *   CMDSUB_PLACEHOLDER when the right-hand side is a bare `$()`), or a
 *   ParseForSecurityResult describing why the assignment was rejected
 */
function walkVariableAssignment(
  node: Node,
  innerCommands: SimpleCommand[],
  varScope: Map<string, string>,
): { name: string; value: string; isAppend: boolean } | ParseForSecurityResult {
  // Every rejection raised directly by this function has the same shape.
  const reject = (reason: string): ParseForSecurityResult => ({
    kind: 'too-complex',
    reason,
    nodeType: 'variable_assignment',
  })

  let name: string | null = null
  let value = ''
  let isAppend = false

  for (const part of node.children) {
    if (!part) continue
    switch (part.type) {
      case 'variable_name':
        name = part.text
        break

      case '=':
      case '+=':
        // tree-sitter emits `+=` (as in `PATH+=":/new"`) as its own operator
        // node; without this case it would reach walkArgument and be
        // rejected as an unknown type.
        isAppend = part.type === '+='
        break

      case 'command_substitution': {
        // `$()` as the value: the output is merely a STRING stored in the
        // variable, not a positional argument, so there is no path/flag
        // concern. The inner command is still collected and has to match
        // permission rules itself (`VAR=$(rm -rf /)` runs `rm`).
        const failure = collectCommandSubstitution(
          part,
          innerCommands,
          varScope,
        )
        if (failure) return failure
        value = CMDSUB_PLACEHOLDER
        break
      }

      case 'simple_expansion': {
        // `VAR=$OTHER`: the right-hand side of an assignment is neither
        // word-split nor glob-expanded by bash (`A="a b"; B=$A` sets B to
        // "a b"), so resolve with insideString=true to keep
        // BARE_VAR_UNSAFE_RE from over-rejecting. If B is later used as a
        // bare argument, THAT use is checked. A VAR_PLACEHOLDER result is
        // stored as-is so the variable is treated as unknown later on.
        const resolved = resolveSimpleExpansion(part, varScope, true)
        if (typeof resolved !== 'string') return resolved
        value = resolved
        break
      }

      default: {
        const literal = walkArgument(part, innerCommands, varScope)
        if (typeof literal !== 'string') return literal
        value = literal
      }
    }
  }

  if (name === null) {
    return reject('Variable assignment without name')
  }

  // SECURITY: tree-sitter-bash accepts invalid names such as `1VAR=value`
  // as variable_assignment. Bash only recognises [A-Za-z_][A-Za-z0-9_]*;
  // anything else is executed as a COMMAND looked up on PATH, so it must
  // not be treated as an inert assignment.
  if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(name)) {
    return reject(`Invalid variable name (bash treats as command): ${name}`)
  }

  // SECURITY: IFS changes how later unquoted `$VAR` expansions are split
  // (`IFS=: && VAR=a:b && rm $VAR` → `rm a b`). BARE_VAR_UNSAFE_RE only
  // knows the default IFS characters, so a custom IFS cannot be modelled.
  if (name === 'IFS') {
    return reject(
      'IFS assignment changes word-splitting — cannot model statically',
    )
  }

  // SECURITY: PS4 is expanded (promptvars, on by default) for every command
  // traced after `set -x`, so `PS4='$(id)' && set -x && :` runs `id` even
  // though argv only shows `set -x` and `:`. PS0-3 and PROMPT_COMMAND are
  // not expanded in non-interactive shells.
  //
  // The policy is an ALLOWLIST, because value-dependent blocklists proved
  // fragile: `+=` effective-value computation diverges from bash in several
  // scope-model gaps; prompt-escape decoding runs before promptvars, so
  // `\044(id)` (octal for `$`) turns into `$(id)` at trace time; and
  // assignments also happen outside this function (for_statement sets its
  // loop variable directly).
  //   (1) `+=` is refused outright — no dependency on scope tracking.
  //   (2) placeholders are refused — the value is unknowable at this point.
  //   (3) what remains must consist of `${identifier}` references plus the
  //       characters [A-Za-z0-9 _+:./=[\]-]: no bare `$`, no backslash, no
  //       backtick, no parentheses.
  // A legitimate `PS4='+${BASH_SOURCE}:${LINENO}: '` still passes.
  if (name === 'PS4') {
    if (isAppend) {
      return reject(
        'PS4 += cannot be statically verified — combine into a single PS4= assignment',
      )
    }
    if (containsAnyPlaceholder(value)) {
      return reject(
        'PS4 value derived from cmdsub/variable — runtime unknowable',
      )
    }
    const withoutVarRefs = value.replace(/\$\{[A-Za-z_][A-Za-z0-9_]*\}/g, '')
    if (!/^[A-Za-z0-9 _+:./=[\]-]*$/.test(withoutVarRefs)) {
      return reject(
        'PS4 value outside safe charset — only ${VAR} refs and [A-Za-z0-9 _+:.=/[]-] allowed',
      )
    }
  }

  // SECURITY: tilde expansion on the right-hand side. For an unquoted
  // `VAR=~/x` bash expands `~` at ASSIGNMENT time, so a later `cd $VAR`
  // goes to /home/user/x while argv would show `~/x`. Expansion also
  // happens after `=` and `:` inside the value (`PATH=~/bin:~/sbin`).
  // This is not modelled; conservatively, any `~` in the value is refused,
  // even where quoting would have kept it literal.
  if (value.includes('~')) {
    return reject(
      'Tilde in assignment value — bash may expand at assignment time',
    )
  }

  return { name, value, isAppend }
}
1923  
/**
 * Resolve a `simple_expansion` (`$VAR`) node.
 *
 * Possible outcomes:
 *   - the tracked LITERAL value, when the variable is in `varScope` and its
 *     stored value holds no placeholder (subject to the bare-argument checks
 *     described below);
 *   - VAR_PLACEHOLDER, only when `insideString` is true and the value is
 *     runtime-unknown (tracked value containing a placeholder, a name in
 *     SAFE_ENV_VARS, or a special/positional variable);
 *   - a too-complex result in every other case.
 *
 * @param node the simple_expansion node
 * @param varScope variables tracked so far
 * @param insideString true when `$VAR` sits inside a `string` node
 *   ("...$VAR...") rather than being a bare/concatenation argument. Unknown
 *   values are only tolerated inside strings: as a bare argument the runtime
 *   value IS the argument (`cd $HOME/../x` would hide the real path), whereas
 *   `echo "Home: $HOME"` just embeds text. Variables holding STATIC strings
 *   are accepted in both positions because their value is known.
 */
function resolveSimpleExpansion(
  node: Node,
  varScope: Map<string, string>,
  insideString: boolean,
): string | ParseForSecurityResult {
  // The first name-bearing child decides which variable is referenced.
  const nameNode = node.children.find(
    c => c?.type === 'variable_name' || c?.type === 'special_variable_name',
  )
  if (!nameNode) return tooComplex(node)
  const varName = nameNode.text
  const isSpecial = nameNode.type === 'special_variable_name'

  const tracked = varScope.get(varName)
  if (tracked !== undefined) {
    // Non-literal tracked value (loop variable, `$()` output, read variable,
    // composite such as `VAR="prefix$(cmd)"`): acceptable only inside a
    // string, where walkString's solo-placeholder gate still rejects a lone
    // `"$VAR"`. As a bare argument it would hide the runtime path/flag.
    if (containsAnyPlaceholder(tracked)) {
      return insideString ? VAR_PLACEHOLDER : tooComplex(node)
    }

    // SECURITY (bare arguments only):
    //  - An empty value disappears under word-splitting: `V="" && $V eval x`
    //    runs `eval x`, while argv would start with "" and slip past every
    //    name-based check; `V="" && ls $V /etc` would get a phantom ""
    //    shifting positions. Inside "..." one empty argument is correct.
    //  - Bash word-splits on $IFS and glob-expands the result:
    //    `VAR="-rf /" && rm $VAR` runs `rm -rf /`, `VAR="/etc/*" && cat $VAR`
    //    expands to every file. Such values are refused unless quoted.
    if (
      !insideString &&
      (tracked === '' || BARE_VAR_UNSAFE_RE.test(tracked))
    ) {
      return tooComplex(node)
    }

    // SECURITY: hand back the REAL value, not a placeholder, so downstream
    // path validation and checkSemantics see it: `VAR=/etc && rm $VAR`
    // becomes argv ['rm', '/etc'], which validatePath can reject. A
    // placeholder would be resolved relative to cwd and pass.
    return tracked
  }

  // Untracked names: SAFE_ENV_VARS and special variables ($?, $$, $@, $1, …)
  // have shell-controlled, unknown values. They are tolerated only when
  // embedded in a string, never as a bare argument.
  const isKnownSpecial =
    isSpecial && (SPECIAL_VAR_NAMES.has(varName) || /^[0-9]+$/.test(varName))
  if (insideString && (SAFE_ENV_VARS.has(varName) || isKnownSpecial)) {
    return VAR_PLACEHOLDER
  }
  return tooComplex(node)
}
2009  
/**
 * Record a variable assignment in `varScope`, honouring `+=` append
 * semantics.
 *
 * SECURITY: the stored value must only look static when it really is.
 *   - If the assigned value, or the existing value it is appended to,
 *     contains a placeholder, the result is non-literal and VAR_PLACEHOLDER
 *     is stored so a later bare `$VAR` is rejected.
 *     `VAR=/etc && VAR+=$(cmd)` must not leave VAR looking static.
 *   - If `+=` targets a name that is NOT tracked in `varScope`, the value it
 *     appends to is whatever the shell inherited from its environment, which
 *     is unknown here. Treating it as the empty string would record
 *     `X+=foo` as the literal `foo` while bash may hold `<inherited>foo`, so
 *     VAR_PLACEHOLDER is stored instead (fail closed).
 *
 * @param varScope scope map to update in place
 * @param ev the assignment: variable name, resolved value, and whether the
 *   operator was `+=`
 */
function applyVarToScope(
  varScope: Map<string, string>,
  ev: { name: string; value: string; isAppend: boolean },
): void {
  let combined = ev.value
  if (ev.isAppend) {
    const existing = varScope.get(ev.name)
    if (existing === undefined) {
      // Appending to an untracked variable: the prior value is unknowable.
      varScope.set(ev.name, VAR_PLACEHOLDER)
      return
    }
    combined = existing + ev.value
  }
  varScope.set(
    ev.name,
    containsAnyPlaceholder(combined) ? VAR_PLACEHOLDER : combined,
  )
}
2028  
/**
 * Return `text` without its first and last character.
 *
 * Judging by the name this is meant for raw_string tokens whose outer
 * characters are the quotes; the function itself does not verify that.
 * Inputs shorter than two characters yield the empty string.
 */
function stripRawString(text: string): string {
  const closingIndex = text.length - 1
  return text.slice(1, closingIndex)
}
2032  
/**
 * Build the 'too-complex' result for a node the walker refuses to interpret.
 *
 * The reason text depends on the node type: 'ERROR' nodes are reported as a
 * parse error, types listed in DANGEROUS_TYPES as "Contains <type>", and
 * everything else as an unhandled node type.
 *
 * @param node The node that caused the refusal; its type is echoed back in
 *   `nodeType`.
 */
function tooComplex(node: Node): ParseForSecurityResult {
  const nodeType = node.type
  let reason: string
  if (nodeType === 'ERROR') {
    reason = 'Parse error'
  } else if (DANGEROUS_TYPES.has(nodeType)) {
    reason = `Contains ${nodeType}`
  } else {
    reason = `Unhandled node type: ${nodeType}`
  }
  return { kind: 'too-complex', reason, nodeType }
}
2042  
2043  // ────────────────────────────────────────────────────────────────────────────
2044  // Post-argv semantic checks
2045  //
2046  // Everything above answers "can we tokenize?". Everything below answers
2047  // "is the resulting argv dangerous in ways that don't involve parsing?".
2048  // These are checks on argv[0] or argv content that the old bashSecurity.ts
2049  // validators performed but which have nothing to do with parser
2050  // differentials. They're here (not in bashSecurity.ts) because they operate
2051  // on SimpleCommand and need to run for every extracted command.
2052  // ────────────────────────────────────────────────────────────────────────────
2053  
/**
 * Names of zsh module builtins that must be rejected by name.
 *
 * These are zsh internals loaded via zmodload rather than binaries on PATH.
 * BashTool runs through the user's default shell (often zsh) and these parse
 * as ordinary `command` nodes with no distinguishing syntax, so matching on
 * argv[0] is the only way to catch them.
 */
const ZSH_DANGEROUS_BUILTINS = new Set<string>([
  // Module loading / emulation mode
  'zmodload',
  'emulate',
  // sys* I/O builtins
  'sysopen',
  'sysread',
  'syswrite',
  'sysseek',
  // pty / network builtins
  'zpty',
  'ztcp',
  'zsocket',
  // zf_* file builtins
  'zf_rm',
  'zf_mv',
  'zf_ln',
  'zf_chmod',
  'zf_chown',
  'zf_mkdir',
  'zf_rmdir',
  'zf_chgrp',
])
2079  
/**
 * Builtins whose arguments are executed as code, or which otherwise break the
 * assumption that argv[0] is the thing being run.
 *
 * Example: `eval "rm -rf /"` yields argv ['eval', 'rm -rf /'], which looks
 * harmless to flag validation yet executes the string. Entries here are
 * treated the same way as command substitution.
 */
const EVAL_LIKE_BUILTINS = new Set<string>([
  'eval',
  'source',
  '.',
  'exec',
  'command',
  'builtin',
  'fc',

  // `coproc rm -rf /` starts rm as a coprocess, but tree-sitter yields a plain
  // command with argv[0]='coproc' — rules and path validation would inspect
  // 'coproc' rather than 'rm'.
  'coproc',

  // Zsh precommand modifiers. `noglob cmd args` runs cmd with globbing
  // disabled; it parses as an ordinary command where argv[0] is the modifier
  // and argv[1] is the real command, hiding the latter from argv[0] matching.
  'noglob',
  'nocorrect',

  // `trap 'cmd' SIGNAL`: cmd is shell code run on signal/exit. EXIT fires at
  // the end of every BashTool invocation, so execution is guaranteed.
  'trap',

  // `enable -f /path/lib.so name`: dlopen()s an arbitrary shared object as a
  // builtin — native code execution.
  'enable',

  // `mapfile -C callback -c N` / `readarray -C callback`: the callback is
  // shell code invoked every N input lines.
  'mapfile',
  'readarray',

  // `hash -p /path cmd`: poisons bash's command-lookup cache so a later `cmd`
  // in the same command line resolves to /path instead of a PATH lookup.
  'hash',

  // `bind -x '"key":cmd'` and `complete -C cmd` take code-string arguments;
  // they are interactive-only callbacks (low impact in non-interactive
  // BashTool shells) and are blocked for consistency. `compgen -C cmd` is NOT
  // interactive-only: it runs the -C argument immediately.
  'bind',
  'complete',
  'compgen',

  // `alias name='cmd'`: aliases are not expanded in non-interactive bash by
  // default, but `shopt -s expand_aliases` turns that on. Blocked as
  // defense-in-depth against alias definition followed by use.
  'alias',

  // `let EXPR` evaluates EXPR arithmetically, exactly like $(( EXPR )). Array
  // subscripts inside the expression expand $(cmd) at evaluation time even if
  // the argument was single-quoted: `let 'x=a[$(id)]'` runs id. tree-sitter
  // sees only an opaque raw_string leaf; walkArithmetic guards the same
  // primitive, but `let` arrives as a plain command node.
  'let',
])
2135  
/**
 * Builtins that re-parse a NAME operand internally and arithmetically
 * evaluate `arr[EXPR]` subscripts — including $(cmd) inside the subscript —
 * even when the argv element came from a single-quoted raw_string.
 * `test -v 'a[$(id)]'` → tree-sitter sees an opaque leaf, bash runs id.
 *
 * Maps builtin name → set of flags whose next argument is a NAME.
 *
 * The table is indexed with an untrusted command name, so it is created with
 * a null prototype: a lookup such as `SUBSCRIPT_EVAL_FLAGS['constructor']`
 * or `['toString']` must yield `undefined`, not an inherited
 * Object.prototype member that a caller would then mistake for a Set.
 */
const SUBSCRIPT_EVAL_FLAGS: Record<string, Set<string>> = Object.assign(
  Object.create(null) as Record<string, Set<string>>,
  {
    test: new Set(['-v', '-R']),
    '[': new Set(['-v', '-R']),
    '[[': new Set(['-v', '-R']),
    printf: new Set(['-v']),
    read: new Set(['-a']),
    unset: new Set(['-v']),
    // bash 5.1+: `wait -p VAR [id...]` stores the waited PID into VAR. When
    // VAR is `arr[EXPR]`, bash arithmetically evaluates the subscript —
    // running $(cmd) even from a single-quoted raw_string. Verified on bash
    // 5.3.9: `: & wait -p 'a[$(id)]' %1` executes id.
    wait: new Set(['-p']),
  },
)
2156  
/**
 * Arithmetic comparison operators of `[[ ARG1 OP ARG2 ]]`.
 *
 * Per the bash manual, with `[[` both ARG1 and ARG2 are evaluated as
 * arithmetic expressions, and arithmetic evaluation recursively expands array
 * subscripts. Hence `[[ 'a[$(id)]' -eq 0 ]]` runs `id` although tree-sitter
 * only sees an opaque raw_string operand.
 *
 * Unlike -v/-R (unary: NAME follows the flag) these operators are binary, so
 * the subscript may sit on EITHER side and SUBSCRIPT_EVAL_FLAGS's "next
 * argument" model is not enough.
 *
 * `[` / `test` are not affected (bash reports "integer expression expected"),
 * but the test_command handler normalizes argv[0] to '[[' for both forms, so
 * they receive this check as well — slight over-blocking on the safe side.
 */
const TEST_ARITH_CMP_OPS = new Set<string>([
  '-eq',
  '-ne',
  '-lt',
  '-le',
  '-gt',
  '-ge',
])
2170  
/**
 * Builtins for which EVERY non-flag positional argument is a NAME that bash
 * re-parses, arithmetically evaluating any subscript — no flag needed.
 *
 * `read 'a[$(id)]'` runs id: each positional is a variable to assign into and
 * `arr[EXPR]` is valid there. `unset NAME...` behaves the same way (the
 * unset_command handler currently rejects raw_string children before this
 * point is reached, so its entry is defense-in-depth).
 *
 * Deliberately excluded: printf (positionals are FORMAT/data) and test/[
 * (operands are values; only -v/-R take a NAME). declare/typeset/local are
 * handled in declaration_command and never arrive here as plain commands.
 */
const BARE_SUBSCRIPT_NAME_BUILTINS = new Set<string>([
  'read',
  'unset',
])
2183  
/**
 * Flags of `read` whose following argument is data (prompt, delimiter,
 * count, fd, ...) rather than a NAME. Their operands are skipped so that
 * e.g. `read -p '[foo] ' var` is not rejected because of the `[` in the
 * prompt. `-a` is intentionally not listed: its operand IS a NAME.
 */
const READ_DATA_FLAGS = new Set<string>([
  '-p',
  '-d',
  '-n',
  '-N',
  '-t',
  '-u',
  '-i',
])
2190  
2191  // SHELL_KEYWORDS imported from bashParser.ts — shell reserved words can never
2192  // be legitimate argv[0]; if they appear, the parser mis-parsed a compound
2193  // command. Reject to avoid nonsense argv reaching downstream.
2194  
/**
 * Matches any string containing `/proc/<anything>/environ`.
 *
 * The middle part is `.*`, not `[^/]*`, on purpose: Linux resolves `..`
 * inside procfs, so a path such as `/proc/self/../self/environ` is valid and
 * has to be caught too.
 */
const PROC_ENVIRON_RE = /\/proc\/.*\/environ/
2198  
/**
 * Matches a newline, optional spaces/tabs, then `#`.
 *
 * Applied to argv elements, env var values and redirect targets. The
 * downstream stripSafeWrappers re-tokenizes `.text` one line at a time and
 * treats a `#` that follows a newline as the start of a comment, which would
 * hide every argument after it.
 */
const NEWLINE_HASH_RE = /\n[ \t]*#/
2205  
/**
 * Outcome of a semantic check: either success, or failure carrying a
 * human-readable reason.
 */
export type SemanticCheckResult =
  | { ok: true }
  | { ok: false; reason: string }
2207  
/**
 * Post-argv semantic checks. Run after parseForSecurity returns 'simple' to
 * catch commands that tokenize fine but are dangerous by name or by argument
 * content.
 *
 * For each command the function:
 *  1. peels known wrapper commands (time, nohup, timeout, nice, env, stdbuf)
 *     off the front of argv so the checks see the wrapped command; any
 *     wrapper form that cannot be modelled is rejected (fail closed);
 *  2. validates the resulting command name (empty, placeholder, fragment,
 *     shell keyword, zsh builtin, eval-like builtin);
 *  3. validates argument content (array-subscript NAME operands, newline+#,
 *     jq system()/file flags, /proc/<pid>/environ access).
 *
 * @param commands Simple commands extracted from the parsed input.
 * @returns `{ ok: true }` if every command passes, otherwise the first
 *   failure as `{ ok: false, reason }`.
 */
export function checkSemantics(commands: SimpleCommand[]): SemanticCheckResult {
  for (const cmd of commands) {
    // Strip safe wrapper commands (nohup, time, timeout N, nice -n N, ...) so
    // `nohup eval "..."` and `timeout 5 jq 'system(...)'` are checked against
    // the wrapped command, not the wrapper. Inlined here to avoid a circular
    // import with bashPermissions.ts.
    let a = cmd.argv
    for (;;) {
      if (a[0] === 'time' || a[0] === 'nohup') {
        a = a.slice(1)
      } else if (a[0] === 'timeout') {
        // `timeout 5`, `timeout 5s`, `timeout 5.5`, plus optional GNU flags
        // preceding the duration. Long: --foreground, --kill-after=N,
        // --signal=SIG, --preserve-status. Short: -k DUR, -s SIG, -v (also
        // fused: -k5, -sTERM).
        // SECURITY: an earlier version only skipped `--long` flags, so
        // `timeout -k 5 10 eval ...` broke out with name='timeout' and the
        // wrapped eval was never checked. Known short flags are handled and
        // ANY unrecognized flag fails closed — an unknown flag means the
        // wrapped command cannot be located.
        let i = 1
        while (i < a.length) {
          const arg = a[i]!
          if (
            arg === '--foreground' ||
            arg === '--preserve-status' ||
            arg === '--verbose'
          ) {
            i++ // known no-value long flags
          } else if (/^--(?:kill-after|signal)=[A-Za-z0-9_.+-]+$/.test(arg)) {
            i++ // --kill-after=5, --signal=TERM (value fused with =)
          } else if (
            (arg === '--kill-after' || arg === '--signal') &&
            a[i + 1] &&
            /^[A-Za-z0-9_.+-]+$/.test(a[i + 1]!)
          ) {
            i += 2 // --kill-after 5, --signal TERM (space-separated)
          } else if (arg.startsWith('--')) {
            // Unknown long flag, OR --kill-after/--signal with a value outside
            // the allowlist (e.g. placeholder from $() substitution).
            return {
              ok: false,
              reason: `timeout with ${arg} flag cannot be statically analyzed`,
            }
          } else if (arg === '-v') {
            i++ // --verbose, no argument
          } else if (
            (arg === '-k' || arg === '-s') &&
            a[i + 1] &&
            /^[A-Za-z0-9_.+-]+$/.test(a[i + 1]!)
          ) {
            i += 2 // -k DURATION / -s SIGNAL — separate value
          } else if (/^-[ks][A-Za-z0-9_.+-]+$/.test(arg)) {
            i++ // fused: -k5, -sTERM
          } else if (arg.startsWith('-')) {
            // Unknown flag OR -k/-s with a value outside the allowlist — the
            // wrapped command cannot be located. Reject rather than fall
            // through to name='timeout'.
            return {
              ok: false,
              reason: `timeout with ${arg} flag cannot be statically analyzed`,
            }
          } else {
            break // non-flag — should be the duration
          }
        }
        const duration = a[i]
        if (duration === undefined) {
          break // no more args — `timeout` alone, inert
        }
        if (/^\d+(?:\.\d+)?[smhd]?$/.test(duration)) {
          a = a.slice(i + 1)
        } else {
          // SECURITY: a duration argument exists but is not one we model.
          // GNU timeout parses via xstrtod() (libc strtod) and accepts `.5`,
          // `+5`, `5e-1`, `inf`, `infinity`, hex floats — none of which match
          // the regex above. Breaking here would be fail-OPEN:
          // `timeout .5 eval "id"` would leave name='timeout' and eval would
          // never be checked. Fail CLOSED instead, consistent with the
          // unknown-flag handling above. The explicit `undefined` test (not
          // truthiness) also sends an empty-string duration down this path
          // instead of the inert "no more args" branch.
          return {
            ok: false,
            reason: `timeout duration '${duration}' cannot be statically analyzed`,
          }
        }
      } else if (a[0] === 'nice') {
        // `nice cmd`, `nice -n N cmd`, `nice -N cmd` (legacy). All run cmd
        // at a different priority. The argv[0] check must see the wrapped cmd.
        if (a[1] === '-n' && a[2] && /^-?\d+$/.test(a[2])) {
          a = a.slice(3)
        } else if (a[1] && /^-\d+$/.test(a[1])) {
          a = a.slice(2) // `nice -10 cmd`
        } else if (a[1] && /[$(`]/.test(a[1])) {
          // SECURITY: walkArgument returns node.text for arithmetic_expansion,
          // so `nice $((0-5)) jq ...` has a[1]='$((0-5))'. Bash expands it to
          // '-5' (legacy nice syntax) and execs jq; slicing one element here
          // would set name='$((0-5))' and skip the jq system() check.
          // Fail closed — mirrors the timeout-duration handling.
          return {
            ok: false,
            reason: `nice argument '${a[1]}' contains expansion — cannot statically determine wrapped command`,
          }
        } else {
          a = a.slice(1) // bare `nice cmd`
        }
      } else if (a[0] === 'env') {
        // `env [VAR=val...] [-i] [-0] [-v] [-u NAME...] cmd args` runs cmd.
        // The argv[0] check must see cmd, not env. Skip known-safe forms only.
        // SECURITY: -S splits a string into argv (mini-shell) — must reject.
        // -C/-P change cwd/PATH — wrapped cmd runs elsewhere, reject.
        // Any OTHER flag → reject (fail closed, not fail-open to name='env').
        let i = 1
        while (i < a.length) {
          const arg = a[i]!
          if (arg.includes('=') && !arg.startsWith('-')) {
            i++ // VAR=val assignment
          } else if (arg === '-i' || arg === '-0' || arg === '-v') {
            i++ // flags with no argument
          } else if (arg === '-u' && a[i + 1]) {
            i += 2 // -u NAME unsets; takes one arg
          } else if (arg.startsWith('-')) {
            // -S (argv splitter), -C (altwd), -P (altpath), --anything,
            // or unknown flag. Can't model — reject the whole command.
            return {
              ok: false,
              reason: `env with ${arg} flag cannot be statically analyzed`,
            }
          } else {
            break // the wrapped command
          }
        }
        if (i < a.length) {
          a = a.slice(i)
        } else {
          break // `env` alone (no wrapped cmd) — inert, name='env'
        }
      } else if (a[0] === 'stdbuf') {
        // `stdbuf -o0 cmd` (fused), `stdbuf -o 0 cmd` (space-separated),
        // multiple flags (`stdbuf -o0 -eL cmd`), long forms (`--output=0`).
        // SECURITY: an earlier version stripped only ONE flag and fell
        // through to slice(2) for anything unrecognized, so
        // `stdbuf --output 0 eval` → ['0','eval',...] → name='0' hid eval.
        // Iterate all known flag forms and fail closed on any unknown flag.
        let i = 1
        while (i < a.length) {
          const arg = a[i]!
          if (STDBUF_SHORT_SEP_RE.test(arg) && a[i + 1]) {
            i += 2 // -o MODE (space-separated)
          } else if (STDBUF_SHORT_FUSED_RE.test(arg)) {
            i++ // -o0 (fused)
          } else if (STDBUF_LONG_RE.test(arg)) {
            i++ // --output=MODE (fused long)
          } else if (arg.startsWith('-')) {
            // --output MODE (space-separated long) or unknown flag. GNU
            // stdbuf long options use `=` syntax, but getopt_long also
            // accepts space-separated — can't enumerate safely, reject.
            return {
              ok: false,
              reason: `stdbuf with ${arg} flag cannot be statically analyzed`,
            }
          } else {
            break // the wrapped command
          }
        }
        if (i > 1 && i < a.length) {
          a = a.slice(i)
        } else {
          break // `stdbuf` with no flags or no wrapped cmd — inert
        }
      } else {
        break
      }
    }
    const name = a[0]
    if (name === undefined) continue

    // SECURITY: Empty command name. Quoted empty (`"" cmd`) is harmless —
    // bash tries to exec "" and fails with "command not found". But an
    // UNQUOTED empty expansion at command position (`V="" && $V cmd`) is a
    // bypass: bash drops the empty field and runs `cmd` as argv[0], while
    // name="" would skip every builtin check below. resolveSimpleExpansion
    // rejects the $V case; this catches any other path to an empty argv[0].
    if (name === '') {
      return {
        ok: false,
        reason: 'Empty command name — argv[0] may not reflect what bash runs',
      }
    }

    // Defense-in-depth: argv[0] should never be a placeholder (static vars
    // return their real value, unknown vars reject). If an upstream bug ever
    // lets one through, a placeholder as command name means the command is
    // determined at runtime → unsafe.
    if (name.includes(CMDSUB_PLACEHOLDER) || name.includes(VAR_PLACEHOLDER)) {
      return {
        ok: false,
        reason: 'Command name is runtime-determined (placeholder argv[0])',
      }
    }

    // argv[0] starts with an operator/flag: this is a fragment, not a
    // command. Likely a line-continuation leak or a mistake.
    if (name.startsWith('-') || name.startsWith('|') || name.startsWith('&')) {
      return {
        ok: false,
        reason: 'Command appears to be an incomplete fragment',
      }
    }

    // SECURITY: builtins that re-parse a NAME operand internally. bash
    // arithmetically evaluates `arr[EXPR]` in NAME position, running $(cmd)
    // in the subscript even when the argv element arrived from a
    // single-quoted raw_string (opaque leaf to tree-sitter). Forms handled:
    // separate (`printf -v NAME`), combined (`-ra NAME`) and fused
    // (`printf -vNAME`, getopt-style).
    // `printf '[%s]' x` stays safe — `[` in the format string, not after `-v`.
    //
    // `name` is untrusted, so only OWN properties of the table count: a
    // command called `constructor` or `toString` must not pick up an
    // inherited Object.prototype member (calling `.has` on it would throw).
    const dangerFlags = Object.prototype.hasOwnProperty.call(
      SUBSCRIPT_EVAL_FLAGS,
      name,
    )
      ? SUBSCRIPT_EVAL_FLAGS[name]
      : undefined
    if (dangerFlags !== undefined) {
      for (let i = 1; i < a.length; i++) {
        const arg = a[i]!
        // Separate form: `-v` then NAME in next arg.
        if (dangerFlags.has(arg) && a[i + 1]?.includes('[')) {
          return {
            ok: false,
            reason: `'${name} ${arg}' operand contains array subscript — bash evaluates $(cmd) in subscripts`,
          }
        }
        // Combined short flags: `-ra` is bash shorthand for `-r -a`. If any
        // danger flag character appears in the cluster, its NAME operand is
        // the next argument.
        if (
          arg.length > 2 &&
          arg[0] === '-' &&
          arg[1] !== '-' &&
          !arg.includes('[')
        ) {
          for (const flag of dangerFlags) {
            if (flag.length === 2 && arg.includes(flag[1]!)) {
              if (a[i + 1]?.includes('[')) {
                return {
                  ok: false,
                  reason: `'${name} ${flag}' (combined in '${arg}') operand contains array subscript — bash evaluates $(cmd) in subscripts`,
                }
              }
            }
          }
        }
        // Fused form: `-vNAME` in one arg. Only short-option flags fuse
        // (getopt), so check -v/-a/-R. `[[` uses test_operator nodes only.
        for (const flag of dangerFlags) {
          if (
            flag.length === 2 &&
            arg.startsWith(flag) &&
            arg.length > 2 &&
            arg.includes('[')
          ) {
            return {
              ok: false,
              reason: `'${name} ${flag}' (fused) operand contains array subscript — bash evaluates $(cmd) in subscripts`,
            }
          }
        }
      }
    }

    // SECURITY: `[[ ARG OP ARG ]]` arithmetic comparison. bash evaluates
    // BOTH operands as arithmetic expressions, recursively expanding
    // `arr[$(cmd)]` subscripts even from a single-quoted raw_string. Check
    // the operand adjacent to each arith-cmp operator on BOTH sides.
    // String comparisons (==/!=/=~) do NOT trigger arithmetic evaluation —
    // `[[ 'a[x]' == y ]]` is a literal string comparison.
    if (name === '[[') {
      // i starts at 2: a[0]='[[' (contains '['), a[1] is the first real
      // operand. A binary op can't appear before index 2.
      for (let i = 2; i < a.length; i++) {
        if (!TEST_ARITH_CMP_OPS.has(a[i]!)) continue
        if (a[i - 1]?.includes('[') || a[i + 1]?.includes('[')) {
          return {
            ok: false,
            reason: `'[[ ... ${a[i]} ... ]]' operand contains array subscript — bash arithmetically evaluates $(cmd) in subscripts`,
          }
        }
      }
    }

    // SECURITY: `read`/`unset` treat EVERY bare positional as a NAME —
    // no flag needed. `read 'a[$(id)]' <<< data` executes id even though
    // argv[1] arrived from a single-quoted raw_string and no -a flag is
    // present. Skip operands of read's data-taking flags (-p PROMPT etc.)
    // to avoid blocking `read -p '[foo] ' var`.
    if (BARE_SUBSCRIPT_NAME_BUILTINS.has(name)) {
      let skipNext = false
      for (let i = 1; i < a.length; i++) {
        const arg = a[i]!
        if (skipNext) {
          skipNext = false
          continue
        }
        if (arg[0] === '-') {
          if (name === 'read') {
            if (READ_DATA_FLAGS.has(arg)) {
              skipNext = true
            } else if (arg.length > 2 && arg[1] !== '-') {
              // Combined short flag like `-rp`. Getopt-style: the first
              // data-flag char consumes the rest of the arg as its operand
              // (`-p[foo]` → prompt=`[foo]`), or the next arg if it is last
              // (`-rp '[foo]'` → prompt=`[foo]`). So skip the next arg only
              // when the first data-flag char is the final character.
              for (let j = 1; j < arg.length; j++) {
                if (READ_DATA_FLAGS.has('-' + arg[j])) {
                  if (j === arg.length - 1) skipNext = true
                  break
                }
              }
            }
          }
          continue
        }
        if (arg.includes('[')) {
          return {
            ok: false,
            reason: `'${name}' positional NAME '${arg}' contains array subscript — bash evaluates $(cmd) in subscripts`,
          }
        }
      }
    }

    // SECURITY: Shell reserved keywords as argv[0] indicate a tree-sitter
    // mis-parse. `! for i in a; do :; done` parses as `command "for i in a"`
    // + `command "do :"` + `command "done"` — tree-sitter fails to recognize
    // `for` after `!` as a compound command start. Reject: keywords can never
    // be legitimate command names, and argv like ['do','false'] is nonsense.
    if (SHELL_KEYWORDS.has(name)) {
      return {
        ok: false,
        reason: `Shell keyword '${name}' as command name — tree-sitter mis-parse`,
      }
    }

    // Check argv (not .text) to catch both single-quote (`'\n#'`) and
    // double-quote (`"\n#"`) variants. Env vars and redirects are also
    // part of the .text span so the same downstream bug applies.
    // Heredoc bodies are excluded from argv so markdown `##` headers
    // don't trigger this.
    // TODO: remove once downstream path validation operates on argv.
    for (const arg of cmd.argv) {
      if (arg.includes('\n') && NEWLINE_HASH_RE.test(arg)) {
        return {
          ok: false,
          reason:
            'Newline followed by # inside a quoted argument can hide arguments from path validation',
        }
      }
    }
    for (const ev of cmd.envVars) {
      if (ev.value.includes('\n') && NEWLINE_HASH_RE.test(ev.value)) {
        return {
          ok: false,
          reason:
            'Newline followed by # inside an env var value can hide arguments from path validation',
        }
      }
    }
    for (const r of cmd.redirects) {
      if (r.target.includes('\n') && NEWLINE_HASH_RE.test(r.target)) {
        return {
          ok: false,
          reason:
            'Newline followed by # inside a redirect target can hide arguments from path validation',
        }
      }
    }

    // jq's system() built-in executes arbitrary shell commands, and flags
    // like --from-file can read arbitrary files into jq. On the legacy path
    // these are caught by validateJqCommand in bashSecurity.ts, but that
    // validator is gated behind `astSubcommands === null` and never runs
    // when the AST parse succeeds, so the AST path needs its own check.
    if (name === 'jq') {
      for (const arg of a) {
        if (/\bsystem\s*\(/.test(arg)) {
          return {
            ok: false,
            reason:
              'jq command contains system() function which executes arbitrary commands',
          }
        }
      }
      // SECURITY: jq stacks short options (`-nf prog.jq` is `-n -f prog.jq`),
      // so `f`/`L` must be detected ANYWHERE in a short-option cluster, not
      // only directly after the dash — same approach as the fc/compgen
      // checks below. This also covers the plain (`-f`, `-L`) and fused
      // (`-L/path`) forms.
      if (
        a.some(arg =>
          /^(?:-[A-Za-z]*[fL]|--(?:from-file|rawfile|slurpfile|library-path)(?:$|=))/.test(
            arg,
          ),
        )
      ) {
        return {
          ok: false,
          reason:
            'jq command contains dangerous flags that could execute code or read arbitrary files',
        }
      }
    }

    if (ZSH_DANGEROUS_BUILTINS.has(name)) {
      return {
        ok: false,
        reason: `Zsh builtin '${name}' can bypass security checks`,
      }
    }

    if (EVAL_LIKE_BUILTINS.has(name)) {
      const rest = a.slice(1)
      const isInertForm =
        // `command -v foo` / `command -V foo` are POSIX existence checks that
        // only print paths — they never execute argv[1]. Bare `command foo`
        // does bypass function/alias lookup, so it stays blocked.
        (name === 'command' && (a[1] === '-v' || a[1] === '-V')) ||
        // `fc -l`, `fc -ln` list history — safe. `fc -e ed` invokes an editor
        // then executes; `fc -s [pat=rep]` RE-EXECUTES the last matching
        // command. Only forms with no short-opt containing `e` or `s` pass.
        (name === 'fc' && !rest.some(arg => /^-[^-]*[es]/.test(arg))) ||
        // `compgen -c/-f/-v` only list completions — safe. `compgen -C cmd`
        // immediately executes cmd; `-F func` calls a shell function;
        // `-W list` word-expands its argument (including $(cmd) even from a
        // single-quoted raw_string). Only forms with no short-opt containing
        // C/F/W pass (case-sensitive: -c/-f are safe).
        (name === 'compgen' && !rest.some(arg => /^-[^-]*[CFW]/.test(arg)))
      if (!isInertForm) {
        return {
          ok: false,
          reason: `'${name}' evaluates arguments as shell code`,
        }
      }
    }

    // /proc/*/environ exposes env vars (including secrets) of other processes.
    // Check argv and redirect targets — `cat /proc/self/environ` and
    // `cat < /proc/self/environ` both read it.
    for (const arg of cmd.argv) {
      if (arg.includes('/proc/') && PROC_ENVIRON_RE.test(arg)) {
        return {
          ok: false,
          reason: 'Accesses /proc/*/environ which may expose secrets',
        }
      }
    }
    for (const r of cmd.redirects) {
      if (r.target.includes('/proc/') && PROC_ENVIRON_RE.test(r.target)) {
        return {
          ok: false,
          reason: 'Accesses /proc/*/environ which may expose secrets',
        }
      }
    }
  }
  return { ok: true }
}