// utils/readFileInRange.ts
  1  // ---------------------------------------------------------------------------
  2  // readFileInRange — line-oriented file reader with two code paths
  3  // ---------------------------------------------------------------------------
  4  //
  5  // Returns lines [offset, offset + maxLines) from a file.
  6  //
// Fast path (regular files < 10 MB):
//   Stats the path, reads the whole file with readFile(), then splits
//   lines in memory.  This avoids the per-chunk async overhead of
//   createReadStream and is ~2x faster for typical source files.
 11  //
 12  // Streaming path (large files, pipes, devices, etc.):
 13  //   Uses createReadStream with manual indexOf('\n') scanning.  Content is
 14  //   only accumulated for lines inside the requested range — lines outside
 15  //   the range are counted (for totalLines) but discarded, so reading line
 16  //   1 of a 100 GB file won't balloon RSS.
 17  //
 18  //   All event handlers (streamOnOpen/Data/End) are module-level named
 19  //   functions with zero closures.  State lives in a StreamState object;
 20  //   handlers access it via `this`, bound at registration time.
 21  //
 22  //   Lifecycle: `open`, `end`, and `error` use .once() (auto-remove).
 23  //   `data` fires until the stream ends or is destroyed — either way the
 24  //   stream and state become unreachable together and are GC'd.
 25  //
 26  //   On error (including maxBytes exceeded), stream.destroy(err) emits
 27  //   'error' → reject (passed directly to .once('error')).
 28  //
 29  // Both paths strip UTF-8 BOM and \r (CRLF → LF).
 30  //
// mtime: the streaming path fstat()s the stream's own fd; the fast path
// reuses the initial stat() result — no extra open() either way.
 32  //
 33  // maxBytes behavior depends on options.truncateOnByteLimit:
 34  //   false (default): legacy semantics — throws FileTooLargeError if the FILE
 35  //     size (fast path) or total streamed bytes (streaming) exceed maxBytes.
 36  //   true: caps SELECTED OUTPUT at maxBytes.  Stops at the last complete line
 37  //     that fits; sets truncatedByBytes in the result.  Never throws.
 38  // ---------------------------------------------------------------------------
 39  
 40  import { createReadStream, fstat } from 'fs'
 41  import { stat as fsStat, readFile } from 'fs/promises'
 42  import { formatFileSize } from './format.js'
 43  
 44  const FAST_PATH_MAX_SIZE = 10 * 1024 * 1024 // 10 MB
 45  
 46  export type ReadFileRangeResult = {
 47    content: string
 48    lineCount: number
 49    totalLines: number
 50    totalBytes: number
 51    readBytes: number
 52    mtimeMs: number
 53    /** true when output was clipped to maxBytes under truncate mode */
 54    truncatedByBytes?: boolean
 55  }
 56  
 57  export class FileTooLargeError extends Error {
 58    constructor(
 59      public sizeInBytes: number,
 60      public maxSizeBytes: number,
 61    ) {
 62      super(
 63        `File content (${formatFileSize(sizeInBytes)}) exceeds maximum allowed size (${formatFileSize(maxSizeBytes)}). Use offset and limit parameters to read specific portions of the file, or search for specific content instead of reading the whole file.`,
 64      )
 65      this.name = 'FileTooLargeError'
 66    }
 67  }
 68  
 69  // ---------------------------------------------------------------------------
 70  // Public entry point
 71  // ---------------------------------------------------------------------------
 72  
 73  export async function readFileInRange(
 74    filePath: string,
 75    offset = 0,
 76    maxLines?: number,
 77    maxBytes?: number,
 78    signal?: AbortSignal,
 79    options?: { truncateOnByteLimit?: boolean },
 80  ): Promise<ReadFileRangeResult> {
 81    signal?.throwIfAborted()
 82    const truncateOnByteLimit = options?.truncateOnByteLimit ?? false
 83  
 84    // stat to decide the code path and guard against OOM.
 85    // For regular files under 10 MB: readFile + in-memory split (fast).
 86    // Everything else (large files, FIFOs, devices): streaming.
 87    const stats = await fsStat(filePath)
 88  
 89    if (stats.isDirectory()) {
 90      throw new Error(
 91        `EISDIR: illegal operation on a directory, read '${filePath}'`,
 92      )
 93    }
 94  
 95    if (stats.isFile() && stats.size < FAST_PATH_MAX_SIZE) {
 96      if (
 97        !truncateOnByteLimit &&
 98        maxBytes !== undefined &&
 99        stats.size > maxBytes
100      ) {
101        throw new FileTooLargeError(stats.size, maxBytes)
102      }
103  
104      const text = await readFile(filePath, { encoding: 'utf8', signal })
105      return readFileInRangeFast(
106        text,
107        stats.mtimeMs,
108        offset,
109        maxLines,
110        truncateOnByteLimit ? maxBytes : undefined,
111      )
112    }
113  
114    return readFileInRangeStreaming(
115      filePath,
116      offset,
117      maxLines,
118      maxBytes,
119      truncateOnByteLimit,
120      signal,
121    )
122  }
123  
124  // ---------------------------------------------------------------------------
125  // Fast path — readFile + in-memory split
126  // ---------------------------------------------------------------------------
127  
128  function readFileInRangeFast(
129    raw: string,
130    mtimeMs: number,
131    offset: number,
132    maxLines: number | undefined,
133    truncateAtBytes: number | undefined,
134  ): ReadFileRangeResult {
135    const endLine = maxLines !== undefined ? offset + maxLines : Infinity
136  
137    // Strip BOM.
138    const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw
139  
140    // Split lines, strip \r, select range.
141    const selectedLines: string[] = []
142    let lineIndex = 0
143    let startPos = 0
144    let newlinePos: number
145    let selectedBytes = 0
146    let truncatedByBytes = false
147  
148    function tryPush(line: string): boolean {
149      if (truncateAtBytes !== undefined) {
150        const sep = selectedLines.length > 0 ? 1 : 0
151        const nextBytes = selectedBytes + sep + Buffer.byteLength(line)
152        if (nextBytes > truncateAtBytes) {
153          truncatedByBytes = true
154          return false
155        }
156        selectedBytes = nextBytes
157      }
158      selectedLines.push(line)
159      return true
160    }
161  
162    while ((newlinePos = text.indexOf('\n', startPos)) !== -1) {
163      if (lineIndex >= offset && lineIndex < endLine && !truncatedByBytes) {
164        let line = text.slice(startPos, newlinePos)
165        if (line.endsWith('\r')) {
166          line = line.slice(0, -1)
167        }
168        tryPush(line)
169      }
170      lineIndex++
171      startPos = newlinePos + 1
172    }
173  
174    // Final fragment (no trailing newline).
175    if (lineIndex >= offset && lineIndex < endLine && !truncatedByBytes) {
176      let line = text.slice(startPos)
177      if (line.endsWith('\r')) {
178        line = line.slice(0, -1)
179      }
180      tryPush(line)
181    }
182    lineIndex++
183  
184    const content = selectedLines.join('\n')
185    return {
186      content,
187      lineCount: selectedLines.length,
188      totalLines: lineIndex,
189      totalBytes: Buffer.byteLength(text, 'utf8'),
190      readBytes: Buffer.byteLength(content, 'utf8'),
191      mtimeMs,
192      ...(truncatedByBytes ? { truncatedByBytes: true } : {}),
193    }
194  }
195  
196  // ---------------------------------------------------------------------------
197  // Streaming path — createReadStream + event handlers
198  // ---------------------------------------------------------------------------
199  
200  type StreamState = {
201    stream: ReturnType<typeof createReadStream>
202    offset: number
203    endLine: number
204    maxBytes: number | undefined
205    truncateOnByteLimit: boolean
206    resolve: (value: ReadFileRangeResult) => void
207    totalBytesRead: number
208    selectedBytes: number
209    truncatedByBytes: boolean
210    currentLineIndex: number
211    selectedLines: string[]
212    partial: string
213    isFirstChunk: boolean
214    resolveMtime: (ms: number) => void
215    mtimeReady: Promise<number>
216  }
217  
218  function streamOnOpen(this: StreamState, fd: number): void {
219    fstat(fd, (err, stats) => {
220      this.resolveMtime(err ? 0 : stats.mtimeMs)
221    })
222  }
223  
224  function streamOnData(this: StreamState, chunk: string): void {
225    if (this.isFirstChunk) {
226      this.isFirstChunk = false
227      if (chunk.charCodeAt(0) === 0xfeff) {
228        chunk = chunk.slice(1)
229      }
230    }
231  
232    this.totalBytesRead += Buffer.byteLength(chunk)
233    if (
234      !this.truncateOnByteLimit &&
235      this.maxBytes !== undefined &&
236      this.totalBytesRead > this.maxBytes
237    ) {
238      this.stream.destroy(
239        new FileTooLargeError(this.totalBytesRead, this.maxBytes),
240      )
241      return
242    }
243  
244    const data = this.partial.length > 0 ? this.partial + chunk : chunk
245    this.partial = ''
246  
247    let startPos = 0
248    let newlinePos: number
249    while ((newlinePos = data.indexOf('\n', startPos)) !== -1) {
250      if (
251        this.currentLineIndex >= this.offset &&
252        this.currentLineIndex < this.endLine
253      ) {
254        let line = data.slice(startPos, newlinePos)
255        if (line.endsWith('\r')) {
256          line = line.slice(0, -1)
257        }
258        if (this.truncateOnByteLimit && this.maxBytes !== undefined) {
259          const sep = this.selectedLines.length > 0 ? 1 : 0
260          const nextBytes = this.selectedBytes + sep + Buffer.byteLength(line)
261          if (nextBytes > this.maxBytes) {
262            // Cap hit — collapse the selection range so nothing more is
263            // accumulated.  Stream continues (to count totalLines).
264            this.truncatedByBytes = true
265            this.endLine = this.currentLineIndex
266          } else {
267            this.selectedBytes = nextBytes
268            this.selectedLines.push(line)
269          }
270        } else {
271          this.selectedLines.push(line)
272        }
273      }
274      this.currentLineIndex++
275      startPos = newlinePos + 1
276    }
277  
278    // Only keep the trailing fragment when inside the selected range.
279    // Outside the range we just count newlines — discarding prevents
280    // unbounded memory growth on huge single-line files.
281    if (startPos < data.length) {
282      if (
283        this.currentLineIndex >= this.offset &&
284        this.currentLineIndex < this.endLine
285      ) {
286        const fragment = data.slice(startPos)
287        // In truncate mode, `partial` can grow unboundedly if the selected
288        // range contains a huge single line (no newline across many chunks).
289        // Once the fragment alone would overflow the remaining budget, we know
290        // the completed line can never fit — set truncated, collapse the
291        // selection range, and discard the fragment to stop accumulation.
292        if (this.truncateOnByteLimit && this.maxBytes !== undefined) {
293          const sep = this.selectedLines.length > 0 ? 1 : 0
294          const fragBytes = this.selectedBytes + sep + Buffer.byteLength(fragment)
295          if (fragBytes > this.maxBytes) {
296            this.truncatedByBytes = true
297            this.endLine = this.currentLineIndex
298            return
299          }
300        }
301        this.partial = fragment
302      }
303    }
304  }
305  
306  function streamOnEnd(this: StreamState): void {
307    let line = this.partial
308    if (line.endsWith('\r')) {
309      line = line.slice(0, -1)
310    }
311    if (
312      this.currentLineIndex >= this.offset &&
313      this.currentLineIndex < this.endLine
314    ) {
315      if (this.truncateOnByteLimit && this.maxBytes !== undefined) {
316        const sep = this.selectedLines.length > 0 ? 1 : 0
317        const nextBytes = this.selectedBytes + sep + Buffer.byteLength(line)
318        if (nextBytes > this.maxBytes) {
319          this.truncatedByBytes = true
320        } else {
321          this.selectedLines.push(line)
322        }
323      } else {
324        this.selectedLines.push(line)
325      }
326    }
327    this.currentLineIndex++
328  
329    const content = this.selectedLines.join('\n')
330    const truncated = this.truncatedByBytes
331    this.mtimeReady.then(mtimeMs => {
332      this.resolve({
333        content,
334        lineCount: this.selectedLines.length,
335        totalLines: this.currentLineIndex,
336        totalBytes: this.totalBytesRead,
337        readBytes: Buffer.byteLength(content, 'utf8'),
338        mtimeMs,
339        ...(truncated ? { truncatedByBytes: true } : {}),
340      })
341    })
342  }
343  
344  function readFileInRangeStreaming(
345    filePath: string,
346    offset: number,
347    maxLines: number | undefined,
348    maxBytes: number | undefined,
349    truncateOnByteLimit: boolean,
350    signal?: AbortSignal,
351  ): Promise<ReadFileRangeResult> {
352    return new Promise((resolve, reject) => {
353      const state: StreamState = {
354        stream: createReadStream(filePath, {
355          encoding: 'utf8',
356          highWaterMark: 512 * 1024,
357          ...(signal ? { signal } : undefined),
358        }),
359        offset,
360        endLine: maxLines !== undefined ? offset + maxLines : Infinity,
361        maxBytes,
362        truncateOnByteLimit,
363        resolve,
364        totalBytesRead: 0,
365        selectedBytes: 0,
366        truncatedByBytes: false,
367        currentLineIndex: 0,
368        selectedLines: [],
369        partial: '',
370        isFirstChunk: true,
371        resolveMtime: () => {},
372        mtimeReady: null as unknown as Promise<number>,
373      }
374      state.mtimeReady = new Promise<number>(r => {
375        state.resolveMtime = r
376      })
377  
378      state.stream.once('open', streamOnOpen.bind(state))
379      state.stream.on('data', streamOnData.bind(state))
380      state.stream.once('end', streamOnEnd.bind(state))
381      state.stream.once('error', reject)
382    })
383  }