Cradicle Explorer

/ utils / pdf.ts
pdf.ts
import { randomUUID } from 'crypto'
import { mkdir, readdir, readFile, rm } from 'fs/promises'
import { join } from 'path'
import {
  PDF_MAX_EXTRACT_SIZE,
  PDF_TARGET_RAW_SIZE,
} from '../constants/apiLimits.js'
import { errorMessage } from './errors.js'
import { execFileNoThrow } from './execFileNoThrow.js'
import { formatFileSize } from './format.js'
import { getFsImplementation } from './fsOperations.js'
import { getToolResultsDir } from './toolResultStorage.js'
 13  
 14  export type PDFError = {
 15    reason:
 16      | 'empty'
 17      | 'too_large'
 18      | 'password_protected'
 19      | 'corrupted'
 20      | 'unknown'
 21      | 'unavailable'
 22    message: string
 23  }
 24  
 25  export type PDFResult<T> =
 26    | { success: true; data: T }
 27    | { success: false; error: PDFError }
 28  
 29  /**
 30   * Read a PDF file and return it as base64-encoded data.
 31   * @param filePath Path to the PDF file
 32   * @returns Result containing PDF data or a structured error
 33   */
 34  export async function readPDF(filePath: string): Promise<
 35    PDFResult<{
 36      type: 'pdf'
 37      file: {
 38        filePath: string
 39        base64: string
 40        originalSize: number
 41      }
 42    }>
 43  > {
 44    try {
 45      const fs = getFsImplementation()
 46      const stats = await fs.stat(filePath)
 47      const originalSize = stats.size
 48  
 49      // Check if file is empty
 50      if (originalSize === 0) {
 51        return {
 52          success: false,
 53          error: { reason: 'empty', message: `PDF file is empty: ${filePath}` },
 54        }
 55      }
 56  
 57      // Check if PDF exceeds maximum size
 58      // The API has a 32MB total request limit. After base64 encoding (~33% larger),
 59      // a PDF must be under ~20MB raw to leave room for conversation context.
 60      if (originalSize > PDF_TARGET_RAW_SIZE) {
 61        return {
 62          success: false,
 63          error: {
 64            reason: 'too_large',
 65            message: `PDF file exceeds maximum allowed size of ${formatFileSize(PDF_TARGET_RAW_SIZE)}.`,
 66          },
 67        }
 68      }
 69  
 70      const fileBuffer = await readFile(filePath)
 71  
 72      // Validate PDF magic bytes — reject files that aren't actually PDFs
 73      // (e.g., HTML files renamed to .pdf) before they enter conversation context.
 74      // Once an invalid PDF document block is in the message history, every subsequent
 75      // API call fails with 400 "The PDF specified was not valid" and the session
 76      // becomes unrecoverable without /clear.
 77      const header = fileBuffer.subarray(0, 5).toString('ascii')
 78      if (!header.startsWith('%PDF-')) {
 79        return {
 80          success: false,
 81          error: {
 82            reason: 'corrupted',
 83            message: `File is not a valid PDF (missing %PDF- header): ${filePath}`,
 84          },
 85        }
 86      }
 87  
 88      const base64 = fileBuffer.toString('base64')
 89  
 90      // Note: We cannot check page count here without parsing the PDF
 91      // The API will enforce the 100-page limit and return an error if exceeded
 92  
 93      return {
 94        success: true,
 95        data: {
 96          type: 'pdf',
 97          file: {
 98            filePath,
 99            base64,
100            originalSize,
101          },
102        },
103      }
104    } catch (e: unknown) {
105      return {
106        success: false,
107        error: {
108          reason: 'unknown',
109          message: errorMessage(e),
110        },
111      }
112    }
113  }
114  
/**
 * Ask `pdfinfo` (part of poppler-utils) how many pages a PDF has.
 *
 * @param filePath Path to the PDF file
 * @returns The page count, or `null` when `pdfinfo` exits with a non-zero
 *   code or its output contains no parsable `Pages:` line
 */
export async function getPDFPageCount(
  filePath: string,
): Promise<number | null> {
  const result = await execFileNoThrow('pdfinfo', [filePath], {
    timeout: 10_000,
    useCwd: false,
  })
  if (result.code !== 0) return null

  // pdfinfo prints one "Key:   value" pair per line; pick out the Pages entry.
  const pagesLine = /^Pages:\s+(\d+)/m.exec(result.stdout)
  if (pagesLine === null) return null

  const pages = parseInt(pagesLine[1]!, 10)
  return Number.isNaN(pages) ? null : pages
}
136  
/**
 * Success payload of `extractPDFPages`: describes where the rendered page
 * images were written and how many there are.
 */
export type PDFExtractPagesResult = {
  type: 'parts'
  file: {
    /** Path of the source PDF, exactly as passed by the caller. */
    filePath: string
    /** Size of the source PDF in bytes, as reported by `stat`. */
    originalSize: number
    /** Number of `.jpg` files found in `outputDir` after rendering. */
    count: number
    /** Directory that holds the rendered page images. */
    outputDir: string
  }
}
146  
// Memoized answer to "is pdftoppm installed?"; `undefined` means not probed yet.
let pdftoppmAvailable: boolean | undefined

/**
 * Forget the memoized pdftoppm availability so the next check probes again.
 * Intended for tests only.
 */
export function resetPdftoppmCache(): void {
  pdftoppmAvailable = undefined
}
155  
/**
 * Report whether the `pdftoppm` binary (part of poppler-utils) can be run.
 * The first call probes the binary; the answer is then memoized for the rest
 * of the process lifetime.
 */
export async function isPdftoppmAvailable(): Promise<boolean> {
  if (pdftoppmAvailable === undefined) {
    const probe = await execFileNoThrow('pdftoppm', ['-v'], {
      timeout: 5000,
      useCwd: false,
    })
    // `pdftoppm -v` writes its version banner to stderr and exits 0 (older
    // versions sometimes exit 99), so any stderr output also counts as present.
    pdftoppmAvailable = probe.code === 0 || probe.stderr.length > 0
  }
  return pdftoppmAvailable
}
170  
/**
 * Render the pages of a PDF to JPEG images using `pdftoppm`.
 *
 * Images are written with the prefix `page` (producing files like
 * `page-01.jpg`, `page-02.jpg`) into a fresh `pdf-<uuid>` directory under the
 * tool-results directory. This enables reading large PDFs and works with all
 * API providers.
 *
 * The output directory is only kept when extraction succeeds. On every failure
 * path it is removed again, because the caller never learns its path and could
 * not clean it up.
 *
 * @param filePath Path to the PDF file
 * @param options Optional page range (1-indexed, inclusive). Falsy bounds
 *   (`undefined`, `0`) and a `lastPage` of `Infinity` are not passed on to
 *   pdftoppm, which then uses its own default for that bound.
 * @returns The output directory and number of rendered pages, or a structured
 *   error. Failures are reported in the result; this function does not throw.
 */
export async function extractPDFPages(
  filePath: string,
  options?: { firstPage?: number; lastPage?: number },
): Promise<PDFResult<PDFExtractPagesResult>> {
  try {
    const fs = getFsImplementation()
    const stats = await fs.stat(filePath)
    const originalSize = stats.size

    if (originalSize === 0) {
      return {
        success: false,
        error: { reason: 'empty', message: `PDF file is empty: ${filePath}` },
      }
    }

    if (originalSize > PDF_MAX_EXTRACT_SIZE) {
      return {
        success: false,
        error: {
          reason: 'too_large',
          message: `PDF file exceeds maximum allowed size for text extraction (${formatFileSize(PDF_MAX_EXTRACT_SIZE)}).`,
        },
      }
    }

    const available = await isPdftoppmAvailable()
    if (!available) {
      return {
        success: false,
        error: {
          reason: 'unavailable',
          message:
            'pdftoppm is not installed. Install poppler-utils (e.g. `brew install poppler` or `apt-get install poppler-utils`) to enable PDF page rendering.',
        },
      }
    }

    const outputDir = join(getToolResultsDir(), `pdf-${randomUUID()}`)
    await mkdir(outputDir, { recursive: true })

    // Set to true only on the success path; any other exit from the block
    // below (error result or exception) removes the directory in `finally`.
    let keepOutputDir = false
    try {
      // pdftoppm produces files like <prefix>-01.jpg, <prefix>-02.jpg, etc.
      const prefix = join(outputDir, 'page')
      const args = ['-jpeg', '-r', '100']
      if (options?.firstPage) {
        args.push('-f', String(options.firstPage))
      }
      if (options?.lastPage && options.lastPage !== Infinity) {
        args.push('-l', String(options.lastPage))
      }
      args.push(filePath, prefix)
      const { code, stderr } = await execFileNoThrow('pdftoppm', args, {
        timeout: 120_000,
        useCwd: false,
      })

      if (code !== 0) {
        // Classify the failure from pdftoppm's diagnostic output.
        if (/password/i.test(stderr)) {
          return {
            success: false,
            error: {
              reason: 'password_protected',
              message:
                'PDF is password-protected. Please provide an unprotected version.',
            },
          }
        }
        if (/damaged|corrupt|invalid/i.test(stderr)) {
          return {
            success: false,
            error: {
              reason: 'corrupted',
              message: 'PDF file is corrupted or invalid.',
            },
          }
        }
        return {
          success: false,
          error: { reason: 'unknown', message: `pdftoppm failed: ${stderr}` },
        }
      }

      // Only the number of rendered images is reported, so no ordering is needed.
      const entries = await readdir(outputDir)
      const count = entries.filter(name => name.endsWith('.jpg')).length

      if (count === 0) {
        return {
          success: false,
          error: {
            reason: 'corrupted',
            message: 'pdftoppm produced no output pages. The PDF may be invalid.',
          },
        }
      }

      keepOutputDir = true
      return {
        success: true,
        data: {
          type: 'parts',
          file: {
            filePath,
            originalSize,
            outputDir,
            count,
          },
        },
      }
    } finally {
      if (!keepOutputDir) {
        // Best-effort cleanup: a failure to delete must not mask the real
        // extraction error, so rejections from rm are deliberately ignored.
        await rm(outputDir, { recursive: true, force: true }).catch(() => {})
      }
    }
  } catch (e: unknown) {
    return {
      success: false,
      error: {
        reason: 'unknown',
        message: errorMessage(e),
      },
    }
  }
}