Cradicle Explorer

/ lib / pdf-parser.ts
pdf-parser.ts
  1  import pdfParse from 'pdf-parse';
  2  
  3  // ============================================================================
  4  // PDF Parser — Extract text from uploaded CV PDFs
  5  // ============================================================================
  6  
  7  export interface ParsedPDF {
  8    text: string;
  9    pageCount: number;
 10    info: {
 11      title?: string;
 12      author?: string;
 13      creationDate?: string;
 14    };
 15    /** 0-100 score indicating text extraction quality */
 16    qualityScore: number;
 17    /** Warning message if quality is below threshold */
 18    qualityWarning?: string;
 19  }
 20  
 21  /**
 22   * Parse a PDF file buffer and extract text content.
 23   *
 24   * @param buffer - The PDF file as a Buffer (from uploaded file)
 25   * @returns Parsed text content and metadata
 26   * @throws Error if the PDF is invalid, empty, or unreadable
 27   */
 28  export async function parsePDF(buffer: Buffer): Promise<ParsedPDF> {
 29    try {
 30      const data = await pdfParse(buffer, {
 31        // Limit to 20 pages — CVs shouldn't be longer than this
 32        max: 20,
 33      });
 34  
 35      const text = data.text?.trim();
 36  
 37      if (!text || text.length < 200) {
 38        throw new Error(
 39          'Could not extract meaningful text from this PDF. ' +
 40          'The file might be a scanned image without OCR, or it may be empty. ' +
 41          'Please upload a text-based PDF (you can test by selecting text in your PDF viewer).'
 42        );
 43      }
 44  
 45      const cleaned = cleanText(text);
 46      const quality = assessTextQuality(cleaned);
 47  
 48      return {
 49        text: cleaned,
 50        pageCount: data.numpages,
 51        info: {
 52          title: data.info?.Title || undefined,
 53          author: data.info?.Author || undefined,
 54          creationDate: data.info?.CreationDate || undefined,
 55        },
 56        qualityScore: quality.score,
 57        qualityWarning: quality.warning,
 58      };
 59    } catch (error) {
 60      if (error instanceof Error && error.message.includes('Could not extract')) {
 61        throw error; // Re-throw our custom error
 62      }
 63  
 64      throw new Error(
 65        'Failed to parse the PDF file. Please ensure it\'s a valid, non-corrupted PDF document.'
 66      );
 67    }
 68  }
 69  
 70  /**
 71   * Assess the quality of extracted PDF text.
 72   * Returns a 0-100 score and an optional warning message.
 73   *
 74   * Checks for:
 75   * - Word ratio: proportion of text that forms recognizable words
 76   * - Encoding artifacts: garbled characters, excessive special chars
 77   * - Structure signals: presence of typical CV sections
 78   */
 79  export function assessTextQuality(text: string): { score: number; warning?: string } {
 80    const warnings: string[] = [];
 81    let score = 100;
 82  
 83    // 1. Word ratio: split on whitespace, check how many tokens look like real words
 84    const tokens = text.split(/\s+/).filter((t) => t.length > 0);
 85    if (tokens.length === 0) return { score: 0, warning: 'No readable text extracted from PDF.' };
 86  
 87    const wordPattern = /^[a-zA-ZÀ-ÿ0-9@.,\-/()&'+:;!?#]+$/;
 88    const realWords = tokens.filter((t) => wordPattern.test(t));
 89    const wordRatio = realWords.length / tokens.length;
 90  
 91    if (wordRatio < 0.4) {
 92      score -= 50;
 93      warnings.push('Most of the extracted text appears garbled or unreadable.');
 94    } else if (wordRatio < 0.65) {
 95      score -= 25;
 96      warnings.push('Some parts of the extracted text may be garbled.');
 97    }
 98  
 99    // 2. Non-alphanumeric ratio: if >50% of chars are non-alphanumeric (excluding spaces/newlines), text is garbled
100    const alnumChars = (text.match(/[a-zA-ZÀ-ÿ0-9]/g) || []).length;
101    const contentChars = text.replace(/[\s\n\r]/g, '').length;
102    const nonAlnumRatio = contentChars > 0 ? 1 - (alnumChars / contentChars) : 1;
103  
104    if (nonAlnumRatio > 0.5) {
105      score -= 40;
106      warnings.push('More than 50% of extracted text is non-alphanumeric — file likely contains garbled or image-based content.');
107    } else if (nonAlnumRatio > 0.35) {
108      score -= 15;
109      warnings.push('High proportion of special characters in extracted text.');
110    }
111  
112    // 3. Encoding artifact detection: high concentration of replacement chars or control sequences
113    const artifactPattern = /[\uFFFD\u0000-\u0008\u000E-\u001F]|\\x[0-9a-f]{2}/gi;
114    const artifacts = (text.match(artifactPattern) || []).length;
115    const artifactRatio = artifacts / text.length;
116  
117    if (artifactRatio > 0.02) {
118      score -= 30;
119      warnings.push('PDF contains encoding artifacts — file may be a scanned image or use non-standard fonts.');
120    } else if (artifactRatio > 0.005) {
121      score -= 10;
122      warnings.push('Minor encoding issues detected in the PDF.');
123    }
124  
125    // 4. CV structure signals: check for common section headers
126    const sectionKeywords = [
127      /\b(experience|employment|work history)\b/i,
128      /\b(education|university|degree|bachelor|master)\b/i,
129      /\b(skill|competenc|proficienc)\b/i,
130      /\b(summary|profile|objective|about)\b/i,
131    ];
132    const sectionsFound = sectionKeywords.filter((kw) => kw.test(text)).length;
133  
134    if (sectionsFound === 0) {
135      score -= 15;
136      warnings.push('No recognizable CV section headers found — text extraction may be incomplete.');
137    }
138  
139    // 5. Text length vs page count heuristic (very short text from multi-page PDFs = likely image-based)
140    // This is checked externally since we need pageCount
141  
142    return {
143      score: Math.max(0, Math.min(100, score)),
144      warning: warnings.length > 0 ? warnings.join(' ') : undefined,
145    };
146  }
147  
/** Expansions for the Latin presentation-form ligatures U+FB00–U+FB04. */
const LIGATURE_EXPANSIONS: Record<string, string> = {
  '\uFB00': 'ff',
  '\uFB01': 'fi',
  '\uFB02': 'fl',
  '\uFB03': 'ffi',
  '\uFB04': 'ffl',
};

/**
 * Clean extracted PDF text:
 * - Normalize line breaks (CRLF / lone CR become LF)
 * - Remove control characters
 * - Fix common extraction artifacts (bullet glyphs, ligatures)
 * - Remove excessive whitespace and blank lines
 *
 * The steps run in an order that keeps them from undoing each other:
 * characters are substituted first, spaces are collapsed afterwards, and
 * blank-line runs are collapsed only once every line has been trimmed.
 *
 * @param text - Raw text as extracted from the PDF
 * @returns The cleaned text, with no leading or trailing whitespace
 */
export function cleanText(text: string): string {
  return text
    // Normalize line endings first so every later step only has to deal with \n
    .replace(/\r\n?/g, '\n')
    // Remove null bytes and other control characters (tab and newline are kept)
    .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, '')
    // Fix common PDF extraction artifacts (bullet characters)
    .replace(/[•◦▪▸►]/g, '- ')
    // Fix ligatures (ff, fi, fl, ffi, ffl)
    .replace(/[\uFB00-\uFB04]/g, (lig) => LIGATURE_EXPANSIONS[lig] ?? lig)
    // Replace runs of spaces/tabs with a single space — after the substitutions
    // above, so the space added by a bullet replacement cannot double up
    .replace(/[ \t]+/g, ' ')
    // Trim each line
    .split('\n')
    .map((line) => line.trim())
    .join('\n')
    // Replace 3+ consecutive newlines with 2 — after trimming, so lines that
    // contained only whitespace count as blank
    .replace(/\n{3,}/g, '\n\n')
    // Final trim
    .trim();
}
175  
/**
 * Cheap pre-parse sanity check on an uploaded buffer.
 *
 * @param buffer - Candidate PDF bytes
 * @returns `true` when the buffer holds at least 5 bytes and, decoded as
 *          ASCII, begins with the `%PDF` signature; `false` otherwise
 */
export function validatePDFBuffer(buffer: Buffer): boolean {
  const signature = '%PDF';
  const probeLength = 5;

  // Anything shorter than the probe window cannot be a usable PDF.
  if (buffer.length >= probeLength) {
    const leadingBytes = buffer.toString('ascii', 0, probeLength);
    return leadingBytes.startsWith(signature);
  }

  return false;
}