// pdf-parser.ts
// ============================================================================
// PDF Parser — Extract text from uploaded CV PDFs
// ============================================================================

/** Result of parsing an uploaded PDF. */
export interface ParsedPDF {
  /** Extracted text after {@link cleanText} has been applied. */
  text: string;
  /** Page count as reported by the PDF library. */
  pageCount: number;
  /** Selected document metadata; each field is omitted when absent or empty. */
  info: {
    title?: string;
    author?: string;
    creationDate?: string;
  };
  /** 0-100 score indicating text extraction quality */
  qualityScore: number;
  /** Warning message if quality is below threshold */
  qualityWarning?: string;
}

/** Page limit handed to pdf-parse — CVs shouldn't be longer than this. */
const MAX_PAGES = 20;

/** Minimum number of extracted characters required to accept a PDF. */
const MIN_TEXT_LENGTH = 200;

const NO_TEXT_MESSAGE =
  'Could not extract meaningful text from this PDF. ' +
  'The file might be a scanned image without OCR, or it may be empty. ' +
  'Please upload a text-based PDF (you can test by selecting text in your PDF viewer).';

const PARSE_FAILED_MESSAGE =
  'Failed to parse the PDF file. Please ensure it\'s a valid, non-corrupted PDF document.';

/** A whitespace-delimited token made up only of letters, digits and common punctuation. */
const WORD_PATTERN = /^[a-zA-ZÀ-ÿ0-9@.,\-/()&'+:;!?#]+$/;

/** Letters (including Latin-1 accented letters) and digits. */
const ALNUM_PATTERN = /[a-zA-ZÀ-ÿ0-9]/g;

/** Replacement characters, control characters, and literal "\xNN" escape text. */
const ARTIFACT_PATTERN = /[\uFFFD\u0000-\u0008\u000E-\u001F]|\\x[0-9a-f]{2}/gi;

/**
 * One pattern per typical CV section. The third pattern lists word *stems*, so
 * it deliberately has no trailing `\b`: with one, "Skills", "Competencies" and
 * "Proficiency" would never match.
 */
const SECTION_PATTERNS: readonly RegExp[] = [
  /\b(experience|employment|work history)\b/i,
  /\b(education|university|degree|bachelor|master)\b/i,
  /\b(skill|competenc|proficienc)/i,
  /\b(summary|profile|objective|about)\b/i,
];

/** Latin ligature code points (U+FB00–U+FB04) and their plain-letter expansions. */
const LIGATURES: Readonly<Record<string, string>> = {
  '\uFB00': 'ff',
  '\uFB01': 'fi',
  '\uFB02': 'fl',
  '\uFB03': 'ffi',
  '\uFB04': 'ffl',
};

/**
 * Run pdf-parse on the buffer, mapping any library failure to a user-facing error.
 *
 * pdf-parse is imported lazily so that the pure helpers in this module can be
 * used without loading the PDF library. The import sits outside the `try` so a
 * missing dependency is reported as such, not as a corrupt PDF.
 */
async function loadDocument(buffer: Buffer) {
  const { default: pdfParse } = await import('pdf-parse');
  try {
    return await pdfParse(buffer, { max: MAX_PAGES });
  } catch {
    throw new Error(PARSE_FAILED_MESSAGE);
  }
}

/**
 * Parse a PDF file buffer and extract text content.
 *
 * @param buffer - The PDF file as a Buffer (from uploaded file)
 * @returns Cleaned text, page count, selected metadata and a quality assessment
 * @throws Error if the PDF cannot be parsed, or if fewer than 200 characters
 *   of text could be extracted (e.g. an empty or image-only document)
 */
export async function parsePDF(buffer: Buffer): Promise<ParsedPDF> {
  const data = await loadDocument(buffer);

  // The length check is applied to the trimmed raw text, before cleaning.
  const text = data.text?.trim();
  if (!text || text.length < MIN_TEXT_LENGTH) {
    throw new Error(NO_TEXT_MESSAGE);
  }

  const cleaned = cleanText(text);
  const quality = assessTextQuality(cleaned);

  return {
    text: cleaned,
    pageCount: data.numpages,
    info: {
      // `||` (not `??`) on purpose: empty metadata strings are reported as absent.
      title: data.info?.Title || undefined,
      author: data.info?.Author || undefined,
      creationDate: data.info?.CreationDate || undefined,
    },
    qualityScore: quality.score,
    qualityWarning: quality.warning,
  };
}

/**
 * Assess the quality of extracted PDF text.
 * Returns a 0-100 score and an optional warning message.
 *
 * Starting from 100, points are deducted for:
 * - Word ratio: share of whitespace-separated tokens that look like words
 *   (below 0.4: -50, below 0.65: -25)
 * - Non-alphanumeric ratio among non-whitespace characters
 *   (above 0.5: -40, above 0.35: -15)
 * - Encoding artifacts: replacement/control characters per character of text
 *   (above 0.02: -30, above 0.005: -10)
 * - Structure signals: no typical CV section keyword found (-15)
 *
 * A text-length vs page-count heuristic is not applied here because the page
 * count is not available to this function.
 *
 * @param text - Extracted text to assess
 * @returns The clamped score and, if any check fired, the warnings joined by spaces
 */
export function assessTextQuality(text: string): { score: number; warning?: string } {
  const tokens = text.split(/\s+/).filter((t) => t.length > 0);
  if (tokens.length === 0) {
    return { score: 0, warning: 'No readable text extracted from PDF.' };
  }

  const warnings: string[] = [];
  let score = 100;

  // 1. Word ratio: how many tokens look like real words
  const wordRatio = tokens.filter((t) => WORD_PATTERN.test(t)).length / tokens.length;
  if (wordRatio < 0.4) {
    score -= 50;
    warnings.push('Most of the extracted text appears garbled or unreadable.');
  } else if (wordRatio < 0.65) {
    score -= 25;
    warnings.push('Some parts of the extracted text may be garbled.');
  }

  // 2. Non-alphanumeric ratio, measured over non-whitespace characters only
  const alnumChars = (text.match(ALNUM_PATTERN) ?? []).length;
  const contentChars = text.replace(/\s/g, '').length;
  const nonAlnumRatio = contentChars > 0 ? 1 - alnumChars / contentChars : 1;
  if (nonAlnumRatio > 0.5) {
    score -= 40;
    warnings.push(
      'More than 50% of extracted text is non-alphanumeric — file likely contains garbled or image-based content.'
    );
  } else if (nonAlnumRatio > 0.35) {
    score -= 15;
    warnings.push('High proportion of special characters in extracted text.');
  }

  // 3. Encoding artifacts (text is non-empty here, so the division is safe)
  const artifactRatio = (text.match(ARTIFACT_PATTERN) ?? []).length / text.length;
  if (artifactRatio > 0.02) {
    score -= 30;
    warnings.push(
      'PDF contains encoding artifacts — file may be a scanned image or use non-standard fonts.'
    );
  } else if (artifactRatio > 0.005) {
    score -= 10;
    warnings.push('Minor encoding issues detected in the PDF.');
  }

  // 4. CV structure signals: at least one common section keyword expected
  if (!SECTION_PATTERNS.some((pattern) => pattern.test(text))) {
    score -= 15;
    warnings.push('No recognizable CV section headers found — text extraction may be incomplete.');
  }

  return {
    score: Math.max(0, Math.min(100, score)),
    warning: warnings.length > 0 ? warnings.join(' ') : undefined,
  };
}

/**
 * Clean extracted PDF text:
 * - Normalize line breaks (CRLF / CR become LF)
 * - Remove control characters
 * - Expand Latin ligatures and turn bullet glyphs into "- "
 * - Collapse runs of spaces/tabs, trim every line, limit blank lines to one
 *
 * Order matters: bullets and control characters are handled before spaces are
 * collapsed (so no double spaces are left behind), and blank lines are
 * collapsed after per-line trimming (so whitespace-only lines count as blank).
 *
 * @param text - Raw text as extracted from the PDF
 * @returns The cleaned text
 */
export function cleanText(text: string): string {
  return text
    // Normalize line breaks
    .replace(/\r\n?/g, '\n')
    // Remove null bytes and other control characters (tab and newline are kept)
    .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g, '')
    // Expand ligatures (ff, fi, fl, ffi, ffl)
    .replace(/[\uFB00-\uFB04]/g, (ligature) => LIGATURES[ligature] ?? ligature)
    // Bullet characters (U+2022, U+25E6, U+25AA, U+25B8, U+25BA) become "- "
    .replace(/[\u2022\u25E6\u25AA\u25B8\u25BA]/g, '- ')
    // Replace multiple spaces/tabs with a single space
    .replace(/[ \t]+/g, ' ')
    // Trim each line
    .split('\n')
    .map((line) => line.trim())
    .join('\n')
    // Replace 3+ consecutive newlines with 2
    .replace(/\n{3,}/g, '\n\n')
    // Final trim
    .trim();
}

/**
 * Validate a PDF buffer before parsing.
 * Checks the PDF magic bytes header.
 *
 * @param buffer - Raw bytes of the uploaded file
 * @returns `true` if the buffer holds at least 5 bytes and starts with "%PDF"
 */
export function validatePDFBuffer(buffer: Buffer): boolean {
  if (buffer.length < 5) return false;
  // PDF files start with %PDF
  return buffer.toString('ascii', 0, 5).startsWith('%PDF');
}