ats-format-check.ts
// lib/ats-format-check.ts
// Analyzes PDF structure for ATS compatibility — pure code, no AI needed

import type { ATSFormatIssue } from './types';

// Headers that identify the three sections the checker treats as essential.
const EXPERIENCE_HEADERS: readonly string[] = [
  'experience', 'work experience', 'professional experience', 'employment history',
];
const EDUCATION_HEADERS: readonly string[] = ['education', 'academic background'];
const SKILLS_HEADERS: readonly string[] = [
  'skills', 'technical skills', 'core competencies', 'key skills',
];

// Standard ATS-recognized section headers
const ATS_STANDARD_HEADERS: readonly string[] = [
  ...EXPERIENCE_HEADERS,
  ...EDUCATION_HEADERS,
  ...SKILLS_HEADERS,
  'certifications', 'certificates', 'licenses',
  'projects', 'key projects',
  'summary', 'professional summary', 'objective', 'profile',
  'languages', 'language skills',
  'publications', 'awards', 'volunteer', 'references',
];

// Non-standard headers that confuse ATS parsers
const PROBLEMATIC_HEADERS: readonly string[] = [
  'my journey', 'about me', 'who i am', 'my story', 'what i do',
  'adventures', 'playground', 'toolbox', 'superpower', 'arsenal',
  'what i bring', 'my expertise', 'passions', 'life philosophy',
];

/** Result of {@link analyzeATSFormat}. */
interface PDFFormatAnalysis {
  /** 100 minus the summed deductions, clamped to the range 0-100. */
  formatScore: number;
  /** Every issue found; a single `ats.format.allGood` entry when none were. */
  issues: ATSFormatIssue[];
  /** Raw measurements the checks were based on. */
  stats: {
    pageCount: number;
    charCount: number;
    isTextExtractable: boolean;
    hasImages: boolean;
    estimatedColumns: number;
    fileSize: number;
  };
}

/**
 * Scores how well a CV's extracted text and PDF metadata suit ATS parsing.
 *
 * Each failed check appends an issue (whose fields are i18n keys) and adds a
 * fixed deduction; the score is `100 - deductions`, never below 0.
 *
 * @param cvText Text extracted from the PDF.
 * @param pdfMetadata PDF metadata; `numpages` defaults to 1 when absent.
 * @param fileSize File size in bytes.
 * @returns The score, the list of issues, and the measured stats.
 */
export function analyzeATSFormat(
  cvText: string,
  pdfMetadata: {
    numpages?: number;
    numrender?: number;
    info?: Record<string, unknown>;
  },
  fileSize: number // bytes
): PDFFormatAnalysis {
  const issues: ATSFormatIssue[] = [];
  let deductions = 0;

  // Records an issue together with its score penalty.
  const flag = (issue: ATSFormatIssue, penalty: number): void => {
    issues.push(issue);
    deductions += penalty;
  };

  const pageCount = pdfMetadata?.numpages ?? 1;
  const charCount = cvText.length;
  const isTextExtractable = charCount > 100;

  // --- Check 1: Text extractability ---
  if (!isTextExtractable) {
    flag(
      {
        issue: 'ats.format.noText',
        severity: 'critical',
        description: 'ats.format.noTextDesc',
        fix: 'ats.format.noTextFix',
      },
      40
    );
  }

  // --- Check 2: Page count ---
  if (pageCount > 5) {
    flag(
      {
        issue: 'ats.format.wayTooLong',
        severity: 'critical',
        description: 'ats.format.wayTooLongDesc',
        fix: 'ats.format.wayTooLongFix',
      },
      20
    );
  } else if (pageCount > 3) {
    flag(
      {
        issue: 'ats.format.tooLong',
        severity: 'warning',
        description: 'ats.format.tooLongDesc',
        fix: 'ats.format.tooLongFix',
      },
      10
    );
  }

  // --- Check 3: File size ---
  const fileSizeMB = fileSize / (1024 * 1024);
  if (fileSizeMB > 2) {
    flag(
      {
        issue: 'ats.format.largeFile',
        severity: 'warning',
        description: 'ats.format.largeFileDesc',
        fix: 'ats.format.largeFileFix',
      },
      5
    );
  }

  // --- Check 4: Column detection (heuristic) ---
  const lines = cvText.split('\n').filter((l) => l.trim().length > 0);
  const shortLineRatio =
    lines.filter((l) => l.trim().length < 30).length / Math.max(lines.length, 1);
  // A "large gap" is a run of 4+ spaces or 2+ tabs. A single space must not
  // count: it occurs in virtually every text and would reduce this heuristic
  // to the short-line ratio alone.
  const hasLargeGaps = /[ ]{4,}|\t{2,}/.test(cvText);
  const estimatedColumns = shortLineRatio > 0.4 && hasLargeGaps ? 2 : 1;

  if (estimatedColumns > 1) {
    flag(
      {
        issue: 'ats.format.multiColumn',
        severity: 'warning',
        description: 'ats.format.multiColumnDesc',
        fix: 'ats.format.multiColumnFix',
      },
      15
    );
  }

  // --- Check 5: Section headers ---
  const hasHeaderLine = (header: string): boolean => headerPattern(header).test(cvText);
  const foundStandard = new Set(ATS_STANDARD_HEADERS.filter(hasHeaderLine));

  // Matched as header lines (not substrings) so that e.g. "toolbox" inside a
  // sentence is not mistaken for a section title.
  const foundProblematic = PROBLEMATIC_HEADERS.filter(hasHeaderLine);

  if (foundProblematic.length > 0) {
    flag(
      {
        issue: 'ats.format.nonStandardHeaders',
        severity: 'warning',
        description: 'ats.format.nonStandardHeadersDesc',
        fix: 'ats.format.nonStandardHeadersFix',
      },
      10
    );
  }

  // Check for essential sections
  const hasExperience = EXPERIENCE_HEADERS.some((h) => foundStandard.has(h));
  const hasEducation = EDUCATION_HEADERS.some((h) => foundStandard.has(h));
  const hasSkills = SKILLS_HEADERS.some((h) => foundStandard.has(h));

  // Missing-section issues are only meaningful when there is text to inspect.
  if (!hasExperience && isTextExtractable) {
    flag(
      {
        issue: 'ats.format.noExperience',
        severity: 'critical',
        description: 'ats.format.noExperienceDesc',
        fix: 'ats.format.noExperienceFix',
      },
      15
    );
  }

  if (!hasSkills && isTextExtractable) {
    flag(
      {
        issue: 'ats.format.noSkills',
        severity: 'warning',
        description: 'ats.format.noSkillsDesc',
        fix: 'ats.format.noSkillsFix',
      },
      10
    );
  }

  if (!hasEducation && isTextExtractable) {
    flag(
      {
        issue: 'ats.format.noEducation',
        severity: 'info',
        description: 'ats.format.noEducationDesc',
        fix: 'ats.format.noEducationFix',
      },
      5
    );
  }

  // --- Check 6: Contact info ---
  const hasEmail = /[\w.-]+@[\w.-]+\.\w+/.test(cvText);

  if (!hasEmail) {
    flag(
      {
        issue: 'ats.format.noEmail',
        severity: 'critical',
        description: 'ats.format.noEmailDesc',
        fix: 'ats.format.noEmailFix',
      },
      10
    );
  }

  // --- Check 7: Special characters / encoding ---
  // Counts everything outside printable ASCII, line breaks/tabs and the
  // accented Latin range U+00C0-U+024F.
  const specialCharCount = (cvText.match(/[^\x20-\x7E\n\r\t\u00C0-\u024F]/g) || []).length;
  const specialCharRatio = specialCharCount / Math.max(charCount, 1);

  if (specialCharRatio > 0.05) {
    flag(
      {
        issue: 'ats.format.specialChars',
        severity: 'warning',
        description: 'ats.format.specialCharsDesc',
        fix: 'ats.format.specialCharsFix',
      },
      10
    );
  }

  // --- Check 8: Very short CV ---
  // The lower bound keeps this from double-reporting the "no text" case.
  if (charCount > 100 && charCount < 500) {
    flag(
      {
        issue: 'ats.format.tooShort',
        severity: 'warning',
        description: 'ats.format.tooShortDesc',
        fix: 'ats.format.tooShortFix',
      },
      10
    );
  }

  // --- Check 9: Detect potential images/graphics (heuristic) ---
  // NOTE(review): `numrender` is used as a stand-in for embedded graphics.
  // It is presumably the parser's rendered-page count, in which case this
  // fires for any PDF with more than two rendered pages — confirm.
  const hasImages = (pdfMetadata?.numrender ?? 0) > 2;
  if (hasImages) {
    flag(
      {
        issue: 'ats.format.hasImages',
        severity: 'info',
        description: 'ats.format.hasImagesDesc',
        fix: 'ats.format.hasImagesFix',
      },
      5
    );
  }

  // Calculate format score (deductions can exceed 100, so clamp at 0)
  const formatScore = Math.max(0, 100 - deductions);

  // If everything is clean, add a positive note
  if (issues.length === 0) {
    issues.push({
      issue: 'ats.format.allGood',
      severity: 'info',
      description: 'ats.format.allGoodDesc',
      fix: '',
    });
  }

  return {
    formatScore,
    issues,
    stats: {
      pageCount,
      charCount,
      isTextExtractable,
      hasImages,
      estimatedColumns,
      fileSize,
    },
  };
}

/**
 * Builds a case-insensitive, multiline pattern that matches `header` at the
 * start of a line when it is either alone on that line (trailing whitespace
 * allowed) or followed by a colon.
 */
function headerPattern(header: string): RegExp {
  return new RegExp(`^${escapeRegex(header)}\\s*(?::|$)`, 'mi');
}

/** Escapes every regex metacharacter in `str` so it matches literally. */
function escapeRegex(str: string): string {
  return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}