Cradicle Explorer

/ lib / ats-format-check.ts
ats-format-check.ts
  1  // lib/ats-format-check.ts
  2  // Analyzes PDF structure for ATS compatibility — pure code, no AI needed
  3  
  4  import { ATSFormatIssue } from './types';
  5  
  6  // Standard ATS-recognized section headers
  7  const ATS_STANDARD_HEADERS = [
  8    'experience', 'work experience', 'professional experience', 'employment history',
  9    'education', 'academic background',
 10    'skills', 'technical skills', 'core competencies', 'key skills',
 11    'certifications', 'certificates', 'licenses',
 12    'projects', 'key projects',
 13    'summary', 'professional summary', 'objective', 'profile',
 14    'languages', 'language skills',
 15    'publications', 'awards', 'volunteer', 'references',
 16  ];
 17  
 18  // Non-standard headers that confuse ATS parsers
 19  const PROBLEMATIC_HEADERS = [
 20    'my journey', 'about me', 'who i am', 'my story', 'what i do',
 21    'adventures', 'playground', 'toolbox', 'superpower', 'arsenal',
 22    'what i bring', 'my expertise', 'passions', 'life philosophy',
 23  ];
 24  
 25  interface PDFFormatAnalysis {
 26    formatScore: number; // 0-100
 27    issues: ATSFormatIssue[];
 28    stats: {
 29      pageCount: number;
 30      charCount: number;
 31      isTextExtractable: boolean;
 32      hasImages: boolean;
 33      estimatedColumns: number;
 34      fileSize: number;
 35    };
 36  }
 37  
 38  export function analyzeATSFormat(
 39    cvText: string,
 40    pdfMetadata: {
 41      numpages?: number;
 42      numrender?: number;
 43      info?: Record<string, unknown>;
 44    },
 45    fileSize: number // bytes
 46  ): PDFFormatAnalysis {
 47    const issues: ATSFormatIssue[] = [];
 48    let deductions = 0;
 49  
 50    const pageCount = pdfMetadata?.numpages ?? 1;
 51    const charCount = cvText.length;
 52    const isTextExtractable = charCount > 100;
 53  
 54    // --- Check 1: Text extractability ---
 55    if (!isTextExtractable) {
 56      issues.push({
 57        issue: 'ats.format.noText',
 58        severity: 'critical',
 59        description: 'ats.format.noTextDesc',
 60        fix: 'ats.format.noTextFix',
 61      });
 62      deductions += 40;
 63    }
 64  
 65    // --- Check 2: Page count ---
 66    if (pageCount > 5) {
 67      issues.push({
 68        issue: 'ats.format.wayTooLong',
 69        severity: 'critical',
 70        description: 'ats.format.wayTooLongDesc',
 71        fix: 'ats.format.wayTooLongFix',
 72      });
 73      deductions += 20;
 74    } else if (pageCount > 3) {
 75      issues.push({
 76        issue: 'ats.format.tooLong',
 77        severity: 'warning',
 78        description: 'ats.format.tooLongDesc',
 79        fix: 'ats.format.tooLongFix',
 80      });
 81      deductions += 10;
 82    }
 83  
 84    // --- Check 3: File size ---
 85    const fileSizeMB = fileSize / (1024 * 1024);
 86    if (fileSizeMB > 2) {
 87      issues.push({
 88        issue: 'ats.format.largeFile',
 89        severity: 'warning',
 90        description: 'ats.format.largeFileDesc',
 91        fix: 'ats.format.largeFileFix',
 92      });
 93      deductions += 5;
 94    }
 95  
 96    // --- Check 4: Column detection (heuristic) ---
 97    const lines = cvText.split('\n').filter((l) => l.trim().length > 0);
 98    const shortLineRatio = lines.filter((l) => l.trim().length < 30).length / Math.max(lines.length, 1);
 99    const hasLargeGaps = cvText.includes('    ') || /\t{2,}/.test(cvText);
100    const estimatedColumns = shortLineRatio > 0.4 && hasLargeGaps ? 2 : 1;
101  
102    if (estimatedColumns > 1) {
103      issues.push({
104        issue: 'ats.format.multiColumn',
105        severity: 'warning',
106        description: 'ats.format.multiColumnDesc',
107        fix: 'ats.format.multiColumnFix',
108      });
109      deductions += 15;
110    }
111  
112    // --- Check 5: Section headers ---
113    const textLower = cvText.toLowerCase();
114    const foundStandard = ATS_STANDARD_HEADERS.filter((h) => {
115      // Check for header-like patterns: header on its own line or followed by colon
116      const patterns = [
117        new RegExp(`^${escapeRegex(h)}\\s*$`, 'mi'),
118        new RegExp(`^${escapeRegex(h)}\\s*:`, 'mi'),
119        new RegExp(`^${escapeRegex(h)}\\s*\\n`, 'mi'),
120      ];
121      return patterns.some((p) => p.test(cvText));
122    });
123  
124    const foundProblematic = PROBLEMATIC_HEADERS.filter((h) =>
125      textLower.includes(h)
126    );
127  
128    if (foundProblematic.length > 0) {
129      issues.push({
130        issue: 'ats.format.nonStandardHeaders',
131        severity: 'warning',
132        description: 'ats.format.nonStandardHeadersDesc',
133        fix: 'ats.format.nonStandardHeadersFix',
134      });
135      deductions += 10;
136    }
137  
138    // Check for essential sections
139    const hasExperience = foundStandard.some((h) =>
140      ['experience', 'work experience', 'professional experience', 'employment history'].includes(h)
141    );
142    const hasEducation = foundStandard.some((h) =>
143      ['education', 'academic background'].includes(h)
144    );
145    const hasSkills = foundStandard.some((h) =>
146      ['skills', 'technical skills', 'core competencies', 'key skills'].includes(h)
147    );
148  
149    if (!hasExperience && isTextExtractable) {
150      issues.push({
151        issue: 'ats.format.noExperience',
152        severity: 'critical',
153        description: 'ats.format.noExperienceDesc',
154        fix: 'ats.format.noExperienceFix',
155      });
156      deductions += 15;
157    }
158  
159    if (!hasSkills && isTextExtractable) {
160      issues.push({
161        issue: 'ats.format.noSkills',
162        severity: 'warning',
163        description: 'ats.format.noSkillsDesc',
164        fix: 'ats.format.noSkillsFix',
165      });
166      deductions += 10;
167    }
168  
169    if (!hasEducation && isTextExtractable) {
170      issues.push({
171        issue: 'ats.format.noEducation',
172        severity: 'info',
173        description: 'ats.format.noEducationDesc',
174        fix: 'ats.format.noEducationFix',
175      });
176      deductions += 5;
177    }
178  
179    // --- Check 6: Contact info ---
180    const hasEmail = /[\w.-]+@[\w.-]+\.\w+/.test(cvText);
181    const hasPhone = /[\+]?\d[\d\s\-().]{7,}/.test(cvText);
182  
183    if (!hasEmail) {
184      issues.push({
185        issue: 'ats.format.noEmail',
186        severity: 'critical',
187        description: 'ats.format.noEmailDesc',
188        fix: 'ats.format.noEmailFix',
189      });
190      deductions += 10;
191    }
192  
193    // --- Check 7: Special characters / encoding ---
194    const specialCharCount = (cvText.match(/[^\x20-\x7E\n\r\t\u00C0-\u024F]/g) || []).length;
195    const specialCharRatio = specialCharCount / Math.max(charCount, 1);
196  
197    if (specialCharRatio > 0.05) {
198      issues.push({
199        issue: 'ats.format.specialChars',
200        severity: 'warning',
201        description: 'ats.format.specialCharsDesc',
202        fix: 'ats.format.specialCharsFix',
203      });
204      deductions += 10;
205    }
206  
207    // --- Check 8: Very short CV ---
208    if (charCount > 100 && charCount < 500) {
209      issues.push({
210        issue: 'ats.format.tooShort',
211        severity: 'warning',
212        description: 'ats.format.tooShortDesc',
213        fix: 'ats.format.tooShortFix',
214      });
215      deductions += 10;
216    }
217  
218    // --- Check 9: Detect potential images/graphics (heuristic) ---
219    const hasImages = pdfMetadata?.numrender ? pdfMetadata.numrender > 2 : false;
220    if (hasImages) {
221      issues.push({
222        issue: 'ats.format.hasImages',
223        severity: 'info',
224        description: 'ats.format.hasImagesDesc',
225        fix: 'ats.format.hasImagesFix',
226      });
227      deductions += 5;
228    }
229  
230    // Calculate format score (cap deductions at 100)
231    const formatScore = Math.max(0, 100 - Math.min(deductions, 100));
232  
233    // If everything is clean, add a positive note
234    if (issues.length === 0) {
235      issues.push({
236        issue: 'ats.format.allGood',
237        severity: 'info',
238        description: 'ats.format.allGoodDesc',
239        fix: '',
240      });
241    }
242  
243    return {
244      formatScore,
245      issues,
246      stats: {
247        pageCount,
248        charCount,
249        isTextExtractable,
250        hasImages,
251        estimatedColumns,
252        fileSize,
253      },
254    };
255  }
256  
/**
 * Backslash-escapes every regular-expression metacharacter in `str` so the
 * result can be embedded in a `RegExp` source and match the input literally.
 */
function escapeRegex(str: string): string {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metaChars, (ch) => `\\${ch}`);
}