# parse_cv.py
"""Orchestrate CV-to-JSON extraction: read PDF, call service, anonymize, write output."""

__all__ = ["extract", "render_preview", "write_profile", "run"]

import json
import logging
from pathlib import Path

from config import settings
from exceptions import DataError
from integrations.instrumented import InstrumentedProvider
from integrations.llm import create_llm_provider
from models.coaching import CandidateProfile
from services.cv_parser import parse_cv
from services.run_summary import print_summary
from telemetry import RunStats

logger = logging.getLogger(__name__)


def _anonymize_name(full_name: str) -> str:
    """Truncate a full name to 'FirstName L.' format.

    Rules:
    - Empty string -> leave empty.
    - Whitespace-only string -> leave as-is (there is nothing to abbreviate).
    - Single word -> leave as-is.
    - Multiple words -> first word + last word's initial + '.'.

    Args:
        full_name: The name as extracted from the CV.

    Returns:
        The abbreviated name, or ``full_name`` unchanged when it has fewer
        than two whitespace-separated words.
    """
    if not full_name:
        return full_name
    parts = full_name.split()
    # str.split() with no separator returns [] for whitespace-only input, so
    # the guard must cover zero words as well as one; indexing parts[0] on an
    # empty list would raise IndexError.
    if len(parts) < 2:
        return full_name
    return f"{parts[0]} {parts[-1][0]}."


def _with_anonymized_name(
    profile: CandidateProfile, anonymized_name: str
) -> CandidateProfile:
    """Return a new CandidateProfile with the identity name replaced.

    The input profile is not mutated: both the identity and the profile are
    dumped to plain data, the name is swapped in, and the result is passed
    back through ``CandidateProfile.model_validate``.

    Args:
        profile: The profile whose ``identity.full_name`` should be replaced.
        anonymized_name: The replacement value for ``identity.full_name``.

    Returns:
        A freshly validated CandidateProfile carrying ``anonymized_name``.
    """
    identity_data = profile.identity.model_dump()
    identity_data["full_name"] = anonymized_name
    profile_data = profile.model_dump()
    profile_data["identity"] = identity_data
    return CandidateProfile.model_validate(profile_data)


# Fragile pattern: extract() stores a deferred summary printer as a function
# attribute (extract.print_run_summary) so that run() and the CLI can call it
# after their own output. This breaks silently if extract is renamed, wrapped
# with a decorator, or moved to a class — the hasattr() check in run() / CLI
# will return False and the summary won't print. If this becomes a problem,
# refactor to return a (profile, stats, model) tuple from an internal helper
# and have extract() / run() / CLI each handle printing at the right time.
def extract(cv_path: Path) -> CandidateProfile:
    """Extract and anonymize a CandidateProfile from a PDF CV.

    As a side effect, each call rebinds ``extract.print_run_summary`` to a
    zero-argument callable that prints the run summary for that call.

    Args:
        cv_path: Path to the input PDF file.

    Returns:
        Anonymized CandidateProfile ready for review or writing.

    Raises:
        DataError: If the file does not exist or is not a PDF.
        ConfigurationError: If the LLM task config is missing.
        APIError: If the provider is unreachable.
        ParseError: If the LLM output cannot be validated.
    """
    if not cv_path.exists():
        raise DataError(f"CV file not found: {cv_path}")
    if cv_path.suffix.lower() != ".pdf":
        raise DataError(f"Expected a .pdf file, got {cv_path.suffix!r}")

    pdf_data = cv_path.read_bytes()

    provider, model, temperature, seed, max_tokens = settings.get_llm_config("parse_cv")
    llm = create_llm_provider(provider, model, settings, max_tokens=max_tokens)
    stats = RunStats()
    # Wrap the provider so the calls made by parse_cv are recorded in `stats`.
    instrumented = InstrumentedProvider(llm, stats, phase="main")
    logger.info("Parsing CV from %s using %s/%s", cv_path.name, provider, model)
    profile = parse_cv(pdf_data, instrumented, temperature=temperature, seed=seed)

    anonymized_name = _anonymize_name(profile.identity.full_name)
    if anonymized_name != profile.identity.full_name:
        # NOTE(review): this writes the un-anonymized name to the debug log —
        # confirm that is acceptable wherever DEBUG logging is enabled.
        logger.debug(
            "Anonymized name: %s -> %s", profile.identity.full_name, anonymized_name
        )

    stats.finish()

    def _print_deferred() -> None:
        # Best-effort: a failure while printing the summary is logged and
        # swallowed so it cannot mask a successful extraction.
        try:
            print_summary(
                stats,
                pipeline="parse_cv",
                phase_models={"main": model},
            )
        except Exception:
            logger.warning("Failed to print run summary", exc_info=True)

    # Callers (run() below, and the CLI per the comment above) look this
    # attribute up after producing their own output.
    extract.print_run_summary = _print_deferred  # type: ignore[attr-defined]

    return _with_anonymized_name(profile, anonymized_name)


def render_preview(profile: CandidateProfile) -> str:
    """Render a CandidateProfile as human-readable plain text.

    The identity and experience sections are always emitted; every other
    section is emitted only when the corresponding profile field is truthy.

    Args:
        profile: The profile to render.

    Returns:
        Multi-line string suitable for terminal display.
    """
    # NOTE(review): label padding inside the string literals and the placement
    # of the blank separator lines (per entry vs. per section) follow the
    # flattened text this block was restored from — confirm against the
    # canonical file.
    lines: list[str] = []
    ident = profile.identity

    lines.append("=== IDENTITY ===")
    if ident.full_name:
        lines.append(f" Name: {ident.full_name}")
    if ident.title:
        lines.append(f" Title: {ident.title}")
    if ident.location:
        lines.append(f" Location: {ident.location}")
    if ident.trajectory:
        lines.append(f" Trajectory: {ident.trajectory}")
    if ident.summary:
        lines.append(f" Summary: {ident.summary}")
    lines.append("")

    lines.append(f"=== EXPERIENCE ({len(profile.experiences)} entries) ===")
    for exp in profile.experiences:
        lines.append(f" [{exp.id}]")
        lines.append(f" company: {exp.company}")
        lines.append(f" role: {exp.role}")
        lines.append(f" start_date: {exp.start_date}")
        lines.append(f" end_date: {exp.end_date}")
        if exp.location:
            lines.append(f" location: {exp.location}")
        if exp.description:
            lines.append(f" description: {exp.description}")
        if exp.skills:
            lines.append(f" skills: {', '.join(exp.skills)}")
        if exp.achievements:
            lines.append(" achievements:")
            for achievement in exp.achievements:
                lines.append(f" - {achievement}")
        lines.append("")

    if profile.education:
        lines.append(f"=== EDUCATION ({len(profile.education)} entries) ===")
        for edu in profile.education:
            lines.append(f" [{edu.id}]")
            lines.append(f" institution: {edu.institution}")
            lines.append(f" degree: {edu.degree}")
            if edu.field:
                lines.append(f" field: {edu.field}")
            if edu.year:
                lines.append(f" year: {edu.year}")
            if edu.honors:
                lines.append(f" honors: {edu.honors}")
            if edu.relevant_coursework:
                lines.append(
                    f" coursework: {', '.join(edu.relevant_coursework)}"
                )
            lines.append("")

    if profile.skills_inventory:
        lines.append(f"=== SKILLS ({len(profile.skills_inventory)}) ===")
        lines.append(f" skills_inventory: {', '.join(profile.skills_inventory)}")
        lines.append("")

    if profile.languages:
        lines.append(f"=== LANGUAGES ({len(profile.languages)}) ===")
        lines.append(f" languages: {', '.join(profile.languages)}")
        lines.append("")

    if profile.certifications:
        lines.append(f"=== CERTIFICATIONS ({len(profile.certifications)}) ===")
        for cert in profile.certifications:
            lines.append(f" - {cert}")
        lines.append("")

    if profile.projects:
        lines.append(f"=== PROJECTS ({len(profile.projects)}) ===")
        for proj in profile.projects:
            lines.append(f" [{proj.id}]")
            lines.append(f" name: {proj.name}")
            if proj.description:
                lines.append(f" description: {proj.description}")
            if proj.technologies:
                lines.append(f" technologies: {', '.join(proj.technologies)}")
            if proj.url:
                lines.append(f" url: {proj.url}")
            if proj.highlights:
                lines.append(" highlights:")
                for hl in proj.highlights:
                    lines.append(f" - {hl}")
            lines.append("")

    if profile.publications:
        lines.append(f"=== PUBLICATIONS ({len(profile.publications)}) ===")
        for pub in profile.publications:
            lines.append(f" [{pub.id}]")
            lines.append(f" type: {pub.type}")
            lines.append(f" title: {pub.title}")
            if pub.reference:
                lines.append(f" reference: {pub.reference}")
            lines.append("")

    return "\n".join(lines)


def write_profile(profile: CandidateProfile, output_path: Path) -> None:
    """Write a CandidateProfile to a JSON file.

    If the output file already exists, it is first renamed to the first
    unused sibling named ``<stem>_<N>.json`` (N = 1, 2, ...), so earlier
    outputs are kept rather than overwritten. Missing parent directories of
    ``output_path`` are created.

    Args:
        profile: The profile to write.
        output_path: Destination file path.
    """
    if output_path.exists():
        stem = output_path.stem
        counter = 1
        # Probe <stem>_1.json, <stem>_2.json, ... until a free name is found.
        while True:
            backup = output_path.with_name(f"{stem}_{counter}.json")
            if not backup.exists():
                break
            counter += 1
        output_path.rename(backup)
        logger.info("Backed up existing file to %s", backup)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False keeps non-ASCII characters readable in the file,
    # which is why the encoding is pinned to UTF-8 explicitly.
    output_path.write_text(
        json.dumps(profile.model_dump(), indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    logger.info("Wrote profile to %s", output_path)


def run(cv_path: Path, *, output_path: Path | None = None) -> Path:
    """Extract a CandidateProfile from a PDF CV and write it as JSON.

    Args:
        cv_path: Path to the input PDF file.
        output_path: Where to write the output JSON. Defaults to
            ``cv_path.with_suffix(".json")``.

    Returns:
        Path to the written JSON file.

    Raises:
        DataError: If the file does not exist or is not a PDF.
        ConfigurationError: If the LLM task config is missing.
        APIError: If the provider is unreachable.
        ParseError: If the LLM output cannot be validated.
    """
    if output_path is not None:
        resolved_output = output_path
    else:
        resolved_output = cv_path.with_suffix(".json")
    profile = extract(cv_path)
    write_profile(profile, resolved_output)
    # The summary is printed last so it appears after the write log lines;
    # see the comment above extract() for why it travels as an attribute.
    if hasattr(extract, "print_run_summary"):
        extract.print_run_summary()
    return resolved_output