Cradicle Explorer

/ pipeline / parse_cv.py
parse_cv.py
  1  """Orchestrate CV-to-JSON extraction: read PDF, call service, anonymize, write output."""
  2  
  3  __all__ = ["extract", "render_preview", "write_profile", "run"]
  4  
  5  import json
  6  import logging
  7  from pathlib import Path
  8  
  9  from config import settings
 10  from exceptions import DataError
 11  from integrations.instrumented import InstrumentedProvider
 12  from integrations.llm import create_llm_provider
 13  from models.coaching import CandidateProfile
 14  from services.cv_parser import parse_cv
 15  from services.run_summary import print_summary
 16  from telemetry import RunStats
 17  
 18  logger = logging.getLogger(__name__)
 19  
 20  
 21  def _anonymize_name(full_name: str) -> str:
 22      """Truncate a full name to 'FirstName L.' format.
 23  
 24      Rules:
 25          - Empty string -> leave empty.
 26          - Single word (no spaces) -> leave as-is.
 27          - Multiple words -> first word + last word's initial + '.'.
 28      """
 29      if not full_name:
 30          return full_name
 31      parts = full_name.split()
 32      if len(parts) == 1:
 33          return full_name
 34      return f"{parts[0]} {parts[-1][0]}."
 35  
 36  
 37  def _with_anonymized_name(
 38      profile: CandidateProfile, anonymized_name: str
 39  ) -> CandidateProfile:
 40      """Return a new CandidateProfile with the identity name replaced."""
 41      identity_data = profile.identity.model_dump()
 42      identity_data["full_name"] = anonymized_name
 43      profile_data = profile.model_dump()
 44      profile_data["identity"] = identity_data
 45      return CandidateProfile.model_validate(profile_data)
 46  
 47  
 48  # Fragile pattern: extract() stores a deferred summary printer as a function
 49  # attribute (extract.print_run_summary) so that run() and the CLI can call it
 50  # after their own output.  This breaks silently if extract is renamed, wrapped
 51  # with a decorator, or moved to a class — the hasattr() check in run() / CLI
 52  # will return False and the summary won't print.  If this becomes a problem,
 53  # refactor to return a (profile, stats, model) tuple from an internal helper
 54  # and have extract() / run() / CLI each handle printing at the right time.
 55  def extract(cv_path: Path) -> CandidateProfile:
 56      """Extract and anonymize a CandidateProfile from a PDF CV.
 57  
 58      Args:
 59          cv_path: Path to the input PDF file.
 60  
 61      Returns:
 62          Anonymized CandidateProfile ready for review or writing.
 63  
 64      Raises:
 65          DataError: If the file does not exist or is not a PDF.
 66          ConfigurationError: If the LLM task config is missing.
 67          APIError: If the provider is unreachable.
 68          ParseError: If the LLM output cannot be validated.
 69      """
 70      if not cv_path.exists():
 71          raise DataError(f"CV file not found: {cv_path}")
 72      if cv_path.suffix.lower() != ".pdf":
 73          raise DataError(f"Expected a .pdf file, got {cv_path.suffix!r}")
 74  
 75      pdf_data = cv_path.read_bytes()
 76  
 77      provider, model, temperature, seed, max_tokens = settings.get_llm_config("parse_cv")
 78      llm = create_llm_provider(provider, model, settings, max_tokens=max_tokens)
 79      stats = RunStats()
 80      instrumented = InstrumentedProvider(llm, stats, phase="main")
 81      logger.info("Parsing CV from %s using %s/%s", cv_path.name, provider, model)
 82      profile = parse_cv(pdf_data, instrumented, temperature=temperature, seed=seed)
 83  
 84      anonymized_name = _anonymize_name(profile.identity.full_name)
 85      if anonymized_name != profile.identity.full_name:
 86          logger.debug(
 87              "Anonymized name: %s -> %s", profile.identity.full_name, anonymized_name
 88          )
 89  
 90      stats.finish()
 91      _deferred_summary = (stats, model)
 92  
 93      def _print_deferred() -> None:
 94          try:
 95              print_summary(
 96                  _deferred_summary[0],
 97                  pipeline="parse_cv",
 98                  phase_models={"main": _deferred_summary[1]},
 99              )
100          except Exception:
101              logger.warning("Failed to print run summary", exc_info=True)
102  
103      extract.print_run_summary = _print_deferred  # type: ignore[attr-defined]
104  
105      return _with_anonymized_name(profile, anonymized_name)
106  
107  
def render_preview(profile: CandidateProfile) -> str:
    """Format a CandidateProfile as plain text for terminal review.

    Sections appear in a fixed order: identity, experience, education,
    skills, languages, certifications, projects, publications. The identity
    and experience headers are always emitted; each remaining section is
    skipped when its collection is falsy. Optional fields are shown only
    when they are truthy.

    Args:
        profile: The profile to render.

    Returns:
        The rendered lines joined with newlines.
    """
    out: list[str] = []

    # --- Identity: header always shown, fields only when set ---
    identity = profile.identity
    out.append("=== IDENTITY ===")
    if identity.full_name:
        out.append(f"  Name:       {identity.full_name}")
    if identity.title:
        out.append(f"  Title:      {identity.title}")
    if identity.location:
        out.append(f"  Location:   {identity.location}")
    if identity.trajectory:
        out.append(f"  Trajectory: {identity.trajectory}")
    if identity.summary:
        out.append(f"  Summary:    {identity.summary}")
    out.append("")

    # --- Experience: header always shown; blank line after every entry ---
    out.append(f"=== EXPERIENCE ({len(profile.experiences)} entries) ===")
    for job in profile.experiences:
        out += [
            f"  [{job.id}]",
            f"    company:     {job.company}",
            f"    role:        {job.role}",
            f"    start_date:  {job.start_date}",
            f"    end_date:    {job.end_date}",
        ]
        if job.location:
            out.append(f"    location:    {job.location}")
        if job.description:
            out.append(f"    description: {job.description}")
        if job.skills:
            out.append(f"    skills:      {', '.join(job.skills)}")
        if job.achievements:
            out.append("    achievements:")
            out.extend(f"      - {item}" for item in job.achievements)
        out.append("")

    # --- Education: one blank line after the whole section ---
    if profile.education:
        out.append(f"=== EDUCATION ({len(profile.education)} entries) ===")
        for school in profile.education:
            out += [
                f"  [{school.id}]",
                f"    institution: {school.institution}",
                f"    degree:      {school.degree}",
            ]
            if school.field:
                out.append(f"    field:       {school.field}")
            if school.year:
                out.append(f"    year:        {school.year}")
            if school.honors:
                out.append(f"    honors:      {school.honors}")
            if school.relevant_coursework:
                out.append(
                    f"    coursework:  {', '.join(school.relevant_coursework)}"
                )
        out.append("")

    # --- Flat list sections ---
    if profile.skills_inventory:
        out += [
            f"=== SKILLS ({len(profile.skills_inventory)}) ===",
            f"  skills_inventory: {', '.join(profile.skills_inventory)}",
            "",
        ]

    if profile.languages:
        out += [
            f"=== LANGUAGES ({len(profile.languages)}) ===",
            f"  languages: {', '.join(profile.languages)}",
            "",
        ]

    if profile.certifications:
        out.append(f"=== CERTIFICATIONS ({len(profile.certifications)}) ===")
        out.extend(f"  - {certificate}" for certificate in profile.certifications)
        out.append("")

    # --- Projects ---
    if profile.projects:
        out.append(f"=== PROJECTS ({len(profile.projects)}) ===")
        for project in profile.projects:
            out += [
                f"  [{project.id}]",
                f"    name:        {project.name}",
            ]
            if project.description:
                out.append(f"    description: {project.description}")
            if project.technologies:
                out.append(f"    technologies: {', '.join(project.technologies)}")
            if project.url:
                out.append(f"    url:         {project.url}")
            if project.highlights:
                out.append("    highlights:")
                out.extend(f"      - {point}" for point in project.highlights)
        out.append("")

    # --- Publications ---
    if profile.publications:
        out.append(f"=== PUBLICATIONS ({len(profile.publications)}) ===")
        for publication in profile.publications:
            out += [
                f"  [{publication.id}]",
                f"    type:      {publication.type}",
                f"    title:     {publication.title}",
            ]
            if publication.reference:
                out.append(f"    reference: {publication.reference}")
        out.append("")

    return "\n".join(out)
212  
213  
def write_profile(profile: CandidateProfile, output_path: Path) -> None:
    """Write a CandidateProfile to a JSON file.

    If the output file already exists, it is first renamed to the first
    unused ``<stem>_<n><suffix>`` sibling (for example ``profile_1.json``,
    then ``profile_2.json``), so an earlier output is kept rather than
    overwritten.

    Args:
        profile: The profile to write.
        output_path: Destination file path. Missing parent directories are
            created.
    """
    if output_path.exists():
        stem = output_path.stem
        # Reuse the destination's own suffix so the backup keeps the same
        # file type as the file it preserves (".json" for the default path).
        suffix = output_path.suffix
        counter = 1
        backup = output_path.with_name(f"{stem}_{counter}{suffix}")
        while backup.exists():
            counter += 1
            backup = output_path.with_name(f"{stem}_{counter}{suffix}")
        output_path.rename(backup)
        logger.info("Backed up existing file to %s", backup)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False keeps non-ASCII characters readable in the output;
    # the file is therefore written explicitly as UTF-8.
    output_path.write_text(
        json.dumps(profile.model_dump(), indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    logger.info("Wrote profile to %s", output_path)
241  
242  
def run(cv_path: Path, *, output_path: Path | None = None) -> Path:
    """Run the full pipeline: extract a profile from a PDF CV and save it.

    Args:
        cv_path: Path to the input PDF file.
        output_path: Where to write the output JSON. Defaults to
            ``cv_path.with_suffix(".json")``.

    Returns:
        Path to the written JSON file.

    Raises:
        DataError: If the file does not exist or is not a PDF.
        ConfigurationError: If the LLM task config is missing.
        APIError: If the provider is unreachable.
        ParseError: If the LLM output cannot be validated.
    """
    # Resolve the destination before extracting, falling back to a sibling
    # of the CV with a .json suffix.
    if output_path:
        destination = output_path
    else:
        destination = cv_path.with_suffix(".json")

    write_profile(extract(cv_path), destination)

    # extract() publishes its deferred summary printer as a function
    # attribute; print it last so it follows the write log lines.
    try:
        deferred_summary = extract.print_run_summary  # type: ignore[attr-defined]
    except AttributeError:
        pass
    else:
        deferred_summary()
    return destination
265      return resolved_output