# cv_parser.py
"""Extract structured candidate profile from a PDF CV."""

__all__ = ["parse_cv"]

import logging
from pathlib import Path

from pydantic import ValidationError
from tenacity import (
    before_sleep_log,
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

from exceptions import ConfigurationError, DataError, ParseError
from integrations.llm import LLMProvider
from models.coaching import CandidateProfile
from models.llm import Attachment

logger = logging.getLogger(__name__)

_MAX_PDF_BYTES = 10 * 1024 * 1024  # 10 MB — CVs are typically <2 MB

# The system prompt is read once, at import time, from
# ``<parent of this module's directory>/prompts/parse_cv_system.txt``.
# A missing file aborts the import with ConfigurationError instead of
# surfacing later on the first parse_cv() call.
_PROMPT_PATH = Path(__file__).resolve().parents[1] / "prompts" / "parse_cv_system.txt"
if not _PROMPT_PATH.exists():
    raise ConfigurationError(f"prompt file {_PROMPT_PATH}")
_SYSTEM_PROMPT: str = _PROMPT_PATH.read_text(encoding="utf-8")


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(min=1, max=8),
    # Only ParseError triggers another attempt; DataError from the input
    # checks (and anything else) propagates immediately.
    retry=retry_if_exception_type(ParseError),
    # before_sleep runs only when another attempt is actually scheduled, so
    # the "retrying" message is never emitted for the final, fatal failure.
    before_sleep=before_sleep_log(logger, logging.INFO),
    reraise=True,
)
def parse_cv(
    pdf_data: bytes,
    llm: LLMProvider,
    *,
    temperature: float | None = None,
    seed: int | None = None,
) -> CandidateProfile:
    """Extract a structured candidate profile from raw PDF bytes.

    The PDF is sent to the LLM as an attachment together with the module's
    system prompt, and the response is validated as a ``CandidateProfile``.
    The whole call (input checks, LLM request, validation) is attempted up
    to 3 times with exponential backoff when validation fails.

    Args:
        pdf_data: Raw PDF file bytes.
        llm: LLM provider instance (injected by caller).
        temperature: Sampling temperature forwarded to llm.complete().
        seed: Reproducibility seed forwarded to llm.complete().

    Returns:
        Validated CandidateProfile parsed from the LLM response.

    Raises:
        DataError: If pdf_data is empty or exceeds the size limit.
        ParseError: If the LLM response cannot be validated after 3 attempts.
    """
    # Reject unusable input before spending an LLM call on it.
    if not pdf_data:
        raise DataError("pdf_data is empty")
    if len(pdf_data) > _MAX_PDF_BYTES:
        raise DataError(
            f"pdf_data is {len(pdf_data)} bytes, exceeds limit of {_MAX_PDF_BYTES} bytes (10 MB)"
        )

    pdf_attachment = Attachment(data=pdf_data, media_type="application/pdf")

    response_text = llm.complete(
        system=_SYSTEM_PROMPT,
        user="Extract all information from this CV.",
        temperature=temperature,
        seed=seed,
        response_schema=CandidateProfile,
        attachments=[pdf_attachment],
    )

    try:
        return CandidateProfile.model_validate_json(response_text)
    except ValidationError as exc:
        # The raw response is kept at DEBUG level only; the warning carries
        # just the validation error. Whether a retry follows is reported by
        # the decorator's before_sleep hook, not claimed here.
        logger.debug("parse_cv raw response: %s", response_text)
        logger.warning("parse_cv validation failed: %s", exc)
        raise ParseError("parse_cv", str(exc), raw=response_text) from exc