/ services / cv_parser.py
cv_parser.py
 1  """Extract structured candidate profile from a PDF CV."""
 2  
 3  __all__ = ["parse_cv"]
 4  
 5  import logging
 6  from pathlib import Path
 7  
 8  from pydantic import ValidationError
 9  from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential
10  
11  from exceptions import ConfigurationError, DataError, ParseError
12  from integrations.llm import LLMProvider
13  from models.coaching import CandidateProfile
14  from models.llm import Attachment
15  
logger = logging.getLogger(__name__)

# Largest PDF payload parse_cv accepts: 10 MB. CVs are typically <2 MB, so
# anything above this limit is rejected before it is sent to the LLM.
_MAX_PDF_BYTES = 10 * 1024**2

# The system prompt is stored in the "prompts" directory that sits next to
# this module's parent directory.
_PROMPT_PATH = Path(__file__).resolve().parent.parent.joinpath("prompts", "parse_cv_system.txt")

# Checked at import time so a missing prompt surfaces as a configuration
# problem when the module loads, not on the first parse_cv call.
# NOTE(review): the message does not state that the file is missing — confirm
# that ConfigurationError adds that context when it is rendered.
if not _PROMPT_PATH.exists():
    raise ConfigurationError(f"prompt file {_PROMPT_PATH}")

# Read once at import and reused for every parse_cv call.
_SYSTEM_PROMPT: str = _PROMPT_PATH.read_text(encoding="utf-8")
24  
25  
def _log_parse_cv_retry(retry_state) -> None:
    """Log that parse_cv is about to be retried.

    Installed as tenacity's ``before_sleep`` hook, which runs only when another
    attempt will follow, so the final failed attempt never announces a retry.

    Args:
        retry_state: tenacity's per-call retry state; only ``attempt_number``
            is read.
    """
    logger.warning(
        "parse_cv attempt %d failed validation; retrying",
        retry_state.attempt_number,
    )


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(min=1, max=8),
    retry=retry_if_exception_type(ParseError),
    before_sleep=_log_parse_cv_retry,
    reraise=True,
)
def parse_cv(
    pdf_data: bytes,
    llm: LLMProvider,
    *,
    temperature: float | None = None,
    seed: int | None = None,
) -> CandidateProfile:
    """Extract a structured candidate profile from raw PDF bytes.

    The whole call is attempted up to 3 times, with an exponential wait bounded
    to 1-8 seconds between attempts. Only ParseError triggers another attempt;
    after the last attempt the ParseError is re-raised to the caller.

    Args:
        pdf_data: Raw PDF file bytes.
        llm: LLM provider instance (injected by caller).
        temperature: Sampling temperature forwarded to llm.complete().
        seed: Reproducibility seed forwarded to llm.complete().

    Returns:
        Validated CandidateProfile parsed from the LLM response.

    Raises:
        DataError: If pdf_data is empty or exceeds the size limit.
        ParseError: If the LLM response cannot be validated after 3 attempts.
    """
    if not pdf_data:
        raise DataError("pdf_data is empty")
    if len(pdf_data) > _MAX_PDF_BYTES:
        # Derive the human-readable limit from the constant so the message
        # cannot drift out of sync if the limit is ever changed.
        limit_mb = _MAX_PDF_BYTES // (1024 * 1024)
        raise DataError(
            f"pdf_data is {len(pdf_data)} bytes, "
            f"exceeds limit of {_MAX_PDF_BYTES} bytes ({limit_mb} MB)"
        )

    attachment = Attachment(data=pdf_data, media_type="application/pdf")

    raw = llm.complete(
        system=_SYSTEM_PROMPT,
        user="Extract all information from this CV.",
        temperature=temperature,
        seed=seed,
        response_schema=CandidateProfile,
        attachments=[attachment],
    )

    try:
        profile = CandidateProfile.model_validate_json(raw)
    except ValidationError as e:
        # The full payload is logged at DEBUG only; the WARNING carries just
        # the validation error. Whether a retry follows is reported by the
        # before_sleep hook, not here, because this also runs on the last attempt.
        logger.debug("parse_cv raw response: %s", raw)
        logger.warning("parse_cv validation failed: %s", e)
        raise ParseError("parse_cv", str(e), raw=raw) from e

    return profile