Cradicle Explorer

/ tests / test_parse_cv_pipeline.py
test_parse_cv_pipeline.py
  1  """Unit tests for pipeline/parse_cv.py — anonymization and orchestration.
  2  
Covers name anonymization edge cases, preview rendering, numbered backups when
the output file already exists, file validation, output path defaulting, and
JSON serialization with a mocked service.
  5  """
  6  
  7  __all__: list[str] = []
  8  
  9  import json
 10  from pathlib import Path
 11  from unittest.mock import MagicMock, patch
 12  
 13  import pytest
 14  
 15  from exceptions import DataError
 16  from models.coaching import CandidateIdentity, CandidateProfile
 17  from pipeline.parse_cv import _anonymize_name, render_preview, run, write_profile
 18  
 19  # ── Name anonymization tests ─────────────────────────────────────────────────
 20  
 21  
 22  class TestAnonymizeName:
 23      """Edge cases for _anonymize_name — empty, single word, two words, compound."""
 24  
 25      def test_anonymize_empty_name(self) -> None:
 26          assert _anonymize_name("") == ""
 27  
 28      def test_anonymize_single_word_name(self) -> None:
 29          assert _anonymize_name("Madonna") == "Madonna"
 30  
 31      def test_anonymize_two_word_name(self) -> None:
 32          assert _anonymize_name("John Smith") == "John S."
 33  
 34      def test_anonymize_compound_name(self) -> None:
 35          # First word + last word's initial: "Jean-Pierre De La Fontaine" -> "Jean-Pierre F."
 36          assert _anonymize_name("Jean-Pierre De La Fontaine") == "Jean-Pierre F."
 37  
 38  
 39  # ── Pipeline orchestration tests ─────────────────────────────────────────────
 40  
 41  
 42  class TestRenderPreview:
 43      """Tests for render_preview — human-readable profile rendering."""
 44  
 45      def test_render_includes_all_sections_and_values(self) -> None:
 46          # Verify all populated sections and their values appear in the output.
 47          from models.coaching import (
 48              CandidateEducation,
 49              CandidateExperience,
 50              CandidateProject,
 51              CandidatePublication,
 52          )
 53  
 54          profile = CandidateProfile(
 55              identity=CandidateIdentity(
 56                  full_name="John S.",
 57                  title="Engineer",
 58                  location="Paris",
 59                  summary="Experienced.",
 60              ),
 61              experiences=[
 62                  CandidateExperience(
 63                      id="exp_01", company="Acme", role="Dev",
 64                      start_date="2020-01", end_date="2023-06",
 65                      achievements=["Shipped product."],
 66                  )
 67              ],
 68              education=[
 69                  CandidateEducation(
 70                      id="edu_01", institution="MIT", degree="BS",
 71                      field="CS", year="2019",
 72                  )
 73              ],
 74              skills_inventory=["Python", "Docker"],
 75              languages=["English (Native)"],
 76              projects=[
 77                  CandidateProject(id="proj_01", name="Side project")
 78              ],
 79              publications=[
 80                  CandidatePublication(
 81                      id="pat_01", type="Patent", title="Cool thing",
 82                      reference="US123",
 83                  )
 84              ],
 85          )
 86          text = render_preview(profile)
 87  
 88          # Section headers present
 89          for section in ["IDENTITY", "EXPERIENCE", "EDUCATION", "SKILLS",
 90                          "LANGUAGES", "PROJECTS", "PUBLICATIONS"]:
 91              assert section in text
 92  
 93          # Key values present (format-agnostic)
 94          for value in ["John S.", "Engineer", "Paris", "Acme", "Dev",
 95                         "2020-01", "Shipped product.", "MIT", "BS", "CS",
 96                         "Python", "Docker", "English (Native)",
 97                         "Side project", "Patent", "Cool thing", "US123"]:
 98              assert value in text
 99  
100      def test_render_omits_empty_sections(self) -> None:
101          # Sections with no data must not appear.
102          profile = CandidateProfile(
103              identity=CandidateIdentity(full_name="Test"),
104              experiences=[],
105          )
106          text = render_preview(profile)
107          assert "=== IDENTITY ===" in text
108          assert "=== EXPERIENCE (0 entries) ===" in text
109          assert "EDUCATION" not in text
110          assert "SKILLS" not in text
111          assert "LANGUAGES" not in text
112  
113  
class TestWriteProfile:
    """Tests for write_profile — numbered backups when the target already exists."""

    @staticmethod
    def _bare_profile() -> "CandidateProfile":
        """Return the smallest profile these tests need: a name and no experiences."""
        return CandidateProfile(
            identity=CandidateIdentity(full_name="Test"),
            experiences=[],
        )

    def test_existing_file_backed_up(self, tmp_path: Path) -> None:
        """A pre-existing cv.json is preserved as cv_1.json and then replaced."""
        target = tmp_path / "cv.json"
        previous_content = '{"old": true}'
        target.write_text(previous_content)

        write_profile(self._bare_profile(), target)

        backup = target.with_name("cv_1.json")
        assert backup.exists()
        assert backup.read_text() == previous_content
        # The target itself now holds the freshly written profile.
        assert target.exists()
        assert '"full_name": "Test"' in target.read_text()

    def test_multiple_backups_increment(self, tmp_path: Path) -> None:
        """Successive overwrites produce cv_1.json, then cv_2.json."""
        target = tmp_path / "cv.json"
        profile = self._bare_profile()

        target.write_text('{"v": 1}')
        write_profile(profile, target)
        assert target.with_name("cv_1.json").exists()

        # Put new content in place to simulate a re-extraction before the next write.
        target.write_text('{"v": 2}')
        write_profile(profile, target)
        assert target.with_name("cv_2.json").exists()

    def test_no_backup_when_file_missing(self, tmp_path: Path) -> None:
        """Writing to a path that does not exist yet creates no backup file."""
        target = tmp_path / "cv.json"

        write_profile(self._bare_profile(), target)

        assert not target.with_name("cv_1.json").exists()
        assert target.exists()
159  
160  
def _make_profile(full_name: str = "John Doe") -> CandidateProfile:
    """Build a minimal profile: the given name, title "Engineer", no experiences."""
    identity = CandidateIdentity(full_name=full_name, title="Engineer")
    return CandidateProfile(identity=identity, experiences=[])
166  
167  
class TestRunPipeline:
    """Tests for pipeline.parse_cv.run — file validation, output, anonymization.

    The happy-path tests replace ``settings``, ``create_llm_provider`` and
    ``parse_cv`` inside ``pipeline.parse_cv`` with mocks, so ``run`` is
    exercised without a real provider or real PDF parsing.
    """

    # Value handed back by the mocked settings.get_llm_config().
    # NOTE(review): presumably (provider, model, temperature, seed, max_tokens)
    # — confirm against the real settings.get_llm_config.
    _LLM_CONFIG = ("anthropic", "test", 0.0, 42, 4096)

    # Placeholder bytes written to the fake CV; parse_cv is mocked in these
    # tests, so the content only needs to exist on disk.
    _STUB_PDF = b"%PDF-1.4 minimal"

    @staticmethod
    def _run_mocked(cv_path: Path, profile: "CandidateProfile", **run_kwargs: object) -> Path:
        """Call ``run(cv_path, **run_kwargs)`` with settings, provider and parser mocked.

        Args:
            cv_path: Path passed to ``run`` as the CV location.
            profile: Profile the mocked ``parse_cv`` returns.
            **run_kwargs: Forwarded to ``run`` untouched (e.g. ``output_path``);
                omitted keys keep ``run``'s own defaults.

        Returns:
            Whatever ``run`` returns (the tests compare it to a ``Path``).
        """
        with patch("pipeline.parse_cv.settings") as mock_settings:
            with patch("pipeline.parse_cv.create_llm_provider") as mock_create:
                with patch("pipeline.parse_cv.parse_cv") as mock_parse:
                    mock_settings.get_llm_config.return_value = TestRunPipeline._LLM_CONFIG
                    mock_create.return_value = MagicMock()
                    mock_parse.return_value = profile
                    return run(cv_path, **run_kwargs)

    def test_missing_file_raises_data_error(self, tmp_path: Path) -> None:
        """A cv_path that does not exist raises DataError mentioning "not found".

        No mocks are installed here; the intent is that the path check fires
        before any provider is needed (that ordering is not asserted).
        """
        missing = tmp_path / "nonexistent.pdf"
        with pytest.raises(DataError, match="not found"):
            run(missing)

    def test_non_pdf_extension_raises_data_error(self, tmp_path: Path) -> None:
        """An existing file without a .pdf extension raises DataError naming ".pdf"."""
        txt_file = tmp_path / "resume.txt"
        txt_file.write_text("hello")
        # match= is a regular expression: the dot is escaped so that only a
        # literal ".pdf" in the message satisfies it, not any character + "pdf".
        with pytest.raises(DataError, match=r"\.pdf"):
            run(txt_file)

    def test_output_path_defaults_to_json_suffix(self, tmp_path: Path) -> None:
        """Without output_path, run returns cv_path with its suffix swapped to .json."""
        pdf = tmp_path / "resume.pdf"
        pdf.write_bytes(self._STUB_PDF)
        expected_output = tmp_path / "resume.json"

        result = self._run_mocked(pdf, _make_profile())

        assert result == expected_output

    def test_run_writes_anonymized_json(self, tmp_path: Path) -> None:
        """The JSON written by run carries the anonymized name, not the parsed one."""
        pdf = tmp_path / "cv.pdf"
        pdf.write_bytes(self._STUB_PDF)
        output = tmp_path / "cv.json"

        self._run_mocked(pdf, _make_profile("John Smith"), output_path=output)

        data = json.loads(output.read_text(encoding="utf-8"))
        assert data["identity"]["full_name"] == "John S."

    def test_run_returns_output_path(self, tmp_path: Path) -> None:
        """run returns the explicit output_path, and that file exists afterwards."""
        pdf = tmp_path / "cv.pdf"
        pdf.write_bytes(self._STUB_PDF)
        output = tmp_path / "result.json"

        result = self._run_mocked(pdf, _make_profile(), output_path=output)

        assert result == output
        assert output.exists()