# test_parse_cv_pipeline.py
"""Unit tests for pipeline/parse_cv.py — anonymization and orchestration.

Covers name anonymization edge cases, file validation, output path defaulting,
and JSON serialization with mocked service.
"""

__all__: list[str] = []

import json
from collections.abc import Iterator
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from exceptions import DataError
from models.coaching import (
    CandidateEducation,
    CandidateExperience,
    CandidateIdentity,
    CandidateProfile,
    CandidateProject,
    CandidatePublication,
)
from pipeline.parse_cv import _anonymize_name, render_preview, run, write_profile

# Value returned by the mocked ``settings.get_llm_config()``.
# NOTE(review): presumably (provider, model, temperature, seed, max_tokens) —
# confirm against the real settings object.
_LLM_CONFIG = ("anthropic", "test", 0.0, 42, 4096)

# Placeholder bytes written to the fake CV files; the parser is mocked, so the
# content only has to exist on disk.
_FAKE_PDF_BYTES = b"%PDF-1.4 minimal"


def _minimal_profile() -> CandidateProfile:
    """Return a profile with only a full name and an empty experience list."""
    return CandidateProfile(
        identity=CandidateIdentity(full_name="Test"),
        experiences=[],
    )


def _make_profile(full_name: str = "John Doe") -> CandidateProfile:
    """Return a profile with the given full name, title "Engineer", no experiences."""
    return CandidateProfile(
        identity=CandidateIdentity(full_name=full_name, title="Engineer"),
        experiences=[],
    )


@pytest.fixture
def mock_parse() -> Iterator[MagicMock]:
    """Patch the collaborators of ``pipeline.parse_cv`` that ``run`` tests mock.

    Replaces ``settings``, ``create_llm_provider`` and ``parse_cv`` in the
    pipeline module for the duration of the test. Yields the ``parse_cv`` mock,
    preset to return ``_make_profile()``; a test may override its
    ``return_value`` before calling ``run``.
    """
    with patch("pipeline.parse_cv.settings") as mock_settings, \
            patch("pipeline.parse_cv.create_llm_provider") as mock_create, \
            patch("pipeline.parse_cv.parse_cv") as mock_parse_cv:
        mock_settings.get_llm_config.return_value = _LLM_CONFIG
        mock_create.return_value = MagicMock()
        mock_parse_cv.return_value = _make_profile()
        yield mock_parse_cv


# ── Name anonymization tests ─────────────────────────────────────────────────


class TestAnonymizeName:
    """Edge cases for _anonymize_name — empty, single word, two words, compound."""

    def test_anonymize_empty_name(self) -> None:
        """An empty name is returned unchanged."""
        assert _anonymize_name("") == ""

    def test_anonymize_single_word_name(self) -> None:
        """A single-word name is returned unchanged."""
        assert _anonymize_name("Madonna") == "Madonna"

    def test_anonymize_two_word_name(self) -> None:
        """The last word of a two-word name is reduced to its initial."""
        assert _anonymize_name("John Smith") == "John S."

    def test_anonymize_compound_name(self) -> None:
        """First word + last word's initial; middle words are dropped."""
        assert _anonymize_name("Jean-Pierre De La Fontaine") == "Jean-Pierre F."


# ── Pipeline orchestration tests ─────────────────────────────────────────────


class TestRenderPreview:
    """Tests for render_preview — human-readable profile rendering."""

    def test_render_includes_all_sections_and_values(self) -> None:
        """All populated sections and their values appear in the output."""
        profile = CandidateProfile(
            identity=CandidateIdentity(
                full_name="John S.",
                title="Engineer",
                location="Paris",
                summary="Experienced.",
            ),
            experiences=[
                CandidateExperience(
                    id="exp_01", company="Acme", role="Dev",
                    start_date="2020-01", end_date="2023-06",
                    achievements=["Shipped product."],
                )
            ],
            education=[
                CandidateEducation(
                    id="edu_01", institution="MIT", degree="BS",
                    field="CS", year="2019",
                )
            ],
            skills_inventory=["Python", "Docker"],
            languages=["English (Native)"],
            projects=[
                CandidateProject(id="proj_01", name="Side project")
            ],
            publications=[
                CandidatePublication(
                    id="pat_01", type="Patent", title="Cool thing",
                    reference="US123",
                )
            ],
        )
        text = render_preview(profile)

        # Section headers present
        for section in ["IDENTITY", "EXPERIENCE", "EDUCATION", "SKILLS",
                        "LANGUAGES", "PROJECTS", "PUBLICATIONS"]:
            assert section in text

        # Key values present (format-agnostic)
        for value in ["John S.", "Engineer", "Paris", "Acme", "Dev",
                      "2020-01", "Shipped product.", "MIT", "BS", "CS",
                      "Python", "Docker", "English (Native)",
                      "Side project", "Patent", "Cool thing", "US123"]:
            assert value in text

    def test_render_omits_empty_sections(self) -> None:
        """Sections with no data must not appear."""
        text = render_preview(_minimal_profile())
        assert "=== IDENTITY ===" in text
        assert "=== EXPERIENCE (0 entries) ===" in text
        assert "EDUCATION" not in text
        assert "SKILLS" not in text
        assert "LANGUAGES" not in text


class TestWriteProfile:
    """Tests for write_profile — backup on overwrite."""

    def test_existing_file_backed_up(self, tmp_path: Path) -> None:
        """Existing JSON must be renamed to name_1.json before writing."""
        output = tmp_path / "cv.json"
        output.write_text('{"old": true}', encoding="utf-8")
        write_profile(_minimal_profile(), output)

        backup = tmp_path / "cv_1.json"
        assert backup.exists()
        assert backup.read_text(encoding="utf-8") == '{"old": true}'
        assert output.exists()
        assert '"full_name": "Test"' in output.read_text(encoding="utf-8")

    def test_multiple_backups_increment(self, tmp_path: Path) -> None:
        """Each overwrite creates the next numbered backup."""
        output = tmp_path / "cv.json"
        profile = _minimal_profile()

        output.write_text('{"v": 1}', encoding="utf-8")
        write_profile(profile, output)
        assert (tmp_path / "cv_1.json").exists()

        # Simulate re-extraction overwriting the freshly written file.
        output.write_text('{"v": 2}', encoding="utf-8")
        write_profile(profile, output)
        assert (tmp_path / "cv_2.json").exists()

    def test_no_backup_when_file_missing(self, tmp_path: Path) -> None:
        """No backup is created when the output doesn't exist yet."""
        output = tmp_path / "cv.json"
        write_profile(_minimal_profile(), output)

        assert not (tmp_path / "cv_1.json").exists()
        assert output.exists()


class TestRunPipeline:
    """Tests for pipeline.parse_cv.run — file validation, output, anonymization."""

    def test_missing_file_raises_data_error(self, tmp_path: Path) -> None:
        """Non-existent cv_path must be rejected before any LLM call."""
        missing = tmp_path / "nonexistent.pdf"
        with pytest.raises(DataError, match="not found"):
            run(missing)

    def test_non_pdf_extension_raises_data_error(self, tmp_path: Path) -> None:
        """.txt, .docx, etc. must be rejected."""
        txt_file = tmp_path / "resume.txt"
        txt_file.write_text("hello", encoding="utf-8")
        # ``match`` is a regular expression: escape the dot so that only a
        # literal ".pdf" in the message satisfies the assertion.
        with pytest.raises(DataError, match=r"\.pdf"):
            run(txt_file)

    @pytest.mark.usefixtures("mock_parse")
    def test_output_path_defaults_to_json_suffix(self, tmp_path: Path) -> None:
        """When output_path is None, defaults to cv_path.with_suffix(".json")."""
        pdf = tmp_path / "resume.pdf"
        pdf.write_bytes(_FAKE_PDF_BYTES)

        result = run(pdf)

        assert result == tmp_path / "resume.json"

    def test_run_writes_anonymized_json(
        self, tmp_path: Path, mock_parse: MagicMock
    ) -> None:
        """End-to-end with mocked service: written JSON has the anonymized name."""
        pdf = tmp_path / "cv.pdf"
        pdf.write_bytes(_FAKE_PDF_BYTES)
        output = tmp_path / "cv.json"
        mock_parse.return_value = _make_profile("John Smith")

        run(pdf, output_path=output)

        data = json.loads(output.read_text(encoding="utf-8"))
        assert data["identity"]["full_name"] == "John S."

    @pytest.mark.usefixtures("mock_parse")
    def test_run_returns_output_path(self, tmp_path: Path) -> None:
        """Return value is the Path to the written file."""
        pdf = tmp_path / "cv.pdf"
        pdf.write_bytes(_FAKE_PDF_BYTES)
        output = tmp_path / "result.json"

        result = run(pdf, output_path=output)

        assert result == output
        assert output.exists()