# sync_notion.py
"""Notion sync service — classify scored jobs and build Notion page data.

Pure domain logic, no I/O, no Notion client reference. Produces NotionJobPage
objects for the pipeline to pass to the integration layer.
"""

__all__ = ["classify_jobs", "build_page", "build_page_properties", "build_page_blocks"]

import logging
from typing import Any

import pandas as pd  # type: ignore[import-untyped]

from config import NotionPropertyMapping, NotionStatusMapping
from integrations.html_to_notion import html_to_notion_blocks
from models.notion import NotionJobPage, SyncCategory
from services.notion_blocks import RICH_TEXT_MAX_LENGTH, heading

logger = logging.getLogger(__name__)

_OUTREACH_OPPORTUNITY_THRESHOLD = 3
# Exact-equality match — only jobs with opportunity score of exactly 3 qualify.
# Using >= would widen the outreach net; keep == until that's explicitly desired.
# Independent from _NETWORKING_MAX_OPPORTUNITY in models/job.py.

_OUTREACH_EXCLUDED_FIT = {"good_fit", "perfect_match"}
# Fit categories that disqualify a job from outreach (already strong fits).


# ── Cell coercion helpers ────────────────────────────────────────────────────


def _is_missing(value: object) -> bool:
    """Return True when a cell value is None, NaN, NaT or pd.NA."""
    if value is None:
        return True
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna returns an array for list-like input; a list-like cell is
        # not a "missing" scalar, so treat it as present.
        return False


def _clean_text(value: object) -> str:
    """Return ``str(value)``, or ``""`` for a missing cell.

    Without this guard ``str(float("nan"))`` would put the literal text
    ``"nan"`` into the page data.
    """
    return "" if _is_missing(value) else str(value)


def _clean_identifier(value: object) -> str:
    """Return an identifier cell as text, or ``""`` when missing.

    A numeric column that contains blanks is read back as floats, so an
    integral float such as ``123.0`` is rendered as ``"123"``.
    """
    if _is_missing(value):
        return ""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _coerce_int(value: object, field: str, default: int = 0) -> int:
    """Convert a cell to int, falling back to *default*.

    Missing and blank cells yield *default* silently. Numeric strings such as
    ``"85"`` or ``"85.0"`` are accepted. Any other unparseable value is logged
    and replaced by *default* instead of aborting the whole run.
    """
    if _is_missing(value):
        return default
    if isinstance(value, str) and not value.strip():
        return default
    try:
        return int(value)  # type: ignore[call-overload]
    except (TypeError, ValueError, OverflowError):
        pass
    try:
        return int(float(str(value)))
    except (TypeError, ValueError, OverflowError):
        logger.warning(
            "Unparseable %s value %r; using %d", field, value, default
        )
        return default


def _normalize_fit_category(value: object) -> str:
    """Normalize a fit category value from Excel to a lowercase identifier.

    Strips the ``FitCategory.`` prefix that pandas may preserve from the enum
    serialization, then lowercases and trims whitespace. A missing cell
    (None/NaN) normalizes to ``""`` rather than ``"nan"``/``"none"``.
    """
    if _is_missing(value):
        return ""
    return str(value).replace("FitCategory.", "").strip().lower()


def classify_jobs(
    df: pd.DataFrame,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]], int]:
    """Classify jobs into outreach and manual categories.

    A row is *manual* when its ``OK`` cell is ``x`` (case-insensitive, trimmed).
    A row is *outreach* when its ``Net Opportunity`` equals the outreach
    threshold, its fit category is not in the excluded set, and it is not
    manual. Outreach rows are deduplicated per ``Company``, keeping the row
    with the highest ``Score``.

    Args:
        df: DataFrame with scored job rows. Must contain the columns ``OK``,
            ``Fit Category`` and ``Net Opportunity``; ``Score`` and ``Company``
            are read when at least one outreach row exists.

    Returns:
        Tuple of (outreach_rows, manual_rows, overlap_count) where each row
        is a dict from DataFrame.to_dict(orient="records"). The row dicts also
        carry the helper columns ``OK_clean``, ``fit_clean`` and ``net_opp``.
        ``overlap_count`` is the number of manual rows that would otherwise
        have qualified for outreach.

    Raises:
        KeyError: If a required column is absent from *df*.
    """
    # Work on a copy so the caller's frame never gains the helper columns.
    df = df.copy()
    df["OK_clean"] = df["OK"].fillna("").astype(str).str.strip().str.lower()
    df["fit_clean"] = df["Fit Category"].apply(_normalize_fit_category)
    df["net_opp"] = (
        pd.to_numeric(df["Net Opportunity"], errors="coerce").fillna(0).astype(int)
    )

    # Manual: user marked OK=x
    manual_mask = df["OK_clean"] == "x"

    # Outreach qualification, computed once and reused for both the outreach
    # selection and the overlap count so the two can never drift apart.
    qualifies_mask = (df["net_opp"] == _OUTREACH_OPPORTUNITY_THRESHOLD) & (
        ~df["fit_clean"].isin(_OUTREACH_EXCLUDED_FIT)
    )
    outreach_mask = qualifies_mask & ~manual_mask

    # Overlap: jobs that qualify for both (classified as manual).
    overlap_count = int((qualifies_mask & manual_mask).sum())

    manual_rows: list[dict[str, Any]] = df[manual_mask].to_dict(orient="records")

    # Outreach: deduplicate by Company, keep highest Score. The stable sort
    # makes equal-score ties resolve to the earliest row deterministically.
    outreach_df = df[outreach_mask]
    if not outreach_df.empty:
        outreach_df = outreach_df.sort_values(
            "Score", ascending=False, kind="stable"
        ).drop_duplicates(subset="Company", keep="first")
    outreach_rows: list[dict[str, Any]] = outreach_df.to_dict(orient="records")

    return outreach_rows, manual_rows, overlap_count


def build_page(
    row: dict[str, Any],
    category: SyncCategory,
    status_mapping: NotionStatusMapping,
) -> NotionJobPage:
    """Map an Excel row dict to a NotionJobPage.

    Missing cells (absent key, None, NaN) become ``""`` for text fields and
    ``0`` for numeric fields, so a blank spreadsheet cell neither raises nor
    leaks the literal text ``"nan"`` into the page.

    Args:
        row: Dict from DataFrame.to_dict(orient="records").
        category: Sync category for this page.
        status_mapping: Status text and icon per category.

    Returns:
        NotionJobPage value object.
    """
    status, icon = _get_status_and_icon(category, status_mapping)
    # Assumption: if LinkedIn did not provide a job_posting_id, the Notion page's Job ID
    # property will be written as "". get_uncoached_pages skips pages with an empty Job ID,
    # so such pages are invisible to the coach pipeline and cannot be coached via --sync-notion.
    return NotionJobPage(
        job_posting_id=_clean_identifier(row.get("job_posting_id")),
        job_title=_clean_text(row.get("Title")),
        company_name=_clean_text(row.get("Company")),
        job_location=_clean_text(row.get("Job Location")),
        url=_clean_text(row.get("URL")),
        company_url=_clean_text(row.get("Company LinkedIn")),
        score=_coerce_int(row.get("Score"), "Score"),
        fit_category=_normalize_fit_category(row.get("Fit Category")),
        networking_opportunity=_coerce_int(
            row.get("Net Opportunity"), "Net Opportunity"
        ),
        networking_rationale=_clean_text(row.get("Net Rationale")),
        reasoning=_clean_text(row.get("AI Reasoning")),
        job_description_html=_clean_text(row.get("job_description_html")),
        category=category,
        status=status,
        icon=icon,
    )


def _get_status_and_icon(
    category: SyncCategory,
    status_mapping: NotionStatusMapping,
) -> tuple[str, str]:
    """Return the Notion status and icon emoji for a sync category.

    Any category other than ``SyncCategory.OUTREACH`` maps to the manual pair.
    """
    if category == SyncCategory.OUTREACH:
        return status_mapping.outreach_status, status_mapping.outreach_icon
    return status_mapping.manual_status, status_mapping.manual_icon


# ── Property and block builders for Notion page creation ─────────────────────

_DESCRIPTION_TOGGLE_LABEL = "\U0001f4cb Job Description"
# Toggle heading wrapping the job description body — closed by default in Notion.


def _text_run(content: str) -> list[dict[str, Any]]:
    """Return a single-element Notion text array, capped at RICH_TEXT_MAX_LENGTH.

    NOTE(review): RICH_TEXT_MAX_LENGTH is presumably Notion's per-text-object
    content limit — confirm in services.notion_blocks.
    """
    return [{"text": {"content": content[:RICH_TEXT_MAX_LENGTH]}}]


def build_page_properties(
    page: NotionJobPage,
    pm: NotionPropertyMapping,
) -> dict[str, Any]:
    """Build the Notion properties payload for a job page.

    Every text value (title and rich_text) is truncated to
    RICH_TEXT_MAX_LENGTH. URL properties are only emitted when non-empty, and
    the fit-category select is omitted when the category is blank.

    Args:
        page: Job page data.
        pm: Property name mapping.

    Returns:
        Dict of Notion property definitions ready for create_page().
    """
    title_text = f"{page.job_title} @ {page.company_name}"

    props: dict[str, Any] = {
        pm.title: {"title": _text_run(title_text)},
        pm.position: {"rich_text": _text_run(page.job_title)},
        pm.status: {"select": {"name": page.status}},
        pm.job_id: {"rich_text": _text_run(page.job_posting_id)},
        pm.ai_reasoning: {"rich_text": _text_run(page.reasoning)},
        pm.company: {"rich_text": _text_run(page.company_name)},
        pm.location: {"rich_text": _text_run(page.job_location)},
        pm.fit_category: (
            {"select": {"name": page.fit_category}} if page.fit_category else {}
        ),
        pm.networking_signal: {"number": page.networking_opportunity},
        pm.networking_rationale: {
            "rich_text": _text_run(page.networking_rationale)
        },
        pm.score: {"number": page.score},
        pm.coached: {"checkbox": False},
    }

    if page.url:
        props[pm.job_url] = {"url": page.url}
    if page.company_url:
        props[pm.company_linkedin] = {"url": page.company_url}

    # Remove empty property dicts (e.g. fit_category when blank).
    return {k: v for k, v in props.items() if v}


def build_page_blocks(page: NotionJobPage) -> list[dict[str, Any]]:
    """Build initial content blocks from a job page's HTML description.

    Wraps the HTML blocks in a toggleable heading_1 with the "Job Description"
    label. Returns an empty list if the description is empty or converts to
    no blocks.

    Args:
        page: Job page data.

    Returns:
        List of Notion block dicts ready for create_page(blocks=...).
    """
    if not page.job_description_html:
        return []

    html_blocks = html_to_notion_blocks(page.job_description_html)
    if not html_blocks:
        return []
    return [
        heading(
            1,
            _DESCRIPTION_TOGGLE_LABEL,
            is_toggleable=True,
            children=html_blocks,
        )
    ]