/ db / migrations / 119-unique-domain-keyword-country.sql
119-unique-domain-keyword-country.sql
 1  -- Migration 119: Add UNIQUE constraint on (domain, keyword, country_code)
 2  --
 3  -- Prevents the SERP insertion stage from creating duplicate rows for the same
 4  -- domain+keyword+country combination. The INSERT OR IGNORE in serps.js was
 5  -- always intended to be idempotent, but had nothing to enforce uniqueness.
 6  --
 7  -- Strategy:
 8  --   For each (domain, keyword, country_code) group, keep the row with the
 9  --   highest status rank (most progressed). Among ties, keep the highest id.
10  --   All other rows are deleted.
11  --
12  -- Status rank (higher = more valuable to keep):
13  --   outreach_sent/outreach_partial/replied/interested/closed = 8
14  --   proposals_drafted = 7
15  --   enriched/enriched_regex/enriched_llm = 6
16  --   vision_scored/semantic_scored = 5
17  --   prog_scored/high_score/failing = 4
18  --   assets_captured = 3
19  --   found = 2
20  --   ignored = 1
21  --
22  -- NOTE: This migration may take several minutes on large databases due to the
23  --       DELETE + index build across ~1.2M rows.
24  
25  -- Step 1: Delete all duplicate rows, keeping the "best" row per combo
26  DELETE FROM sites
27  WHERE id NOT IN (
28    SELECT id
29    FROM (
30      SELECT
31        id,
32        domain,
33        keyword,
34        country_code,
35        CASE status
36          WHEN 'outreach_sent'     THEN 8
37          WHEN 'outreach_partial'  THEN 8
38          WHEN 'replied'           THEN 8
39          WHEN 'interested'        THEN 8
40          WHEN 'closed'            THEN 8
41          WHEN 'not_interested'    THEN 8
42          WHEN 'proposals_drafted' THEN 7
43          WHEN 'enriched'          THEN 6
44          WHEN 'enriched_regex'    THEN 6
45          WHEN 'enriched_llm'      THEN 6
46          WHEN 'vision_scored'     THEN 5
47          WHEN 'semantic_scored'   THEN 5
48          WHEN 'prog_scored'       THEN 4
49          WHEN 'high_score'        THEN 4
50          WHEN 'failing'           THEN 4
51          WHEN 'assets_captured'   THEN 3
52          WHEN 'found'             THEN 2
53          WHEN 'ignored'           THEN 1
54          ELSE                          2
55        END AS status_rank
56      FROM sites
57    ) ranked
58    WHERE (domain, keyword, country_code, status_rank, id) IN (
59      SELECT domain, keyword, country_code, MAX(status_rank), MAX(id)
60      FROM (
61        SELECT
62          id,
63          domain,
64          keyword,
65          country_code,
66          CASE status
67            WHEN 'outreach_sent'     THEN 8
68            WHEN 'outreach_partial'  THEN 8
69            WHEN 'replied'           THEN 8
70            WHEN 'interested'        THEN 8
71            WHEN 'closed'            THEN 8
72            WHEN 'not_interested'    THEN 8
73            WHEN 'proposals_drafted' THEN 7
74            WHEN 'enriched'          THEN 6
75            WHEN 'enriched_regex'    THEN 6
76            WHEN 'enriched_llm'      THEN 6
77            WHEN 'vision_scored'     THEN 5
78            WHEN 'semantic_scored'   THEN 5
79            WHEN 'prog_scored'       THEN 4
80            WHEN 'high_score'        THEN 4
81            WHEN 'failing'           THEN 4
82            WHEN 'assets_captured'   THEN 3
83            WHEN 'found'             THEN 2
84            WHEN 'ignored'           THEN 1
85            ELSE                          2
86          END AS status_rank
87        FROM sites
88      )
89      GROUP BY domain, keyword, country_code
90    )
91  );
92  
93  -- Step 2: Add the UNIQUE index
94  -- Will fail fast if any duplicates remain (they shouldn't after step 1).
95  CREATE UNIQUE INDEX IF NOT EXISTS idx_sites_domain_keyword_country
96    ON sites (domain, keyword, country_code);