119-unique-domain-keyword-country.sql
1 -- Migration 119: Add UNIQUE constraint on (domain, keyword, country_code) 2 -- 3 -- Prevents the SERP insertion stage from creating duplicate rows for the same 4 -- domain+keyword+country combination. The INSERT OR IGNORE in serps.js was 5 -- always intended to be idempotent, but had nothing to enforce uniqueness. 6 -- 7 -- Strategy: 8 -- For each (domain, keyword, country_code) group, keep the row with the 9 -- highest status rank (most progressed). Among ties, keep the highest id. 10 -- All other rows are deleted. 11 -- 12 -- Status rank (higher = more valuable to keep): 13 -- outreach_sent/outreach_partial/replied/interested/closed = 8 14 -- proposals_drafted = 7 15 -- enriched/enriched_regex/enriched_llm = 6 16 -- vision_scored/semantic_scored = 5 17 -- prog_scored/high_score/failing = 4 18 -- assets_captured = 3 19 -- found = 2 20 -- ignored = 1 21 -- 22 -- NOTE: This migration may take several minutes on large databases due to the 23 -- DELETE + index build across ~1.2M rows. 24 25 -- Step 1: Delete all duplicate rows, keeping the "best" row per combo 26 DELETE FROM sites 27 WHERE id NOT IN ( 28 SELECT id 29 FROM ( 30 SELECT 31 id, 32 domain, 33 keyword, 34 country_code, 35 CASE status 36 WHEN 'outreach_sent' THEN 8 37 WHEN 'outreach_partial' THEN 8 38 WHEN 'replied' THEN 8 39 WHEN 'interested' THEN 8 40 WHEN 'closed' THEN 8 41 WHEN 'not_interested' THEN 8 42 WHEN 'proposals_drafted' THEN 7 43 WHEN 'enriched' THEN 6 44 WHEN 'enriched_regex' THEN 6 45 WHEN 'enriched_llm' THEN 6 46 WHEN 'vision_scored' THEN 5 47 WHEN 'semantic_scored' THEN 5 48 WHEN 'prog_scored' THEN 4 49 WHEN 'high_score' THEN 4 50 WHEN 'failing' THEN 4 51 WHEN 'assets_captured' THEN 3 52 WHEN 'found' THEN 2 53 WHEN 'ignored' THEN 1 54 ELSE 2 55 END AS status_rank 56 FROM sites 57 ) ranked 58 WHERE (domain, keyword, country_code, status_rank, id) IN ( 59 SELECT domain, keyword, country_code, MAX(status_rank), MAX(id) 60 FROM ( 61 SELECT 62 id, 63 domain, 64 keyword, 65 country_code, 66 CASE status 67 WHEN 'outreach_sent' THEN 8 68 WHEN 'outreach_partial' THEN 8 69 WHEN 'replied' THEN 8 70 WHEN 'interested' THEN 8 71 WHEN 'closed' THEN 8 72 WHEN 'not_interested' THEN 8 73 WHEN 'proposals_drafted' THEN 7 74 WHEN 'enriched' THEN 6 75 WHEN 'enriched_regex' THEN 6 76 WHEN 'enriched_llm' THEN 6 77 WHEN 'vision_scored' THEN 5 78 WHEN 'semantic_scored' THEN 5 79 WHEN 'prog_scored' THEN 4 80 WHEN 'high_score' THEN 4 81 WHEN 'failing' THEN 4 82 WHEN 'assets_captured' THEN 3 83 WHEN 'found' THEN 2 84 WHEN 'ignored' THEN 1 85 ELSE 2 86 END AS status_rank 87 FROM sites 88 ) 89 GROUP BY domain, keyword, country_code 90 ) 91 ); 92 93 -- Step 2: Add the UNIQUE index 94 -- Will fail fast if any duplicates remain (they shouldn't after step 1). 95 CREATE UNIQUE INDEX IF NOT EXISTS idx_sites_domain_keyword_country 96 ON sites (domain, keyword, country_code);