/ db / migrations / 111-evidence-indexes.sql
111-evidence-indexes.sql
 1  -- Migration 111: Indexes for evidence pipeline queries
 2  --
 3  -- Context: Three new TEXT columns were added to sites in migration 108:
 4  --   evidence_pass1_json, evidence_pass2_json, evidence_json
 5  --
 6  -- Problem: 1.187M row table. Two of the three evidence queries anchor on
 7  -- idx_sites_score and return 54,103 rows before applying status residual
 8  -- predicates, ultimately yielding only ~425 rows. The index is 127x too wide.
 9  -- The third query (fetchEvidenceMerge) uses idx_sites_status and scans 17,542
10  -- rows to return 0 today (columns just added) — that ratio will worsen as the
11  -- status set grows.
12  --
13  -- Fix: Two partial composite indexes that match the tightest predicate
14  -- combinations in these queries:
15  --
16  -- Index 1: idx_sites_evidence_pipeline
17  --   Covers queries 1 (fetchProposalsBatch) and 3 (gather-evidence).
18  --   Both filter: status IN ('enriched','enriched_llm') AND score < 82
19  --   The partial WHERE clause restricts the index to 431 entries today vs
20  --   54,103 entries that idx_sites_score covers. score column is included so
21  --   the score < 82 range scan happens inside the index without a table lookup.
22  --
23  -- Index 2: idx_sites_evidence_merge
24  --   Covers query 2 (fetchEvidenceMerge).
25  --   Filters: status IN (5 values) AND evidence_json IS NULL
26  --   Partial WHERE restricts to rows where evidence_json IS NULL — the exact
27  --   working set this query targets. Once a site's evidence_json is written the
28  --   row drops out of the index automatically, keeping it narrow over time.
29  --   score_json column is included because fetchEvidenceMerge also requires
30  --   score_json IS NOT NULL as a residual filter.
31  
32  -- Index for fetchProposalsBatch and gather-evidence:
33  --   WHERE status IN ('enriched','enriched_llm') AND score IS NOT NULL AND score < 82
34  CREATE INDEX IF NOT EXISTS idx_sites_evidence_pipeline
35      ON sites(status, score)
36      WHERE status IN ('enriched', 'enriched_llm');
37  
38  -- Index for fetchEvidenceMerge:
39  --   WHERE evidence_json IS NULL AND status IN ('enriched','enriched_llm',
40  --         'proposals_drafted','outreach_partial','outreach_sent')
41  CREATE INDEX IF NOT EXISTS idx_sites_evidence_merge
42      ON sites(status, score_json)
43      WHERE evidence_json IS NULL
44        AND status IN ('enriched', 'enriched_llm', 'proposals_drafted',
45                       'outreach_partial', 'outreach_sent');