098-enriched-status-split.sql
1 -- Migration 098: Split 'enriched' status into 'enriched_regex' / 'enriched_llm', 2 -- and add key_pages_html column for storing rendered HTML from 3 -- contact/about pages during the browser enrichment pass. 4 -- 5 -- Status flow change: 6 -- semantic_scored → (pipeline browser pass) → enriched_regex 7 -- → (orchestrator LLM pass) → enriched_llm 8 -- 9 -- key_pages_html: JSON object mapping page URL → rendered HTML captured during 10 -- the browser pass. Used by the enrich_sites orchestrator batch. Cleared 11 -- (set to NULL) once enriched_llm is complete to reclaim disk space. 12 -- 13 -- 'enriched' is kept in the CHECK constraint so existing rows remain valid 14 -- and any sites that skip the browser pass (already have a contact form) can 15 -- still be set directly to 'enriched' by the pipeline. 16 -- 17 -- SQLite CHECK constraints cannot be modified in place; requires table recreation. 18 19 PRAGMA foreign_keys = OFF; 20 21 -- ============================================================ 22 -- SITES TABLE 23 -- ============================================================ 24 25 ALTER TABLE sites RENAME TO sites_old; 26 27 CREATE TABLE sites ( 28 id INTEGER PRIMARY KEY AUTOINCREMENT, 29 domain TEXT NOT NULL, 30 landing_page_url TEXT NOT NULL, 31 keyword TEXT NOT NULL, 32 screenshot_path TEXT, 33 html_dom TEXT, 34 http_status_code INTEGER, 35 score_json TEXT, 36 score REAL, 37 grade TEXT, 38 scored_at DATETIME, 39 rescored_at DATETIME, 40 enriched_at DATETIME, 41 competitor_domain TEXT, 42 contacts_json TEXT, 43 city TEXT, 44 country_code TEXT, 45 last_outreach_at DATETIME, 46 status TEXT DEFAULT 'found' CHECK(status IN ( 47 'found', 'assets_captured', 'prog_scored', 'semantic_scored', 'vision_scored', 48 'enriched_regex', 'enriched_llm', 'enriched', 49 'proposals_drafted', 'outreach_partial', 'outreach_sent', 50 'ignored', 'failing', 'high_score', 51 -- legacy values kept for rows that survived migration 097 not running 52 'scored', 'rescored', 'ignore' 53 )), 54 error_message TEXT, 55 created_at DATETIME DEFAULT CURRENT_TIMESTAMP, 56 updated_at DATETIME DEFAULT CURRENT_TIMESTAMP, 57 google_domain TEXT, 58 language_code TEXT, 59 currency_code TEXT, 60 company_proof TEXT DEFAULT NULL, 61 gdpr_verified BOOLEAN DEFAULT NULL, 62 gdpr_verified_at TIMESTAMP DEFAULT NULL, 63 ssl_status TEXT CHECK(ssl_status IN ('https', 'http', 'mixed', 'error')), 64 http_headers TEXT, 65 recapture_count INTEGER DEFAULT 0, 66 recapture_at DATETIME, 67 state TEXT, 68 locale_data TEXT, 69 retry_count INTEGER DEFAULT 0, 70 last_retry_at DATETIME, 71 assets_captured_at DATETIME, 72 form_fill_data TEXT, 73 conversation_status TEXT DEFAULT NULL CHECK(conversation_status IN ( 74 'draft', 'active', 'qualified', 'payment_requested', 'paid', 75 'report_delivered', 'not_interested', 'closed', 'unsubscribed' 76 )), 77 resulted_in_sale INTEGER DEFAULT 0, 78 sale_amount REAL DEFAULT 0, 79 key_pages_html TEXT DEFAULT NULL 80 ); 81 82 INSERT INTO sites SELECT 83 id, 84 domain, 85 landing_page_url, 86 keyword, 87 screenshot_path, 88 html_dom, 89 http_status_code, 90 score_json, 91 score, 92 grade, 93 scored_at, 94 rescored_at, 95 enriched_at, 96 competitor_domain, 97 contacts_json, 98 city, 99 country_code, 100 last_outreach_at, 101 status, 102 error_message, 103 created_at, 104 updated_at, 105 google_domain, 106 language_code, 107 currency_code, 108 company_proof, 109 gdpr_verified, 110 gdpr_verified_at, 111 ssl_status, 112 http_headers, 113 recapture_count, 114 recapture_at, 115 state, 116 locale_data, 117 retry_count, 118 last_retry_at, 119 assets_captured_at, 120 form_fill_data, 121 conversation_status, 122 resulted_in_sale, 123 sale_amount, 124 NULL -- key_pages_html: no existing data to migrate 125 FROM sites_old; 126 127 DROP TABLE sites_old; 128 129 -- Recreate indexes (same as migration 097) 130 CREATE INDEX IF NOT EXISTS idx_sites_domain ON sites(domain); 131 CREATE INDEX IF NOT EXISTS idx_sites_keyword ON sites(keyword); 132 CREATE INDEX IF NOT EXISTS idx_sites_status ON sites(status); 133 CREATE INDEX IF NOT EXISTS idx_sites_score ON sites(score); 134 CREATE INDEX IF NOT EXISTS idx_sites_last_outreach ON sites(last_outreach_at); 135 CREATE INDEX IF NOT EXISTS idx_sites_country ON sites(country_code); 136 CREATE INDEX IF NOT EXISTS idx_sites_gdpr_verified ON sites(gdpr_verified) WHERE gdpr_verified IS NOT NULL; 137 CREATE INDEX IF NOT EXISTS idx_sites_country_gdpr ON sites(country_code, gdpr_verified) WHERE country_code IN ( 138 'DE', 'FR', 'IT', 'ES', 'NL', 'BE', 'AT', 'SE', 'DK', 'NO', 'IE', 'UK', 'PL' 139 ); 140 CREATE INDEX IF NOT EXISTS idx_sites_recapture_ready ON sites(recapture_at) WHERE recapture_at IS NOT NULL; 141 CREATE INDEX IF NOT EXISTS idx_sites_recapture_count ON sites(recapture_count) WHERE recapture_count > 0; 142 CREATE INDEX IF NOT EXISTS idx_sites_retry_count ON sites(retry_count) WHERE retry_count > 0; 143 CREATE INDEX IF NOT EXISTS idx_sites_failing ON sites(status) WHERE status = 'failing'; 144 CREATE INDEX IF NOT EXISTS idx_sites_high_score ON sites(status) WHERE status = 'high_score'; 145 CREATE INDEX IF NOT EXISTS idx_sites_ignored ON sites(status) WHERE status = 'ignored'; 146 CREATE INDEX IF NOT EXISTS idx_sites_conversation_status ON sites(conversation_status) WHERE conversation_status IS NOT NULL; 147 CREATE INDEX IF NOT EXISTS idx_sites_enriched_regex ON sites(status) WHERE status = 'enriched_regex'; 148 CREATE INDEX IF NOT EXISTS idx_sites_key_pages_html ON sites(id) WHERE key_pages_html IS NOT NULL; 149 150 -- Recreate trigger 151 CREATE TRIGGER IF NOT EXISTS log_site_status_change 152 AFTER UPDATE OF status ON sites 153 FOR EACH ROW 154 WHEN NEW.status != OLD.status 155 BEGIN 156 INSERT INTO site_status (site_id, status, error_message, created_at) 157 VALUES (NEW.id, NEW.status, NEW.error_message, CURRENT_TIMESTAMP); 158 END; 159 160 -- ============================================================ 161 -- SITE_STATUS TABLE (history log) — add new status values 162 -- ============================================================ 163 164 ALTER TABLE site_status RENAME TO site_status_old; 165 166 CREATE TABLE site_status ( 167 id INTEGER PRIMARY KEY AUTOINCREMENT, 168 site_id INTEGER NOT NULL REFERENCES sites(id) ON DELETE CASCADE, 169 status TEXT NOT NULL CHECK(status IN ( 170 'found', 'assets_captured', 'prog_scored', 'semantic_scored', 'vision_scored', 171 'enriched_regex', 'enriched_llm', 'enriched', 172 'proposals_drafted', 'outreach_partial', 'outreach_sent', 173 'ignored', 'failing', 'high_score', 174 -- legacy values (sites/site_status rows pre-migration-097) 175 'scored', 'rescored', 'ignore' 176 )), 177 error_message TEXT, 178 created_at DATETIME DEFAULT CURRENT_TIMESTAMP 179 ); 180 181 INSERT INTO site_status SELECT * FROM site_status_old; 182 183 DROP TABLE site_status_old; 184 185 CREATE INDEX IF NOT EXISTS idx_site_status_site_id ON site_status(site_id); 186 CREATE INDEX IF NOT EXISTS idx_site_status_created_at ON site_status(created_at); 187 CREATE INDEX IF NOT EXISTS idx_site_status_site_created ON site_status(site_id, created_at); 188 CREATE INDEX IF NOT EXISTS idx_site_status_status ON site_status(status); 189 190 PRAGMA foreign_keys = ON;