/ db / migrations / 098-enriched-status-split.sql
098-enriched-status-split.sql
  1  -- Migration 098: Split 'enriched' status into 'enriched_regex' / 'enriched_llm',
  2  --               and add key_pages_html column for storing rendered HTML from
  3  --               contact/about pages during the browser enrichment pass.
  4  --
  5  -- Status flow change:
  6  --   semantic_scored → (pipeline browser pass) → enriched_regex
  7  --                   → (orchestrator LLM pass)  → enriched_llm
  8  --
  9  -- key_pages_html: JSON object mapping page URL → rendered HTML captured during
 10  --   the browser pass. Used by the enrich_sites orchestrator batch. Cleared
 11  --   (set to NULL) once enriched_llm is complete to reclaim disk space.
 12  --
 13  -- 'enriched' is kept in the CHECK constraint so existing rows remain valid
 14  -- and any sites that skip the browser pass (already have a contact form) can
 15  -- still be set directly to 'enriched' by the pipeline.
 16  --
 17  -- SQLite CHECK constraints cannot be modified in place; requires table recreation.
 18  
 19  PRAGMA foreign_keys = OFF;
 20  
 21  -- ============================================================
 22  -- SITES TABLE
 23  -- ============================================================
 24  
 25  ALTER TABLE sites RENAME TO sites_old;
 26  
 27  CREATE TABLE sites (
 28      id INTEGER PRIMARY KEY AUTOINCREMENT,
 29      domain TEXT NOT NULL,
 30      landing_page_url TEXT NOT NULL,
 31      keyword TEXT NOT NULL,
 32      screenshot_path TEXT,
 33      html_dom TEXT,
 34      http_status_code INTEGER,
 35      score_json TEXT,
 36      score REAL,
 37      grade TEXT,
 38      scored_at DATETIME,
 39      rescored_at DATETIME,
 40      enriched_at DATETIME,
 41      competitor_domain TEXT,
 42      contacts_json TEXT,
 43      city TEXT,
 44      country_code TEXT,
 45      last_outreach_at DATETIME,
 46      status TEXT DEFAULT 'found' CHECK(status IN (
 47          'found', 'assets_captured', 'prog_scored', 'semantic_scored', 'vision_scored',
 48          'enriched_regex', 'enriched_llm', 'enriched',
 49          'proposals_drafted', 'outreach_partial', 'outreach_sent',
 50          'ignored', 'failing', 'high_score',
 51          -- legacy values kept for rows that survived migration 097 not running
 52          'scored', 'rescored', 'ignore'
 53      )),
 54      error_message TEXT,
 55      created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
 56      updated_at DATETIME DEFAULT CURRENT_TIMESTAMP,
 57      google_domain TEXT,
 58      language_code TEXT,
 59      currency_code TEXT,
 60      company_proof TEXT DEFAULT NULL,
 61      gdpr_verified BOOLEAN DEFAULT NULL,
 62      gdpr_verified_at TIMESTAMP DEFAULT NULL,
 63      ssl_status TEXT CHECK(ssl_status IN ('https', 'http', 'mixed', 'error')),
 64      http_headers TEXT,
 65      recapture_count INTEGER DEFAULT 0,
 66      recapture_at DATETIME,
 67      state TEXT,
 68      locale_data TEXT,
 69      retry_count INTEGER DEFAULT 0,
 70      last_retry_at DATETIME,
 71      assets_captured_at DATETIME,
 72      form_fill_data TEXT,
 73      conversation_status TEXT DEFAULT NULL CHECK(conversation_status IN (
 74          'draft', 'active', 'qualified', 'payment_requested', 'paid',
 75          'report_delivered', 'not_interested', 'closed', 'unsubscribed'
 76      )),
 77      resulted_in_sale INTEGER DEFAULT 0,
 78      sale_amount REAL DEFAULT 0,
 79      key_pages_html TEXT DEFAULT NULL
 80  );
 81  
 82  INSERT INTO sites SELECT
 83      id,
 84      domain,
 85      landing_page_url,
 86      keyword,
 87      screenshot_path,
 88      html_dom,
 89      http_status_code,
 90      score_json,
 91      score,
 92      grade,
 93      scored_at,
 94      rescored_at,
 95      enriched_at,
 96      competitor_domain,
 97      contacts_json,
 98      city,
 99      country_code,
100      last_outreach_at,
101      status,
102      error_message,
103      created_at,
104      updated_at,
105      google_domain,
106      language_code,
107      currency_code,
108      company_proof,
109      gdpr_verified,
110      gdpr_verified_at,
111      ssl_status,
112      http_headers,
113      recapture_count,
114      recapture_at,
115      state,
116      locale_data,
117      retry_count,
118      last_retry_at,
119      assets_captured_at,
120      form_fill_data,
121      conversation_status,
122      resulted_in_sale,
123      sale_amount,
124      NULL  -- key_pages_html: no existing data to migrate
125  FROM sites_old;
126  
127  DROP TABLE sites_old;
128  
129  -- Recreate indexes (same as migration 097)
130  CREATE INDEX IF NOT EXISTS idx_sites_domain ON sites(domain);
131  CREATE INDEX IF NOT EXISTS idx_sites_keyword ON sites(keyword);
132  CREATE INDEX IF NOT EXISTS idx_sites_status ON sites(status);
133  CREATE INDEX IF NOT EXISTS idx_sites_score ON sites(score);
134  CREATE INDEX IF NOT EXISTS idx_sites_last_outreach ON sites(last_outreach_at);
135  CREATE INDEX IF NOT EXISTS idx_sites_country ON sites(country_code);
136  CREATE INDEX IF NOT EXISTS idx_sites_gdpr_verified ON sites(gdpr_verified) WHERE gdpr_verified IS NOT NULL;
137  CREATE INDEX IF NOT EXISTS idx_sites_country_gdpr ON sites(country_code, gdpr_verified) WHERE country_code IN (
138      'DE', 'FR', 'IT', 'ES', 'NL', 'BE', 'AT', 'SE', 'DK', 'NO', 'IE', 'UK', 'PL'
139  );
140  CREATE INDEX IF NOT EXISTS idx_sites_recapture_ready ON sites(recapture_at) WHERE recapture_at IS NOT NULL;
141  CREATE INDEX IF NOT EXISTS idx_sites_recapture_count ON sites(recapture_count) WHERE recapture_count > 0;
142  CREATE INDEX IF NOT EXISTS idx_sites_retry_count ON sites(retry_count) WHERE retry_count > 0;
143  CREATE INDEX IF NOT EXISTS idx_sites_failing ON sites(status) WHERE status = 'failing';
144  CREATE INDEX IF NOT EXISTS idx_sites_high_score ON sites(status) WHERE status = 'high_score';
145  CREATE INDEX IF NOT EXISTS idx_sites_ignored ON sites(status) WHERE status = 'ignored';
146  CREATE INDEX IF NOT EXISTS idx_sites_conversation_status ON sites(conversation_status) WHERE conversation_status IS NOT NULL;
147  CREATE INDEX IF NOT EXISTS idx_sites_enriched_regex ON sites(status) WHERE status = 'enriched_regex';
148  CREATE INDEX IF NOT EXISTS idx_sites_key_pages_html ON sites(id) WHERE key_pages_html IS NOT NULL;
149  
150  -- Recreate trigger
151  CREATE TRIGGER IF NOT EXISTS log_site_status_change
152  AFTER UPDATE OF status ON sites
153  FOR EACH ROW
154  WHEN NEW.status != OLD.status
155  BEGIN
156      INSERT INTO site_status (site_id, status, error_message, created_at)
157      VALUES (NEW.id, NEW.status, NEW.error_message, CURRENT_TIMESTAMP);
158  END;
159  
160  -- ============================================================
161  -- SITE_STATUS TABLE (history log) — add new status values
162  -- ============================================================
163  
164  ALTER TABLE site_status RENAME TO site_status_old;
165  
166  CREATE TABLE site_status (
167      id INTEGER PRIMARY KEY AUTOINCREMENT,
168      site_id INTEGER NOT NULL REFERENCES sites(id) ON DELETE CASCADE,
169      status TEXT NOT NULL CHECK(status IN (
170          'found', 'assets_captured', 'prog_scored', 'semantic_scored', 'vision_scored',
171          'enriched_regex', 'enriched_llm', 'enriched',
172          'proposals_drafted', 'outreach_partial', 'outreach_sent',
173          'ignored', 'failing', 'high_score',
174          -- legacy values (sites/site_status rows pre-migration-097)
175          'scored', 'rescored', 'ignore'
176      )),
177      error_message TEXT,
178      created_at DATETIME DEFAULT CURRENT_TIMESTAMP
179  );
180  
181  INSERT INTO site_status SELECT * FROM site_status_old;
182  
183  DROP TABLE site_status_old;
184  
185  CREATE INDEX IF NOT EXISTS idx_site_status_site_id ON site_status(site_id);
186  CREATE INDEX IF NOT EXISTS idx_site_status_created_at ON site_status(created_at);
187  CREATE INDEX IF NOT EXISTS idx_site_status_site_created ON site_status(site_id, created_at);
188  CREATE INDEX IF NOT EXISTS idx_site_status_status ON site_status(status);
189  
190  PRAGMA foreign_keys = ON;