// src/stages/gather-evidence.js
  1  /**
  2   * Evidence Gathering Stage
  3   *
  4   * Performs two-pass evidence collection on enriched sites before proposal generation:
  5   *   Pass 1 (unbriefed): fetches live site, extracts HTML + inline CSS, calls Haiku to
  6   *            identify specific evidence for each CRO factor with CSS selectors.
  7   *   Pass 2 (briefed): same fetch, but provides score_json as context so the LLM
  8   *            confirms/challenges existing scores rather than free-ranging.
  9   *
 10   * Results stored in:
 11   *   sites.evidence_pass1_json — raw JSON from unbriefed pass
 12   *   sites.evidence_pass2_json — raw JSON from briefed pass
 13   *
 14   * After both passes are stored, the evidence_merge orchestrator batch picks up the site
 15   * and merges them into sites.evidence_json (revised scores + evidence_summary for proposals).
 16   *
 17   * Pipeline gate: proposals_email/proposals_sms query requires evidence_json IS NOT NULL.
 18   */
 19  
 20  import { run, getAll } from '../utils/db.js';
 21  import { readFileSync } from 'fs';
 22  import { join, dirname } from 'path';
 23  import { fileURLToPath } from 'url';
 24  import Logger from '../utils/logger.js';
 25  import { processBatch } from '../utils/error-handler.js';
 26  import { callLLM } from '../utils/llm-provider.js';
 27  import { openRouterLimiter } from '../utils/rate-limiter.js';
 28  import { openRouterBreaker } from '../utils/circuit-breaker.js';
 29  import { safeJsonParse } from '../utils/error-handler.js';
 30  import { getAdaptiveConcurrencyFast } from '../utils/adaptive-concurrency.js';
 31  import { sanitizeHtmlForPrompt } from '../utils/llm-sanitizer.js';
 32  import { getScoreJsonWithFallback } from '../utils/score-storage.js';
 33  import '../utils/load-env.js';
 34  
// Resolve this module's directory so the prompt template can be located
// relative to the project root, regardless of the process's cwd.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
const projectRoot = join(__dirname, '../..');

const logger = new Logger('GatherEvidence');
// Prompt template with {{placeholder}} slots, read once at module load.
// readFileSync throws at import time if the file is missing — fail fast.
const EVIDENCE_PROMPT = readFileSync(join(projectRoot, 'prompts/EVIDENCE-COLLECT.md'), 'utf-8');

// Max sites pulled per run (env-tunable).
const BATCH_SIZE = parseInt(process.env.EVIDENCE_BATCH_SIZE || '8', 10);
// LLM model id: explicit override, then shared Haiku env var, then default.
const MODEL = process.env.EVIDENCE_MODEL || process.env.CLAUDE_HAIKU_MODEL || 'anthropic/claude-haiku-4-5';
// Truncation caps for content interpolated into the prompt (characters, not tokens).
const HTML_TRUNCATE = 50_000; // chars fed to LLM
const CSS_TRUNCATE = 15_000;
 46  
 47  /**
 48   * Extract inline CSS from raw HTML (style tags + first 200 chars of linked stylesheets src attrs)
 49   */
 50  function extractInlineCss(html) {
 51    const styleMatches = html.match(/<style[^>]*>([\s\S]*?)<\/style>/gi) || [];
 52    return styleMatches
 53      .map(block => block.replace(/<\/?style[^>]*>/gi, ''))
 54      .join('\n')
 55      .slice(0, CSS_TRUNCATE);
 56  }
 57  
 58  /**
 59   * Validate that a URL is a safe external HTTP/HTTPS target.
 60   * Guards against SSRF from poisoned DB entries.
 61   */
 62  function isSafeUrl(urlStr) {
 63    try {
 64      const u = new URL(urlStr);
 65      if (u.protocol !== 'http:' && u.protocol !== 'https:') return false;
 66      const h = u.hostname;
 67      // Block localhost, loopback, link-local, and private RFC1918 ranges
 68      if (h === 'localhost' || h === '127.0.0.1' || h === '::1') return false;
 69      if (/^169\.254\./.test(h)) return false;
 70      if (/^10\./.test(h) || /^192\.168\./.test(h)) return false;
 71      if (/^172\.(1[6-9]|2\d|3[01])\./.test(h)) return false;
 72      return true;
 73    } catch {
 74      return false;
 75    }
 76  }
 77  
 78  /**
 79   * Fetch page HTML using Node's built-in fetch with browser-like headers.
 80   * Returns null on failure rather than throwing.
 81   */
 82  async function fetchPageHtml(url) {
 83    if (!isSafeUrl(url)) {
 84      logger.warn(`Skipping unsafe URL: ${url}`);
 85      return null;
 86    }
 87    try {
 88      const res = await fetch(url, {
 89        headers: {
 90          'User-Agent':
 91            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
 92          Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
 93          'Accept-Language': 'en-US,en;q=0.9',
 94          'Accept-Encoding': 'gzip, deflate, br',
 95          'Cache-Control': 'no-cache',
 96        },
 97        signal: AbortSignal.timeout(15_000),
 98      });
 99      if (!res.ok) return null;
100      return await res.text();
101    } catch {
102      return null;
103    }
104  }
105  
/**
 * Build the LLM prompt for one evidence pass by filling the template's
 * {{placeholder}} slots.
 *
 * Fix: the previous version used `String.prototype.replace` with a plain
 * string, which (a) only substitutes the FIRST occurrence of each
 * placeholder — a template referencing {{domain}} twice kept the literal
 * token — and (b) interprets `$&`, `$'`, `$$` patterns in the replacement
 * text; page HTML routinely contains `$` sequences, silently corrupting
 * the prompt. `replaceAll` with a function replacer fixes both.
 *
 * @param {string} domain
 * @param {string} url
 * @param {?string} scoreJson - prior score JSON; only used for the briefed pass
 * @param {?string} homepageHtml - raw page HTML (null → "(unavailable)")
 * @param {?string} cssContent - extracted inline CSS (null → "(unavailable)")
 * @param {'unbriefed'|'briefed'} pass
 * @returns {string} the filled prompt
 */
function buildPrompt(domain, url, scoreJson, homepageHtml, cssContent, pass) {
  const truncatedHtml = homepageHtml ? homepageHtml.slice(0, HTML_TRUNCATE) : '(unavailable)';
  const truncatedCss = cssContent ? cssContent.slice(0, CSS_TRUNCATE) : '(unavailable)';

  const scoreContext =
    pass === 'briefed' && scoreJson
      ? JSON.stringify(safeJsonParse(scoreJson) || {}, null, 2).slice(0, 8000)
      : '(not provided for this pass)';

  // Function replacer: the value is inserted verbatim, every occurrence replaced.
  const fill = (tpl, key, value) => tpl.replaceAll(`{{${key}}}`, () => value);

  let prompt = EVIDENCE_PROMPT;
  prompt = fill(prompt, 'domain', sanitizeHtmlForPrompt(domain));
  prompt = fill(prompt, 'url', sanitizeHtmlForPrompt(url));
  prompt = fill(prompt, 'score_json', scoreContext);
  prompt = fill(prompt, 'homepage_html', sanitizeHtmlForPrompt(truncatedHtml));
  prompt = fill(prompt, 'css_content', sanitizeHtmlForPrompt(truncatedCss));
  return prompt;
}
124  
/**
 * Execute a single evidence-collection pass for one site.
 *
 * Fetches the live homepage, builds the pass-specific prompt (the 'briefed'
 * pass additionally supplies stored score JSON as context), then sends it to
 * the LLM through the shared rate limiter and circuit breaker.
 *
 * @param {object} site - row with id, domain, landing_page_url
 * @param {'unbriefed'|'briefed'} pass
 * @returns {Promise<object|null>} parsed evidence JSON, or null when the page
 *   could not be fetched or the model response was not valid JSON
 */
async function runEvidencePass(site, pass) {
  const html = await fetchPageHtml(site.landing_page_url);
  if (!html) {
    logger.warn(`[${site.domain}] Could not fetch HTML for evidence pass (${pass})`);
    return null;
  }

  const briefed = pass === 'briefed';
  const prompt = buildPrompt(
    site.domain,
    site.landing_page_url,
    briefed ? getScoreJsonWithFallback(site.id, site) : null,
    html,
    extractInlineCss(html),
    pass
  );

  // Rate limiter wraps the breaker wraps the call: throttle first, then
  // short-circuit if the provider is unhealthy.
  const response = await openRouterLimiter.schedule(() =>
    openRouterBreaker.fire(() =>
      callLLM({
        model: MODEL,
        messages: [{ role: 'user', content: prompt }],
        max_tokens: 4096,
        temperature: 0,
        stage: 'other',
      })
    )
  );

  const parsed = safeJsonParse(response?.content);
  if (parsed) {
    return parsed;
  }
  logger.warn(`[${site.domain}] Evidence pass (${pass}) returned non-JSON response`);
  return null;
}
167  
/**
 * Process a single site through both evidence passes (unbriefed, then briefed).
 *
 * Each pass is skipped when its column is already populated, so re-running on
 * a partially-processed site is safe. If an UPDATE affects zero rows the site
 * row has vanished underneath us and we abort early.
 *
 * Fix: removed the dead `pass1` local — it was assigned (including a wasted
 * safeJsonParse of the stored column in the skip branch) but never read.
 *
 * @param {{id: *, domain: string, landing_page_url: string,
 *          evidence_pass1_json: ?string, evidence_pass2_json: ?string}} site
 */
async function gatherEvidenceForSite(site) {
  logger.info(`[${site.domain}] Starting evidence collection`);

  // Pass 1: unbriefed — the LLM sees only the raw page, no prior scores.
  if (!site.evidence_pass1_json) {
    const pass1 = await runEvidencePass(site, 'unbriefed');
    if (pass1) {
      const { changes } = await run(
        'UPDATE sites SET evidence_pass1_json = $1, updated_at = NOW() WHERE id = $2',
        [JSON.stringify(pass1), site.id]
      );
      if (changes === 0) {
        logger.error(`[${site.domain}] Pass 1 UPDATE affected 0 rows — site id=${site.id} may have been deleted`);
        return;
      }
      logger.info(`[${site.domain}] Pass 1 (unbriefed) stored`);
    }
  } else {
    logger.debug(`[${site.domain}] Pass 1 already exists — skipping`);
  }

  // Pass 2: briefed (provides score_json context)
  if (!site.evidence_pass2_json) {
    const pass2 = await runEvidencePass(site, 'briefed');
    if (pass2) {
      const { changes } = await run(
        'UPDATE sites SET evidence_pass2_json = $1, updated_at = NOW() WHERE id = $2',
        [JSON.stringify(pass2), site.id]
      );
      if (changes === 0) {
        logger.error(`[${site.domain}] Pass 2 UPDATE affected 0 rows — site id=${site.id} may have been deleted`);
        return;
      }
      logger.info(`[${site.domain}] Pass 2 (briefed) stored`);
    }
  } else {
    logger.debug(`[${site.domain}] Pass 2 already exists — skipping`);
  }

  logger.info(`[${site.domain}] Evidence collection complete`);
}
214  
/**
 * Stage entry point: select enriched sites missing either evidence pass and
 * run both passes for each, with adaptive concurrency.
 *
 * Fix: the SELECT was maintained as two near-identical string literals
 * (with/without the blocked-country filter) — a drift hazard, especially for
 * the positional parameter indices. The query is now composed once, with the
 * country filter appended conditionally and `$n` indices derived from the
 * params array length.
 *
 * @throws rethrows any unexpected failure after logging it
 */
export async function gatherEvidence() {
  try {
    // Skip sites in blocked countries — no proposals will ever be sent to them
    const blockedCountries = (process.env.OUTREACH_BLOCKED_COUNTRIES || '')
      .split(',')
      .map((s) => s.trim().toUpperCase())
      .filter(Boolean);

    // Compose the query once; parameter indices track the params array.
    const sitesParams = [];
    let countryFilter = '';
    if (blockedCountries.length > 0) {
      sitesParams.push(blockedCountries);
      countryFilter = `AND UPPER(country_code) != ALL($${sitesParams.length}::text[])`;
    }
    sitesParams.push(BATCH_SIZE);

    // Pick enriched sites missing either evidence pass, worst scores first.
    const sitesQuery = `
      SELECT id, domain, landing_page_url,
             evidence_pass1_json, evidence_pass2_json
      FROM sites
      WHERE status IN ('enriched', 'enriched_llm')
        AND score IS NOT NULL
        AND score < 82
        AND (evidence_pass1_json IS NULL OR evidence_pass2_json IS NULL)
        ${countryFilter}
      ORDER BY score ASC
      LIMIT $${sitesParams.length}
    `;

    const sites = await getAll(sitesQuery, sitesParams);

    if (sites.length === 0) {
      logger.info('No sites pending evidence collection');
      return;
    }

    logger.info(`Gathering evidence for ${sites.length} sites`);

    const concurrency = getAdaptiveConcurrencyFast(2, parseInt(process.env.EVIDENCE_CONCURRENCY || '8', 10));

    await processBatch(sites, site => gatherEvidenceForSite(site), {
      concurrency,
      label: 'evidence',
      logger,
    });

    logger.success(`Evidence collection complete for ${sites.length} sites`);
  } catch (err) {
    logger.error('Evidence gathering failed', err);
    throw err;
  }
}
277  
// CLI entry point: run the stage when this file is executed directly
// (`node src/stages/gather-evidence.js`), but not when imported.
const invokedDirectly = process.argv[1] === fileURLToPath(import.meta.url);
if (invokedDirectly) {
  gatherEvidence().catch((err) => {
    logger.error('Evidence gathering failed', err);
    process.exit(1);
  });
}