Cradicle Explorer

/ src / stages / scoring.js
scoring.js
  1  /**
  2   * Scoring Stage
  3   * Initial AI-powered conversion scoring for sites
  4   */
  5  
  6  import { run, getOne, getAll, query, withTransaction } from '../utils/db.js';
  7  import { scoreWebsite } from '../score.js';
  8  import { normaliseCountryCode } from '../config/countries.js';
  9  import Logger from '../utils/logger.js';
 10  import { generateStageCompletion, displayProgress } from '../utils/summary-generator.js';
 11  import { processBatch } from '../utils/error-handler.js';
 12  import { loadScreenshot } from '../utils/screenshot-storage.js';
 13  import { checkBlocklist, classifyIndustry } from '../utils/site-filters.js';
 14  import { incrementLowScoring } from '../utils/keyword-counters.js';
 15  import { recordFailure, resetRetries } from '../utils/retry-handler.js';
 16  import { scoreWebsiteProgrammatically } from '../utils/programmatic-scorer.js';
 17  import { readHtmlDom } from '../utils/html-storage.js';
 18  import { setScoreJson } from '../utils/score-storage.js';
 19  import { setContactsJson } from '../utils/contacts-storage.js';
 20  
 21  const logger = new Logger('Scoring');
 22  const success = (...args) => logger.success(...args);
 23  const info = (...args) => logger.info(...args);
 24  const error = (...args) => logger.error(...args);
 25  
 26  const ENGLISH_ONLY_MARKETS = process.env.ENGLISH_ONLY_MARKETS
 27    ? process.env.ENGLISH_ONLY_MARKETS.split(',').map(c => c.trim().toUpperCase()).filter(Boolean)
 28    : null;
 29  
 30  /**
 31   * Run the scoring stage
 32   * @param {Object} options - Stage options
 33   * @param {number} options.limit - Limit number of sites to score
 34   * @param {number} options.concurrency - Number of concurrent scoring operations (default: 10)
 35   * @returns {Promise<Object>} Stage results
 36   */
 37  export async function runScoringStage(options = {}) {
 38    const startTime = Date.now();
 39    const concurrency = options.concurrency || parseInt(process.env.SCORING_CONCURRENCY || '10', 10);
 40  
 41    info('Starting Scoring Stage...');
 42  
 43    // Check ENABLE_VISION flag
 44    const ENABLE_VISION = process.env.ENABLE_VISION !== 'false';
 45  
 46    // Show deprecation warning if old flags are used
 47    const legacyFlags = [
 48      process.env.USE_COMPUTER_VISION_SCORING,
 49      process.env.USE_COMPUTER_VISION_RESCORING,
 50      process.env.USE_COMPUTER_VISION_ENRICHMENT,
 51    ];
 52    if (legacyFlags.some(flag => flag !== undefined)) {
 53      logger.warn(
 54        '[scoring] WARN: Vision flags (USE_COMPUTER_VISION_*) are deprecated. Use ENABLE_VISION instead.'
 55      );
 56    }
 57  
 58    const modeLabel = ENABLE_VISION ? 'vision enabled' : 'HTML-only (vision disabled)';
 59    info(`[scoring] Mode: ${modeLabel}`);
 60  
 61    // Orchestrator scoring mode: when ENABLE_VISION=false and ENABLE_LLM_SCORING=true,
 62    // the orchestrator's score_sites batch handles LLM scoring via Claude Max (claude -p)
 63    // at zero API cost. The pipeline stage is bypassed — orchestrator advances sites to 'scored'.
 64    const ENABLE_LLM_SCORING = process.env.ENABLE_LLM_SCORING !== 'false';
 65    if (ENABLE_LLM_SCORING && !ENABLE_VISION) {
 66      info(
 67        '[scoring] ORCHESTRATOR MODE — score_sites batch will handle LLM scoring via Claude Max. Pipeline stage skipped.'
 68      );
 69      return {
 70        processed: 0,
 71        succeeded: 0,
 72        failed: 0,
 73        skipped: 0,
 74        duration: Date.now() - startTime,
 75        mode: 'orchestrator',
 76      };
 77    }
 78  
 79    try {
 80      // Get sites that need initial scoring.
 81      // score IS NULL is the precise condition — scoring always sets score before
 82      // advancing status to 'prog_scored', so assets_captured + score IS NULL = not yet scored
 83      // (also covers retry cases: scoring errors keep status='assets_captured', score=NULL).
 84      // Avoids retrying asset-capture errors (those leave sites at 'found', not 'assets_captured').
 85      // Only check IS NOT NULL — can satisfy this without loading the html_dom value,
 86      // which avoids blocking on reading data from 3000+ rows.
 87      // The scoring function itself fetches html_dom per-site and skips empty/placeholder values.
 88      // Also respect recapture_at: broken-site retries are delayed 7 days, don't re-queue early.
 89      const englishParams = ENGLISH_ONLY_MARKETS || [];
 90      const englishFilter = ENGLISH_ONLY_MARKETS
 91        ? `AND country_code = ANY($${1}::text[])`
 92        : '';
 93  
 94      const sitesQuery = `
 95        SELECT id, domain, landing_page_url as url, country_code
 96        FROM sites
 97        WHERE status = 'assets_captured'
 98          AND html_dom IS NOT NULL
 99          AND score IS NULL
100          AND (recapture_at IS NULL OR recapture_at <= CURRENT_TIMESTAMP)
101          ${englishFilter}
102        ORDER BY CASE WHEN country_code IN ('AU','CA','GB','IE','IN','NZ','US','ZA') THEN 0 ELSE 1 END ASC
103        ${options.limit ? `LIMIT ${options.limit}` : ''}
104      `;
105  
106      const sitesParams = ENGLISH_ONLY_MARKETS ? [ENGLISH_ONLY_MARKETS] : [];
107      const sites = await getAll(sitesQuery, sitesParams.length ? sitesParams : undefined);
108  
109      // Auto-promote assets_captured sites that already have scores → semantic_scored.
110      // Happens when assets re-captures html_dom for a previously-scored site (e.g. re-enrichment
111      // pass). Scoring skips them (score IS NOT NULL), so we promote here to unblock enrich.
112      const { changes: alreadyScoredPromoted } = await run(
113        `UPDATE sites
114         SET status = 'semantic_scored',
115             rescored_at = NOW(),
116             updated_at = NOW()
117         WHERE status = 'assets_captured'
118           AND score IS NOT NULL
119           AND html_dom IS NOT NULL`
120      );
121      if (alreadyScoredPromoted > 0) {
122        info(
123          `Auto-promoted ${alreadyScoredPromoted} already-scored assets_captured → semantic_scored (re-enrichment pass)`
124        );
125      }
126  
127      if (sites.length === 0) {
128        info('No sites need initial scoring');
129        return {
130          processed: alreadyScoredPromoted,
131          succeeded: alreadyScoredPromoted,
132          failed: 0,
133          skipped: 0,
134          duration: Date.now() - startTime,
135        };
136      }
137  
138      // Filter out blocklisted sites (directories/social media/franchises)
139      let ignoredCount = 0;
140      for (const site of sites) {
141        const blocked = checkBlocklist(site.domain, site.country_code);
142        if (blocked) {
143          await run(
144            `UPDATE sites SET status = 'ignored', error_message = $1 WHERE id = $2`,
145            [blocked.reason, site.id]
146          );
147          ignoredCount++;
148        }
149      }
150  
151      if (ignoredCount > 0) {
152        info(`Marked ${ignoredCount} sites as ignored (directories/social media/franchises)`);
153      }
154  
155      // Filter out legal/regulated industry sites by domain keyword (pre-LLM, zero cost)
156      let industryIgnoredCount = 0;
157      for (const site of sites) {
158        const industry = classifyIndustry(site.domain);
159        if (industry) {
160          const reason =
161            industry.type === 'legal'
162              ? `Ignored: Legal site detected from domain (${industry.reason})`
163              : `Ignored: Regulated industry (${industry.type}) detected from domain (${industry.reason})`;
164          await run(
165            `UPDATE sites SET status = 'ignored', error_message = $1 WHERE id = $2`,
166            [reason, site.id]
167          );
168          industryIgnoredCount++;
169        }
170      }
171  
172      if (industryIgnoredCount > 0) {
173        info(
174          `Marked ${industryIgnoredCount} sites as ignored (legal/regulated industry by domain keyword)`
175        );
176      }
177  
178      info(`Scoring ${sites.length} sites (concurrency: ${concurrency})`);
179  
180      const stats = {
181        processed: 0,
182        succeeded: 0,
183        failed: 0,
184        skipped: 0,
185        gradeDistribution: {},
186      };
187  
188      // Process sites in batches
189      const { results, errors } = await processBatch(
190        sites,
191        (site, index) => {
192          displayProgress(index + 1, sites.length, `Scoring ${site.url}`);
193          return scoreSite(site.id, site.url);
194        },
195        { concurrency }
196      );
197  
198      // Count successes and failures
199      stats.processed = sites.length;
200      stats.succeeded = results.filter(r => r !== null).length;
201      stats.failed = errors.length;
202  
203      // Calculate grade distribution
204      const grades = await getAll(
205        `SELECT grade, COUNT(*) as count
206         FROM sites
207         WHERE grade IS NOT NULL
208         GROUP BY grade`
209      );
210  
211      for (const g of grades) {
212        stats.gradeDistribution[g.grade] = g.count;
213      }
214  
215      // Log errors
216      for (const err of errors) {
217        error(`  Scoring error: ${err.message || err}`);
218      }
219  
220      stats.duration = Date.now() - startTime;
221      generateStageCompletion('Scoring', stats);
222  
223      return stats;
224    } catch (err) {
225      error(`Scoring stage failed: ${err.message}`);
226      throw err;
227    }
228  }
229  
/**
 * Persist the scoring payload and mark a site as ignored.
 * @param {number} siteId - Site ID
 * @param {Object} result - Scoring result to store via setScoreJson
 * @param {string} reason - Text written to sites.error_message
 * @returns {Promise<void>}
 */
async function markSiteIgnored(siteId, result, reason) {
  setScoreJson(siteId, result);
  await run(`UPDATE sites SET status = 'ignored', error_message = $1 WHERE id = $2`, [
    reason,
    siteId,
  ]);
}

/**
 * Wrap programmatic scorer output in the same structure an LLM scoring result has,
 * so the rest of scoreSite can treat both the same way.
 * @param {Object} progResult - Return value of scoreWebsiteProgrammatically
 * @returns {Object} Result with overall_calculation, factor_scores and contact_details
 */
function wrapProgrammaticResult(progResult) {
  const contacts = progResult.contacts;

  return {
    scoring_method: 'programmatic',
    overall_calculation: {
      conversion_score: progResult.conversion_score,
      letter_grade: progResult.letter_grade,
      is_error_page: progResult.is_error_page,
      is_broken_site: progResult.is_broken_site,
      is_business_directory: progResult.is_business_directory,
      is_local_business: progResult.is_local_business,
      is_law_firm: progResult.is_law_firm,
      industry_classification: progResult.industry_classification,
      country_code: progResult.country_code,
      city: progResult.city,
      state: progResult.state,
    },
    factor_scores: progResult.factor_scores,
    contact_details: contacts
      ? {
          contacts: [
            // Missing lists are treated as empty rather than throwing
            ...(contacts.email_addresses ?? []).map(e => ({
              type: 'email',
              uri: e.email,
              source: e.source,
            })),
            ...(contacts.phone_numbers ?? []).map(p => ({
              type: 'sms',
              uri: p.number,
              source: p.source,
            })),
            ...(contacts.social_profiles ?? [])
              .filter(s => s.usable)
              .map(s => ({ type: s.label, uri: s.url, source: 'social' })),
          ],
          has_contact_form: contacts.has_contact_form,
        }
      : null,
  };
}

/**
 * Score a single site and persist the outcome.
 *
 * Depending on the scoring result the site is marked 'ignored' (directory,
 * non-local business, law firm, regulated industry, permanent error, too many
 * broken-site recaptures), left at 'assets_captured' for retry (temporary error,
 * broken site), or advanced to 'high_score' / 'prog_scored' / 'semantic_scored'.
 *
 * @param {number} siteId - Site ID
 * @param {string} url - Site URL (used for log messages)
 * @returns {Promise<Object>} Scoring result
 * @throws {Error} Any failure is recorded via recordFailure and then re-thrown
 */
async function scoreSite(siteId, url) {
  try {
    // Get site data (keyword is only needed by the programmatic scorer)
    const site = await getOne(
      `SELECT
        id, landing_page_url as url, screenshot_path, ssl_status, http_headers, locale_data, perf_json, keyword
       FROM sites
       WHERE id = $1`,
      [siteId]
    );

    if (!site) {
      throw new Error(`Site ${siteId} not found`);
    }

    // Load HTML from filesystem (moved out of DB to reduce bloat)
    const htmlDom = readHtmlDom(siteId);

    // Load screenshots from file system (if available).
    // Without a screenshot_path both stay null and scoring uses HTML only.
    let desktopAbove = null;
    let mobileAbove = null;

    if (site.screenshot_path) {
      desktopAbove = await loadScreenshot(site.screenshot_path, 'desktop_above');
      mobileAbove = await loadScreenshot(site.screenshot_path, 'mobile_above');
    }

    // Prepare site data for scoring (match scoreWebsite expectations)
    const siteData = {
      url: site.url,
      domain: new URL(site.url).hostname,
      screenshots: {
        desktop_above: desktopAbove,
        mobile_above: mobileAbove,
      },
      html: htmlDom || '',
      httpHeaders: site.http_headers,
      localeData: site.locale_data, // Locale indicators for country detection
    };

    // Score the site — LLM or programmatic
    const ENABLE_LLM_SCORING = process.env.ENABLE_LLM_SCORING !== 'false';
    let result;

    if (ENABLE_LLM_SCORING) {
      result = await scoreWebsite(siteData, siteId);
    } else {
      // Programmatic scoring — no LLM call
      // Semantic factors (headline, value prop, USP) will be scored by Haiku via
      // the orchestrator's score_semantic batch type in a follow-up pass.
      let perfData = null;
      try {
        perfData = site.perf_json ? JSON.parse(site.perf_json) : null;
      } catch (e) {
        logger.warn(`Failed to parse perf_json for site ${siteId}: ${e.message}`);
      }
      const progResult = scoreWebsiteProgrammatically(htmlDom, site.url, site.keyword, perfData);
      result = wrapProgrammaticResult(progResult);
    }

    const overall = result?.overall_calculation;

    // Extract grade and score from nested structure.
    // `??` (not `||`) so a legitimate score of 0 is stored as 0 instead of NULL.
    const grade = overall?.letter_grade || null;
    const score = overall?.conversion_score ?? null;

    // The classification-based ignores are skipped in test mode
    // (E2E test page is not a real local business).
    const isTestEnv = process.env.NODE_ENV === 'test';

    // Business directory
    if (overall?.is_business_directory && !isTestEnv) {
      await markSiteIgnored(siteId, result, 'Ignored: Business directory (LLM detected)');
      info(`  ${url} detected as directory by LLM - marked as ignored`);
      return result;
    }

    // Non-local business (only an explicit `false` counts)
    if (overall?.is_local_business === false && !isTestEnv) {
      await markSiteIgnored(siteId, result, 'Ignored: Not a local business (LLM detected)');
      info(`  ${url} detected as non-local business by LLM - marked as ignored`);
      return result;
    }

    // Law firm
    if (overall?.is_law_firm && !isTestEnv) {
      await markSiteIgnored(siteId, result, 'Ignored: Law firm (LLM detected)');
      info(`  ${url} detected as law firm by LLM - marked as ignored`);
      return result;
    }

    // Regulated industry (healthcare/financial), matched by substring on the classification
    const industryClassification = overall?.industry_classification?.toLowerCase() || '';
    const regulatedPatterns = [
      'dental',
      'medical',
      'clinic',
      'hospital',
      'pharmacy',
      'veterinary',
      'financial advis',
      'mortgage broker',
      'accounting',
    ];
    const isRegulated = regulatedPatterns.some(p => industryClassification.includes(p));
    if (isRegulated && !isTestEnv) {
      await markSiteIgnored(
        siteId,
        result,
        `Ignored: Regulated industry (${industryClassification})`
      );
      info(
        `  ${url} detected as regulated industry (${industryClassification}) - marked as ignored`
      );
      return result;
    }

    // Error page handling
    const isErrorPage = overall?.is_error_page;
    const errorType = overall?.error_type;
    const errorDescription = overall?.error_description;

    if (isErrorPage && errorType) {
      // Permanent errors (404, 403, 410) - mark as ignored
      const permanentErrors = ['404', '403', '410'];

      if (permanentErrors.includes(errorType)) {
        await markSiteIgnored(
          siteId,
          result,
          errorDescription || `Permanent error: ${errorType}`
        );
        info(`  ${url} detected as ${errorType} error - marked as ignored`);
        return result;
      }

      // Temporary errors (5xx, maintenance, redirect) - keep status unchanged for retry
      const temporaryErrors = ['5xx', 'maintenance', 'redirect'];

      if (temporaryErrors.includes(errorType)) {
        const errorMsg = errorDescription || `Temporary error: ${errorType}`;
        setScoreJson(siteId, result);
        await run(
          `UPDATE sites SET status = 'assets_captured', error_message = $1 WHERE id = $2`,
          [errorMsg, siteId]
        );

        info(`  ${url} detected as ${errorType} error - will retry later`);

        return result;
      }
      // Any other error type falls through and is scored normally.
    }

    // Broken site (visual rendering failure)
    const isBrokenSite = overall?.is_broken_site || false;

    if (isBrokenSite) {
      // [].concat accepts either an array of issues or a single string
      const issues = [].concat(overall?.broken_site_details || []).join(', ');

      // Get current recapture count
      const siteRow = await getOne('SELECT recapture_count FROM sites WHERE id = $1', [siteId]);
      const recaptureCount = (siteRow?.recapture_count || 0) + 1;

      // Check if max retries exceeded (3 attempts)
      if (recaptureCount > 3) {
        await markSiteIgnored(
          siteId,
          result,
          `Max recapture attempts reached (3) - Issues: ${issues}`
        );
        error(`  ${url} broken site detected but max retries exceeded - marked as ignored`);
        return result;
      }

      // Requeue for recapture with 1-week delay
      const errorMsg = `Broken site detected (attempt ${recaptureCount}/3) - Issues: ${issues}`;
      setScoreJson(siteId, result);
      await run(
        `UPDATE sites SET
          status = 'assets_captured',
          error_message = $1,
          recapture_count = $2,
          recapture_at = NOW() + INTERVAL '7 days'
         WHERE id = $3`,
        [errorMsg, recaptureCount, siteId]
      );

      info(
        `  ${url} broken site detected - recapture scheduled for 7 days (attempt ${recaptureCount}/3)`
      );
      info(`      Issues: ${issues}`);

      return result;
    }

    // Extract location information from overall_calculation
    const city = overall?.city || null;
    const countryCode = normaliseCountryCode(overall?.country_code) || null;
    const state = overall?.state || null;

    // Determine status based on score threshold
    // High-scoring sites (> cutoff) are marked as 'high_score' and end their journey
    // Low-scoring sites (<= cutoff) advance to prog_scored (vision on) or semantic_scored (vision off)
    const scoreThreshold = parseInt(process.env.LOW_SCORE_CUTOFF || '82', 10);
    const visionEnabled = process.env.ENABLE_VISION !== 'false';

    let newStatus;
    if (score !== null && score > scoreThreshold) {
      newStatus = 'high_score';
    } else if (visionEnabled) {
      newStatus = 'prog_scored';
    } else {
      // HTML-only mode skips straight to 'semantic_scored'
      newStatus = 'semantic_scored';
    }

    // Score/contacts are written to filesystem; DB no longer stores the blobs.
    setScoreJson(siteId, result);

    // When vision is disabled, scoring includes contact extraction, so save it here.
    // When vision is enabled, contacts are extracted in the rescoring stage.
    if (!visionEnabled && result.contact_details) {
      setContactsJson(siteId, JSON.stringify(result.contact_details));
      info(`  Saved contacts for ${url} (HTML-only mode)`);
    }

    await withTransaction(async (client) => {
      // In an UPDATE, column references in SET expressions see the row's OLD values,
      // so the "is the site becoming semantic_scored?" decision is passed in as $8
      // instead of being read from the status column.
      await client.query(
        `UPDATE sites SET
          grade = $1,
          score = $2,
          city = $3,
          country_code = $4,
          state = $5,
          status = $6,
          scored_at = NOW(),
          rescored_at = CASE WHEN $8::boolean THEN NOW() ELSE rescored_at END,
          error_message = NULL
         WHERE id = $7`,
        [grade, score, city, countryCode, state, newStatus, siteId, newStatus === 'semantic_scored']
      );

      // Reset retry count on successful scoring
      // NOTE(review): resetRetries and incrementLowScoring are not handed `client`,
      // so they may run outside this transaction — confirm against their implementations.
      await resetRetries(siteId);

      // Increment keyword counter if low-scoring (<= cutoff)
      if (score !== null && score <= scoreThreshold) {
        const siteRow = await client.query(
          'SELECT keyword, country_code FROM sites WHERE id = $1',
          [siteId]
        );
        const counterKey = siteRow.rows[0];
        if (counterKey?.keyword && counterKey?.country_code) {
          await incrementLowScoring(counterKey.keyword, counterKey.country_code);
        }
      }
    });

    const statusLabel = newStatus === 'high_score' ? ' [HIGH SCORE - COMPLETE]' : '';
    success(`  Scored ${url}: ${grade} (${score})${statusLabel}`);

    return result;
  } catch (err) {
    // Record failure and increment retry count (marks as 'failing' if limit exceeded)
    await recordFailure(siteId, 'scoring', err, 'assets_captured');
    throw err;
  }
}
572  
/**
 * Get scoring statistics
 *
 * The low-score count uses the same cutoff as the scoring stage
 * (LOW_SCORE_CUTOFF env var, default 82; a score <= cutoff is "low").
 *
 * @returns {Promise<Object>} Aggregate columns (total_sites, scored_sites,
 *   low_score_sites, avg_score, min_score, max_score) plus a gradeDistribution
 *   object mapping grade → count
 */
export async function getScoringStats() {
  const lowScoreCutoff = parseInt(process.env.LOW_SCORE_CUTOFF || '82', 10);

  const stats = await getOne(
    `SELECT
      COUNT(id) as total_sites,
      COUNT(CASE WHEN score IS NOT NULL THEN 1 END) as scored_sites,
      COUNT(CASE WHEN score <= $1 THEN 1 END) as low_score_sites,
      AVG(score) as avg_score,
      MIN(score) as min_score,
      MAX(score) as max_score
     FROM sites`,
    [lowScoreCutoff]
  );

  // Grades are ordered best-to-worst; grades not listed in the CASE sort last
  const gradeRows = await getAll(
    `SELECT grade, COUNT(*) as count
     FROM sites
     WHERE grade IS NOT NULL
     GROUP BY grade
     ORDER BY
       CASE grade
         WHEN 'A+' THEN 1 WHEN 'A' THEN 2 WHEN 'A-' THEN 3
         WHEN 'B+' THEN 4 WHEN 'B' THEN 5 WHEN 'B-' THEN 6
         WHEN 'C' THEN 7 WHEN 'D' THEN 8 WHEN 'E' THEN 9 WHEN 'F' THEN 10
       END`
  );

  const gradeDistribution = {};
  for (const row of gradeRows) {
    gradeDistribution[row.grade] = row.count;
  }

  return {
    ...stats,
    gradeDistribution,
  };
}