/ scripts / run-full-pipeline.js
run-full-pipeline.js
  1  #!/usr/bin/env node
  2  /**
  3   * Run full pipeline for specific site IDs (all stages: assets → scoring → rescoring → enrich → proposals)
  4   * Usage: node scripts/run-full-pipeline.js 17314 17315 17316 17318 17319 17321 17322
  5   */
  6  
  7  import 'dotenv/config';
  8  import { createDatabaseConnection } from '../src/utils/db.js';
  9  import { readFileSync } from 'fs';
 10  import { captureWebsite } from '../src/capture.js';
 11  import { scoreWebsite } from '../src/score.js';
 12  import { generateProposalVariants } from '../src/proposal-generator-v2.js';
 13  import { saveScreenshots, loadScreenshot } from '../src/utils/screenshot-storage.js';
 14  import Logger from '../src/utils/logger.js';
 15  import {
 16    incrementAssetsScraped,
 17    incrementLowScoring,
 18    incrementRescored,
 19  } from '../src/utils/keyword-counters.js';
 20  import { callLLM } from '../src/utils/llm-provider.js';
 21  import { setScoreJson, getScoreDataWithFallback } from '../src/utils/score-storage.js';
 22  import { setContactsJson } from '../src/utils/contacts-storage.js';
 23  
 24  const FS_SENTINEL = '{"_fs":true}';
 25  
 26  const logger = new Logger('ProcessSiteIds');
 27  const dbPath = process.env.DATABASE_PATH || './db/sites.db';
 28  const siteIds = process.argv.slice(2).map(id => parseInt(id, 10));
 29  
 30  // Load vision prompt for text extraction
 31  const VISION_PROMPT = readFileSync(new URL('../prompts/VISION.md', import.meta.url), 'utf-8');
 32  const VISION_MODEL = process.env.VISION_MODEL || 'openai/gpt-4o-mini';
 33  
 34  /**
 35   * Extract text from below-fold screenshot using vision LLM
 36   */
 37  async function extractTextFromImage(screenshotBase64) {
 38    try {
 39      const messages = [
 40        {
 41          role: 'system',
 42          content: VISION_PROMPT,
 43        },
 44        {
 45          role: 'user',
 46          content: [
 47            {
 48              type: 'text',
 49              text: 'Extract all visible text from this screenshot:',
 50            },
 51            {
 52              type: 'image_url',
 53              image_url: {
 54                url: `data:image/jpeg;base64,${screenshotBase64}`,
 55              },
 56            },
 57          ],
 58        },
 59      ];
 60  
 61      const response = await callLLM({
 62        model: VISION_MODEL,
 63        messages,
 64        temperature: 0.1,
 65        max_tokens: 2000,
 66      });
 67  
 68      return response.content || '';
 69    } catch (err) {
 70      logger.error(`Error extracting text from image: ${err.message}`);
 71      return '';
 72    }
 73  }
 74  
 75  if (siteIds.length === 0) {
 76    console.error('Usage: node scripts/process-site-ids.js ID1 ID2 ID3 ...');
 77    process.exit(1);
 78  }
 79  
 80  const db = createDatabaseConnection(dbPath);
 81  
 82  async function processOneSite(siteId) {
 83    const site = db.prepare('SELECT * FROM sites WHERE id = ?').get(siteId);
 84  
 85    if (!site) {
 86      logger.error(`Site ${siteId} not found`);
 87      return false;
 88    }
 89  
 90    logger.info(`\n${'='.repeat(80)}`);
 91    logger.info(`Processing Site ${site.id}: ${site.domain}`);
 92    logger.info(`URL: ${site.landing_page_url}`);
 93    logger.info(`Keyword: ${site.keyword}`);
 94    logger.info(`Current Status: ${site.status}`);
 95    logger.info(`${'='.repeat(80)}\n`);
 96  
 97    try {
 98      // Step 1: Assets (if needed)
 99      if (site.status === 'found') {
100        logger.info('[1/3] Capturing screenshots...');
101        const result = await captureWebsite(site.landing_page_url);
102  
103        const screenshotData = {
104          desktop_above: result.screenshots.desktop_above,
105          desktop_below: result.screenshots.desktop_below,
106          mobile_above: result.screenshots.mobile_above,
107          desktop_above_uncropped: result.screenshotsUncropped.desktop_above,
108          desktop_below_uncropped: result.screenshotsUncropped.desktop_below,
109          mobile_above_uncropped: result.screenshotsUncropped.mobile_above,
110        };
111  
112        const screenshotPath = await saveScreenshots(site.id, screenshotData);
113  
114        const isSuccess = result.httpStatusCode >= 200 && result.httpStatusCode < 400;
115        db.prepare(
116          `UPDATE sites SET
117            screenshot_path = ?,
118            html_dom = ?,
119            http_status_code = ?,
120            status = ?,
121            error_message = NULL
122          WHERE id = ?`
123        ).run(
124          screenshotPath,
125          result.html,
126          result.httpStatusCode,
127          isSuccess ? 'assets_captured' : 'found',
128          site.id
129        );
130  
131        if (!isSuccess) {
132          throw new Error(`HTTP ${result.httpStatusCode}`);
133        }
134  
135        incrementAssetsScraped(db, site.keyword, site.country_code);
136        logger.success(`✓ Screenshots: ${screenshotPath}/`);
137  
138        // Reload site
139        Object.assign(site, db.prepare('SELECT * FROM sites WHERE id = ?').get(site.id));
140      } else {
141        logger.info('[1/3] ✓ Already has screenshots');
142      }
143  
144      // Step 2: Scoring (if needed)
145      if (site.status === 'assets_captured') {
146        logger.info('[2/3] Scoring...');
147  
148        // Load screenshots from file system
149        const desktopAbove = await loadScreenshot(site.screenshot_path, 'desktop_above');
150        const mobileAbove = await loadScreenshot(site.screenshot_path, 'mobile_above');
151  
152        // Prepare site data for scoring
153        const siteData = {
154          url: site.landing_page_url,
155          domain: new URL(site.landing_page_url).hostname,
156          screenshots: {
157            desktop_above: desktopAbove,
158            mobile_above: mobileAbove,
159          },
160          html: site.html_dom || '',
161        };
162  
163        const result = await scoreWebsite(siteData, site.id);
164  
165        // Extract grade and score from nested structure
166        const grade = result?.overall_calculation?.letter_grade || null;
167        const score = result?.overall_calculation?.conversion_score || null;
168  
169        setScoreJson(site.id, JSON.stringify(result));
170        db.prepare(
171          `UPDATE sites SET
172            score = ?,
173            grade = ?,
174            score_json = '{"_fs":true}',
175            status = 'prog_scored',
176            scored_at = CURRENT_TIMESTAMP,
177            error_message = NULL
178          WHERE id = ?`
179        ).run(score, grade, site.id);
180  
181        // Increment keyword counter if low-scoring (< 82)
182        if (score !== null && score < 82 && site.keyword && site.country_code) {
183          incrementLowScoring(db, site.keyword, site.country_code);
184        }
185  
186        logger.success(`✓ Score: ${score} (${grade})`);
187  
188        // Reload site
189        Object.assign(site, db.prepare('SELECT * FROM sites WHERE id = ?').get(site.id));
190      } else if (['prog_scored', 'semantic_scored', 'vision_scored'].includes(site.status)) {
191        logger.info(`[2/3] ✓ Already scored: ${site.score} (${site.grade})`);
192      } else {
193        logger.info('[2/3] Skipped (not at assets_captured)');
194      }
195  
196      // Step 2.5: Rescoring (if score <= 82, to get vision_analysis with contacts)
197      if (site.status === 'prog_scored' && site.score !== null && site.score <= 82) {
198        logger.info('[2.5/4] Rescoring with below-fold screenshot...');
199  
200        // Load below-fold screenshot
201        const desktopAbove = await loadScreenshot(site.screenshot_path, 'desktop_above');
202        const desktopBelow = await loadScreenshot(site.screenshot_path, 'desktop_below');
203        const mobileAbove = await loadScreenshot(site.screenshot_path, 'mobile_above');
204  
205        // Extract text from below-fold screenshot
206        let visionText = '';
207        if (desktopBelow) {
208          const desktopBelowBase64 = desktopBelow.toString('base64');
209          visionText = await extractTextFromImage(desktopBelowBase64);
210          if (visionText) {
211            logger.info(`  Extracted ${visionText.length} chars of text from below-fold screenshot`);
212          }
213        }
214  
215        // Prepare site data for rescoring
216        const siteData = {
217          url: site.landing_page_url,
218          domain: new URL(site.landing_page_url).hostname,
219          screenshots: {
220            desktop_above: desktopAbove,
221            desktop_below: desktopBelow,
222            mobile_above: mobileAbove,
223          },
224          html: site.html_dom || '',
225          visionText, // Include vision text for contact extraction
226        };
227  
228        const result = await scoreWebsite(siteData, site.id);
229  
230        // Extract grade and score from nested structure
231        const grade = result?.overall_calculation?.letter_grade || null;
232        const score = result?.overall_calculation?.conversion_score || null;
233        const city = result?.contact_details?.city || null;
234        const countryCode = result?.contact_details?.country_code || null;
235  
236        setScoreJson(site.id, JSON.stringify(result));
237        db.prepare(
238          `UPDATE sites SET
239            score = ?,
240            grade = ?,
241            score_json = '{"_fs":true}',
242            city = ?,
243            country_code = ?,
244            status = 'vision_scored',
245            rescored_at = CURRENT_TIMESTAMP,
246            error_message = NULL
247          WHERE id = ?`
248        ).run(score, grade, city, countryCode, site.id);
249  
250        // Increment keyword counter
251        if (site.keyword && site.country_code) {
252          incrementRescored(db, site.keyword, site.country_code);
253        }
254  
255        const oldScore = site.score;
256        const improved = score > oldScore;
257        const change = improved ? `↑ +${(score - oldScore).toFixed(1)}` : `→ ${score}`;
258  
259        logger.success(`✓ Rescored: ${score} (${grade}) [${oldScore} ${change}]`);
260  
261        // Reload site
262        Object.assign(site, db.prepare('SELECT * FROM sites WHERE id = ?').get(site.id));
263      } else if (site.status === 'vision_scored') {
264        logger.info(`[2.5/4] ✓ Already vision_scored: ${site.score} (${site.grade})`);
265      } else if (site.score > 82) {
266        logger.info(`[2.5/4] Score too high (${site.score}), rescoring not needed`);
267      } else {
268        logger.info('[2.5/4] Not ready for rescoring');
269      }
270  
271      // Step 3: Enrichment (extract contacts from scoring data)
272      if (['prog_scored', 'semantic_scored', 'vision_scored'].includes(site.status)) {
273        logger.info('[3/4] Enriching (extracting contacts from scoring)...');
274  
275        // Get contacts from score_json (contact_details section)
276        const scoreJson = getScoreDataWithFallback(site.id, site);
277        const contactDetails = scoreJson?.contact_details || {};
278  
279        // Build contacts from contact_details
280        const contacts = {
281          email_addresses: contactDetails.email_addresses || [],
282          phone_numbers: contactDetails.phone_numbers || [],
283          primary_contact_form: contactDetails.primary_contact_form || null,
284        };
285  
286        // Extract city and country from contact_details or keyword
287        let city = contactDetails.city || null;
288        const countryCode = contactDetails.country_code || null;
289  
290        if (!city && site.keyword) {
291          const parts = site.keyword.split(' ');
292          city = parts[parts.length - 1];
293        }
294  
295        setContactsJson(site.id, JSON.stringify(contacts));
296        db.prepare(
297          `UPDATE sites SET
298            contacts_json = '{"_fs":true}',
299            city = ?,
300            country_code = ?,
301            status = 'enriched',
302            enriched_at = CURRENT_TIMESTAMP,
303            error_message = NULL
304          WHERE id = ?`
305        ).run(city, countryCode, site.id);
306  
307        logger.success(
308          `✓ Enriched: ${contacts.email_addresses.length} emails, ${contacts.phone_numbers.length} phones${contacts.primary_contact_form ? ', 1 form' : ''}`
309        );
310  
311        // Reload site
312        Object.assign(site, db.prepare('SELECT * FROM sites WHERE id = ?').get(site.id));
313      } else {
314        logger.info('[3/4] ✓ Already enriched or not ready');
315      }
316  
317      // Step 4: Proposals (if score <= 82)
318      if (
319        (['enriched', 'enriched_llm'].includes(site.status) ||
320          ['semantic_scored', 'vision_scored'].includes(site.status)) &&
321        site.score !== null &&
322        site.score <= 82
323      ) {
324        logger.info('[4/4] Generating proposals...');
325        await generateProposalVariants(site.id);
326  
327        const proposalCount = db
328          .prepare('SELECT COUNT(*) as count FROM outreaches WHERE site_id = ?')
329          .get(site.id).count;
330  
331        db.prepare('UPDATE sites SET status = ? WHERE id = ?').run('proposals_drafted', site.id);
332  
333        logger.success(`✓ Generated ${proposalCount} proposals`);
334  
335        // Show proposals
336        const proposals = db
337          .prepare(
338            `SELECT id, contact_method, contact_uri
339             FROM outreaches
340             WHERE site_id = ?
341             LIMIT 5`
342          )
343          .all(site.id);
344  
345        for (const p of proposals) {
346          logger.info(`  [${p.id}] ${p.contact_method}: ${p.contact_uri}`);
347        }
348      } else if (site.score > 82) {
349        logger.info(`[4/4] Score too high (${site.score}), no proposals needed`);
350      } else {
351        logger.info('[4/4] Not ready for proposals yet');
352      }
353  
354      logger.success(`\n✓ Site ${site.id} completed successfully\n`);
355      return true;
356    } catch (err) {
357      logger.error(`✗ Failed: ${err.message}\n`);
358      db.prepare('UPDATE sites SET error_message = ? WHERE id = ?').run(err.message, site.id);
359      return false;
360    }
361  }
362  
// Driver: process the requested sites one at a time, print a summary, and
// always close the database handle, even if something unexpected throws.
try {
  logger.info(`Processing ${siteIds.length} sites...\n`);

  let succeeded = 0;
  let failed = 0;

  for (const siteId of siteIds) {
    const success = await processOneSite(siteId);
    if (success) {
      succeeded++;
    } else {
      failed++;
    }
  }

  logger.info(`\n${'='.repeat(80)}`);
  logger.info('Summary');
  logger.info(`${'='.repeat(80)}`);
  logger.success(`Succeeded: ${succeeded}`);
  if (failed > 0) {
    logger.error(`Failed: ${failed}`);
    // Signal partial failure to the caller (shell scripts, CI) without
    // cutting the run short; the finally block still runs.
    process.exitCode = 1;
  }
  logger.info(`${'='.repeat(80)}\n`);
} finally {
  db.close();
}
387    db.close();
388  }