#!/usr/bin/env node
/**
 * run-full-pipeline.js
 *
 * Run full pipeline for specific site IDs (all stages: assets → scoring → rescoring → enrich → proposals)
 * Usage: node scripts/run-full-pipeline.js 17314 17315 17316 17318 17319 17321 17322
 *
 * Each site is advanced as far as its current `status` column allows; a stage
 * whose precondition is not met is logged and skipped rather than treated as
 * an error.
 */

import 'dotenv/config';
import { createDatabaseConnection } from '../src/utils/db.js';
import { readFileSync } from 'fs';
import { captureWebsite } from '../src/capture.js';
import { scoreWebsite } from '../src/score.js';
import { generateProposalVariants } from '../src/proposal-generator-v2.js';
import { saveScreenshots, loadScreenshot } from '../src/utils/screenshot-storage.js';
import Logger from '../src/utils/logger.js';
import {
  incrementAssetsScraped,
  incrementLowScoring,
  incrementRescored,
} from '../src/utils/keyword-counters.js';
import { callLLM } from '../src/utils/llm-provider.js';
import { setScoreJson, getScoreDataWithFallback } from '../src/utils/score-storage.js';
import { setContactsJson } from '../src/utils/contacts-storage.js';

// Placeholder written to the *_json columns; the real payload is handed to
// setScoreJson / setContactsJson instead of being stored inline.
const FS_SENTINEL = '{"_fs":true}';

// Score cut-off used by the low-score counter, the rescoring stage and the
// proposal stage.
// NOTE(review): the low-score counter uses `< 82` while rescoring and
// proposals use `<= 82`, so a site scoring exactly 82 is rescored and gets
// proposals but is not counted as low-scoring. Preserved as-is — confirm
// which comparison is intended.
const LOW_SCORE_THRESHOLD = 82;

// Statuses that mean "a score exists", in any of its flavours.
const SCORED_STATUSES = ['prog_scored', 'semantic_scored', 'vision_scored'];

// Statuses from which proposals may be generated.
const PROPOSAL_READY_STATUSES = ['enriched', 'enriched_llm', 'semantic_scored', 'vision_scored'];

const logger = new Logger('ProcessSiteIds');
const dbPath = process.env.DATABASE_PATH || './db/sites.db';
const siteIds = process.argv.slice(2).map(id => Number.parseInt(id, 10));

// Load vision prompt for text extraction
const VISION_PROMPT = readFileSync(new URL('../prompts/VISION.md', import.meta.url), 'utf-8');
const VISION_MODEL = process.env.VISION_MODEL || 'openai/gpt-4o-mini';

/**
 * Extract text from below-fold screenshot using vision LLM.
 *
 * Best-effort: any failure is logged and an empty string is returned so the
 * caller can continue without vision text.
 *
 * @param {string} screenshotBase64 - Base64 image data, embedded in a `data:image/jpeg` URL.
 * @returns {Promise<string>} The `content` of the LLM response, or `''` when it is empty or the call fails.
 */
async function extractTextFromImage(screenshotBase64) {
  try {
    const messages = [
      {
        role: 'system',
        content: VISION_PROMPT,
      },
      {
        role: 'user',
        content: [
          {
            type: 'text',
            text: 'Extract all visible text from this screenshot:',
          },
          {
            type: 'image_url',
            image_url: {
              url: `data:image/jpeg;base64,${screenshotBase64}`,
            },
          },
        ],
      },
    ];

    const response = await callLLM({
      model: VISION_MODEL,
      messages,
      temperature: 0.1,
      max_tokens: 2000,
    });

    return response.content || '';
  } catch (err) {
    logger.error(`Error extracting text from image: ${err.message}`);
    return '';
  }
}

if (siteIds.length === 0) {
  console.error('Usage: node scripts/run-full-pipeline.js ID1 ID2 ID3 ...');
  process.exit(1);
}

const db = createDatabaseConnection(dbPath);

/**
 * Refresh an in-memory site row from the database, in place.
 *
 * @param {object} site - Row object previously read from `sites`; mutated.
 */
function reloadSite(site) {
  Object.assign(site, db.prepare('SELECT * FROM sites WHERE id = ?').get(site.id));
}

/**
 * Build the payload passed to `scoreWebsite`.
 *
 * @param {object} site - Row from `sites`.
 * @param {object} screenshots - Screenshot map to send along.
 * @param {object} [extra] - Additional properties appended to the payload.
 * @returns {object} Site data with `url`, `domain`, `screenshots`, `html` and any extras.
 * @throws {TypeError} If `site.landing_page_url` is not a valid URL.
 */
function buildSiteData(site, screenshots, extra = {}) {
  return {
    url: site.landing_page_url,
    domain: new URL(site.landing_page_url).hostname,
    screenshots,
    html: site.html_dom || '',
    ...extra,
  };
}

/**
 * Pull the grade and numeric score out of a scoring result.
 *
 * @param {object} result - Value returned by `scoreWebsite`.
 * @returns {{grade: (string|null), score: (number|null)}} Missing values become `null`.
 */
function summarizeScore(result) {
  return {
    grade: result?.overall_calculation?.letter_grade || null,
    // `??` rather than `||`: a score of 0 is a real score, not a missing one.
    score: result?.overall_calculation?.conversion_score ?? null,
  };
}

/**
 * Stage 1: capture screenshots and HTML for a site and persist them.
 *
 * @param {object} site - Row from `sites`; reloaded in place on success.
 * @throws {Error} `HTTP <code>` when the captured status code is outside 200-399.
 */
async function captureAssets(site) {
  const result = await captureWebsite(site.landing_page_url);

  const screenshotData = {
    desktop_above: result.screenshots.desktop_above,
    desktop_below: result.screenshots.desktop_below,
    mobile_above: result.screenshots.mobile_above,
    desktop_above_uncropped: result.screenshotsUncropped.desktop_above,
    desktop_below_uncropped: result.screenshotsUncropped.desktop_below,
    mobile_above_uncropped: result.screenshotsUncropped.mobile_above,
  };

  const screenshotPath = await saveScreenshots(site.id, screenshotData);

  const isSuccess = result.httpStatusCode >= 200 && result.httpStatusCode < 400;
  // The row is written even on a bad status code (status stays 'found') so the
  // captured HTML and status code are kept; the throw below then records the error.
  db.prepare(
    `UPDATE sites SET
       screenshot_path = ?,
       html_dom = ?,
       http_status_code = ?,
       status = ?,
       error_message = NULL
     WHERE id = ?`
  ).run(
    screenshotPath,
    result.html,
    result.httpStatusCode,
    isSuccess ? 'assets_captured' : 'found',
    site.id
  );

  if (!isSuccess) {
    throw new Error(`HTTP ${result.httpStatusCode}`);
  }

  incrementAssetsScraped(db, site.keyword, site.country_code);
  logger.success(`✓ Screenshots: ${screenshotPath}/`);

  reloadSite(site);
}

/**
 * Stage 2: score a site from its above-fold screenshots and HTML.
 *
 * @param {object} site - Row from `sites`; reloaded in place afterwards.
 */
async function scoreSite(site) {
  // Load screenshots from file system
  const desktopAbove = await loadScreenshot(site.screenshot_path, 'desktop_above');
  const mobileAbove = await loadScreenshot(site.screenshot_path, 'mobile_above');

  const siteData = buildSiteData(site, {
    desktop_above: desktopAbove,
    mobile_above: mobileAbove,
  });

  const result = await scoreWebsite(siteData, site.id);
  const { grade, score } = summarizeScore(result);

  setScoreJson(site.id, JSON.stringify(result));
  db.prepare(
    `UPDATE sites SET
       score = ?,
       grade = ?,
       score_json = ?,
       status = 'prog_scored',
       scored_at = CURRENT_TIMESTAMP,
       error_message = NULL
     WHERE id = ?`
  ).run(score, grade, FS_SENTINEL, site.id);

  // Increment keyword counter if low-scoring (< 82)
  if (score !== null && score < LOW_SCORE_THRESHOLD && site.keyword && site.country_code) {
    incrementLowScoring(db, site.keyword, site.country_code);
  }

  logger.success(`✓ Score: ${score} (${grade})`);

  reloadSite(site);
}

/**
 * Stage 2.5: rescore a low-scoring site with the below-fold screenshot and
 * the text extracted from it, which also yields contact details.
 *
 * @param {object} site - Row from `sites`; reloaded in place afterwards.
 */
async function rescoreSite(site) {
  const desktopAbove = await loadScreenshot(site.screenshot_path, 'desktop_above');
  const desktopBelow = await loadScreenshot(site.screenshot_path, 'desktop_below');
  const mobileAbove = await loadScreenshot(site.screenshot_path, 'mobile_above');

  // Extract text from below-fold screenshot
  let visionText = '';
  if (desktopBelow) {
    const desktopBelowBase64 = desktopBelow.toString('base64');
    visionText = await extractTextFromImage(desktopBelowBase64);
    if (visionText) {
      logger.info(`  Extracted ${visionText.length} chars of text from below-fold screenshot`);
    }
  }

  const siteData = buildSiteData(
    site,
    {
      desktop_above: desktopAbove,
      desktop_below: desktopBelow,
      mobile_above: mobileAbove,
    },
    { visionText } // Include vision text for contact extraction
  );

  const result = await scoreWebsite(siteData, site.id);
  const { grade, score } = summarizeScore(result);

  // Keep whatever location the row already has when the result carries none,
  // instead of overwriting it with NULL.
  const city = result?.contact_details?.city || site.city || null;
  const countryCode = result?.contact_details?.country_code || site.country_code || null;

  setScoreJson(site.id, JSON.stringify(result));
  db.prepare(
    `UPDATE sites SET
       score = ?,
       grade = ?,
       score_json = ?,
       city = ?,
       country_code = ?,
       status = 'vision_scored',
       rescored_at = CURRENT_TIMESTAMP,
       error_message = NULL
     WHERE id = ?`
  ).run(score, grade, FS_SENTINEL, city, countryCode, site.id);

  // Increment keyword counter
  if (site.keyword && site.country_code) {
    incrementRescored(db, site.keyword, site.country_code);
  }

  // `site` has not been reloaded yet, so site.score is still the previous score.
  const oldScore = site.score;
  const improved = score > oldScore;
  const change = improved ? `↑ +${(score - oldScore).toFixed(1)}` : `→ ${score}`;

  logger.success(`✓ Rescored: ${score} (${grade}) [${oldScore} ${change}]`);

  reloadSite(site);
}

/**
 * Stage 3: derive contacts, city and country from the stored scoring data.
 *
 * @param {object} site - Row from `sites`; reloaded in place afterwards.
 */
function enrichSite(site) {
  // Get contacts from score_json (contact_details section)
  const scoreJson = getScoreDataWithFallback(site.id, site);
  const contactDetails = scoreJson?.contact_details || {};

  const contacts = {
    email_addresses: contactDetails.email_addresses || [],
    phone_numbers: contactDetails.phone_numbers || [],
    primary_contact_form: contactDetails.primary_contact_form || null,
  };

  // Prefer the scoring data, then what the row already holds, so an existing
  // value is never replaced by NULL.
  let city = contactDetails.city || site.city || null;
  const countryCode = contactDetails.country_code || site.country_code || null;

  if (!city && site.keyword) {
    // Last resort: take the final word of the keyword as the city.
    // Trimming first avoids an empty "word" from trailing whitespace.
    const parts = site.keyword.trim().split(/\s+/);
    city = parts[parts.length - 1] || null;
  }

  setContactsJson(site.id, JSON.stringify(contacts));
  db.prepare(
    `UPDATE sites SET
       contacts_json = ?,
       city = ?,
       country_code = ?,
       status = 'enriched',
       enriched_at = CURRENT_TIMESTAMP,
       error_message = NULL
     WHERE id = ?`
  ).run(FS_SENTINEL, city, countryCode, site.id);

  logger.success(
    `✓ Enriched: ${contacts.email_addresses.length} emails, ${contacts.phone_numbers.length} phones${contacts.primary_contact_form ? ', 1 form' : ''}`
  );

  reloadSite(site);
}

/**
 * Stage 4: generate proposal variants and mark the site as drafted.
 *
 * @param {object} site - Row from `sites`.
 */
async function draftProposals(site) {
  await generateProposalVariants(site.id);

  const proposalCount = db
    .prepare('SELECT COUNT(*) as count FROM outreaches WHERE site_id = ?')
    .get(site.id).count;

  db.prepare('UPDATE sites SET status = ? WHERE id = ?').run('proposals_drafted', site.id);

  logger.success(`✓ Generated ${proposalCount} proposals`);

  // Show proposals
  const proposals = db
    .prepare(
      `SELECT id, contact_method, contact_uri
       FROM outreaches
       WHERE site_id = ?
       LIMIT 5`
    )
    .all(site.id);

  for (const p of proposals) {
    logger.info(`  [${p.id}] ${p.contact_method}: ${p.contact_uri}`);
  }
}

/**
 * Run every applicable pipeline stage for one site.
 *
 * Any error thrown by a stage is logged, stored in `sites.error_message`,
 * and reported as a failure; it does not propagate to the caller.
 *
 * @param {number} siteId - Primary key of the row in `sites`.
 * @returns {Promise<boolean>} `true` when all applicable stages finished, `false` when
 *   the ID is invalid, the site does not exist, or a stage threw.
 */
async function processOneSite(siteId) {
  // A non-numeric CLI argument parses to NaN; reject it before touching the DB.
  if (!Number.isInteger(siteId)) {
    logger.error(`Invalid site ID: ${siteId}`);
    return false;
  }

  const site = db.prepare('SELECT * FROM sites WHERE id = ?').get(siteId);

  if (!site) {
    logger.error(`Site ${siteId} not found`);
    return false;
  }

  logger.info(`\n${'='.repeat(80)}`);
  logger.info(`Processing Site ${site.id}: ${site.domain}`);
  logger.info(`URL: ${site.landing_page_url}`);
  logger.info(`Keyword: ${site.keyword}`);
  logger.info(`Current Status: ${site.status}`);
  logger.info(`${'='.repeat(80)}\n`);

  try {
    // Step 1: Assets (if needed)
    if (site.status === 'found') {
      logger.info('[1/3] Capturing screenshots...');
      await captureAssets(site);
    } else {
      logger.info('[1/3] ✓ Already has screenshots');
    }

    // Step 2: Scoring (if needed)
    if (site.status === 'assets_captured') {
      logger.info('[2/3] Scoring...');
      await scoreSite(site);
    } else if (SCORED_STATUSES.includes(site.status)) {
      logger.info(`[2/3] ✓ Already scored: ${site.score} (${site.grade})`);
    } else {
      logger.info('[2/3] Skipped (not at assets_captured)');
    }

    // Step 2.5: Rescoring (if score <= 82, to get vision_analysis with contacts)
    if (
      site.status === 'prog_scored' &&
      site.score !== null &&
      site.score <= LOW_SCORE_THRESHOLD
    ) {
      logger.info('[2.5/4] Rescoring with below-fold screenshot...');
      await rescoreSite(site);
    } else if (site.status === 'vision_scored') {
      logger.info(`[2.5/4] ✓ Already vision_scored: ${site.score} (${site.grade})`);
    } else if (site.score > LOW_SCORE_THRESHOLD) {
      logger.info(`[2.5/4] Score too high (${site.score}), rescoring not needed`);
    } else {
      logger.info('[2.5/4] Not ready for rescoring');
    }

    // Step 3: Enrichment (extract contacts from scoring data)
    if (SCORED_STATUSES.includes(site.status)) {
      logger.info('[3/4] Enriching (extracting contacts from scoring)...');
      enrichSite(site);
    } else {
      logger.info('[3/4] ✓ Already enriched or not ready');
    }

    // Step 4: Proposals (if score <= 82)
    if (
      PROPOSAL_READY_STATUSES.includes(site.status) &&
      site.score !== null &&
      site.score <= LOW_SCORE_THRESHOLD
    ) {
      logger.info('[4/4] Generating proposals...');
      await draftProposals(site);
    } else if (site.score > LOW_SCORE_THRESHOLD) {
      logger.info(`[4/4] Score too high (${site.score}), no proposals needed`);
    } else {
      logger.info('[4/4] Not ready for proposals yet');
    }

    logger.success(`\n✓ Site ${site.id} completed successfully\n`);
    return true;
  } catch (err) {
    logger.error(`✗ Failed: ${err.message}\n`);
    db.prepare('UPDATE sites SET error_message = ? WHERE id = ?').run(err.message, site.id);
    return false;
  }
}

// Driver: process the requested sites one after another, then print a summary.
// The connection is closed even if something unexpected throws.
try {
  logger.info(`Processing ${siteIds.length} sites...\n`);

  let succeeded = 0;
  let failed = 0;

  for (const siteId of siteIds) {
    const success = await processOneSite(siteId);
    if (success) {
      succeeded++;
    } else {
      failed++;
    }
  }

  logger.info(`\n${'='.repeat(80)}`);
  logger.info('Summary');
  logger.info(`${'='.repeat(80)}`);
  logger.success(`Succeeded: ${succeeded}`);
  if (failed > 0) {
    logger.error(`Failed: ${failed}`);
  }
  logger.info(`${'='.repeat(80)}\n`);
} finally {
  db.close();
}