poc.js
#!/usr/bin/env node

/**
 * POC Orchestration
 * Main entry point for proof of concept - scans sites and scores them
 */

import Logger from './utils/logger.js';
import { processBatch, extractDomain } from './utils/error-handler.js';
import { scrapeSERP } from './scrape.js';
import { launchBrowser, createStealthContext, captureScreenshots } from './capture.js';
import { scoreWebsite, extractGrade, extractScore } from './score.js';
import { setContactsJson } from './utils/contacts-storage.js';
import './utils/load-env.js';
import { run, getOne, getAll, query, withTransaction, closePool, getPool } from './utils/db.js';

// Placeholder written to sites.contacts_json when the real contacts payload
// is handed to setContactsJson() instead of being stored in the column.
const FS_SENTINEL = '{"_fs":true}';

// Number of SERP results requested when no valid limit is supplied.
const DEFAULT_LIMIT = 10;

// Grades reported as "low-scoring" in the summary.
const LOW_GRADES = ['C+', 'C', 'C-', 'D+', 'D', 'D-', 'F'];

const logger = new Logger('POC');

/**
 * Main POC function.
 *
 * Scrapes the SERP for `keyword`, drops sites whose domain is already in the
 * `sites` table, processes the remaining sites in a browser, and prints a
 * summary. Any failure is logged and terminates the process with exit code 1.
 *
 * @param {string} keyword - Search keyword passed to scrapeSERP().
 * @param {number} [limit=10] - Maximum number of SERP results to request.
 * @returns {Promise<void>}
 */
async function runPOC(keyword, limit = DEFAULT_LIMIT) {
  logger.info(`Starting POC for keyword: "${keyword}" (limit: ${limit})`);

  try {
    // NOTE(review): logged unconditionally - no connection check is performed
    // in this function before this line.
    logger.success('Database connected');

    // Step 1: Scrape SERP
    logger.info('Step 1: Scraping SERP...');
    const serpResults = await scrapeSERP(keyword, limit);
    logger.success(`Found ${serpResults.length} sites from SERP`);

    // Step 1.5: Deduplicate against existing domains in database
    const existingDomains = await getExistingDomains();
    const newSites = serpResults.filter(site => !existingDomains.has(extractDomain(site.url)));

    const skipped = serpResults.length - newSites.length;
    if (skipped > 0) {
      logger.info(`Skipping ${skipped} sites already in database`);
    }

    if (newSites.length === 0) {
      logger.info('All sites already processed. Nothing to do.');
      return;
    }

    logger.success(`Processing ${newSites.length} new sites`);

    // Step 2: Launch browser
    logger.info('Step 2: Launching browser...');
    const browser = await launchBrowser({ headless: false, slowMo: 100 });

    let results;
    try {
      const context = await createStealthContext(browser);
      try {
        // Step 3: Process sites
        logger.info('Step 3: Processing sites...');
        results = await processSiteBatch(context, newSites, keyword);
      } finally {
        // Step 4: Close browser. Done in `finally` so the context and browser
        // are released even when processing throws; the outer catch calls
        // process.exit(), which would otherwise skip this cleanup.
        await context.close();
      }
    } finally {
      await browser.close();
      logger.info('Browser closed');
    }

    // Step 5: Generate summary
    generateSummary(results, keyword);

    logger.success('POC Complete!');
  } catch (error) {
    logger.error('POC failed', error);
    process.exit(1);
  }
}

/**
 * Process a batch of sites through processSingleSite() via processBatch().
 *
 * @param {object} context - Browser context forwarded to captureScreenshots().
 * @param {Array<{url: string}>} sites - Sites to process.
 * @param {string} keyword - Keyword stored alongside each site.
 * @returns {Promise<{total: number, processed: number, succeeded: number,
 *   failed: number, grades: Object<string, number>}>} Aggregate counters;
 *   `grades` maps each grade to the number of sites that received it.
 */
async function processSiteBatch(context, sites, keyword) {
  // Process sites with concurrency control
  const { results: siteResults, errors } = await processBatch(
    sites,
    // eslint-disable-next-line require-await -- Calls async function
    async site => processSingleSite(context, site, keyword),
    {
      concurrency: 5, // Lower concurrency for headed browser
      onProgress: (completed, total) => {
        logger.progress(completed, total, `Processing ${completed}/${total} sites`);
      },
      onError: error => {
        logger.error('Site processing error', error);
      },
    }
  );

  const succeeded = siteResults.filter(result => result.success).length;

  // processSingleSite() catches its own errors and reports them as
  // `{ success: false }`, so those never reach `errors`. Count both kinds of
  // failure, otherwise handled failures vanish from the summary.
  const handledFailures = siteResults.length - succeeded;

  // Count grades
  const grades = {};
  for (const { grade } of siteResults) {
    if (grade) {
      grades[grade] = (grades[grade] || 0) + 1;
    }
  }

  return {
    total: sites.length,
    processed: siteResults.length,
    succeeded,
    failed: errors.length + handledFailures,
    grades,
  };
}

/**
 * Process a single site: mark it as processing, capture screenshots, score
 * the capture, and store the result.
 *
 * Errors are caught here and reported through the return value rather than
 * thrown; the site row is marked 'failed' with the error message.
 *
 * @param {object} context - Browser context forwarded to captureScreenshots().
 * @param {{url: string}} site - Site to process.
 * @param {string} keyword - Keyword stored alongside the site.
 * @returns {Promise<{success: boolean, domain: string, grade?: *, score?: *,
 *   error?: string, httpError?: boolean}>}
 */
async function processSingleSite(context, site, keyword) {
  const domain = extractDomain(site.url);

  try {
    logger.info(`Processing: ${domain}`);

    // Update status to processing
    await updateSiteStatus(domain, site.url, keyword, 'processing');

    // Step 1: Capture screenshots
    const captureResults = await captureScreenshots(context, site.url, domain);

    if (captureResults.error) {
      throw new Error(`Capture failed: ${captureResults.error}`);
    }

    // Check if site returned 4xx/5xx error - set aside for later retry
    if (captureResults.httpStatusCode && captureResults.httpStatusCode >= 400) {
      const errorMsg = `HTTP ${captureResults.httpStatusCode} - Site returned error status, set aside for later retry`;
      logger.warn(errorMsg);
      await updateSiteStatus(domain, site.url, keyword, 'failed', errorMsg);

      return {
        success: false,
        domain,
        error: errorMsg,
        httpError: true,
      };
    }

    // Step 2: Score website
    const scoreResults = await scoreWebsite(captureResults);
    const grade = extractGrade(scoreResults);
    const score = extractScore(scoreResults);

    // Step 3: Store in database
    await storeSiteData(domain, site.url, keyword, captureResults, scoreResults);

    logger.success(`${domain}: ${grade} (${score})`);

    return {
      success: true,
      domain,
      grade,
      score,
    };
  } catch (error) {
    logger.error(`Failed to process ${domain}`, error);

    // Fall back to String() so a non-Error throw still yields a message.
    const message = error?.message ?? String(error);

    // Store error in database. A failure while recording the failure must not
    // replace the original error, so it is logged separately.
    try {
      await updateSiteStatus(domain, site.url, keyword, 'failed', message);
    } catch (statusError) {
      logger.error(`Failed to record failure status for ${domain}`, statusError);
    }

    return {
      success: false,
      domain,
      error: message,
    };
  }
}

/**
 * Get existing domains from database.
 *
 * @returns {Promise<Set<string>>} Every `domain` value in the `sites` table.
 */
async function getExistingDomains() {
  const rows = await getAll('SELECT domain FROM sites');
  return new Set(rows.map(row => row.domain));
}

/**
 * Update site status in database.
 *
 * Inserts a row for `domain`; when a row for that domain already exists, only
 * `processing_status`, `error_log` and `updated_at` are updated (the stored
 * URL and keyword are left as they were).
 *
 * @param {string} domain - Conflict key of the `sites` row.
 * @param {string} url - Landing page URL (used on insert only).
 * @param {string} keyword - Keyword (used on insert only).
 * @param {string} status - New processing status.
 * @param {string|null} [errorLog=null] - Error text to store, if any.
 * @returns {Promise<void>}
 */
async function updateSiteStatus(domain, url, keyword, status, errorLog = null) {
  await run(
    `INSERT INTO sites (domain, landing_page_url, keyword, processing_status, error_log, created_at)
     VALUES ($1, $2, $3, $4, $5, NOW())
     ON CONFLICT (domain) DO UPDATE SET
       processing_status = $6,
       error_log = $7,
       updated_at = NOW()`,
    [domain, url, keyword, status, errorLog, status, errorLog]
  );
}

/**
 * Store complete site data.
 *
 * Upserts the capture and score data for `domain` with status 'scored'. When
 * the score contains non-empty contact details, the column receives
 * FS_SENTINEL and the serialized contacts are passed to setContactsJson().
 *
 * @param {string} domain - Conflict key of the `sites` row.
 * @param {string} url - Landing page URL.
 * @param {string} keyword - Keyword the site was found with.
 * @param {object} captureData - Capture result; reads `screenshots`,
 *   `screenshotsUncropped`, `html` and `httpStatusCode`.
 * @param {object} scoreData - Score result; stored as JSON, and read for
 *   `contact_details` and `overall_calculation.conversion_score`.
 * @returns {Promise<void>}
 */
async function storeSiteData(domain, url, keyword, captureData, scoreData) {
  // Extract contact details if available - never save empty results
  let contactsJson = null;
  const contacts = scoreData.contact_details;
  // Only save if it's a non-empty object or array with items
  if (contacts && typeof contacts === 'object') {
    const hasContent = Array.isArray(contacts)
      ? contacts.length > 0
      : Object.keys(contacts).length > 0;
    if (hasContent) {
      contactsJson = JSON.stringify(contacts);
    }
  }

  const scoreJson = JSON.stringify(scoreData);
  // `??` rather than `||` so a legitimate score of 0 is stored instead of NULL.
  const conversionScore = scoreData?.overall_calculation?.conversion_score ?? null;

  // Values shared by the INSERT ($2..$14) and the ON CONFLICT UPDATE
  // ($15..$27); built once so the two lists cannot drift apart.
  const columnValues = [
    url,
    keyword,
    captureData.screenshots.desktop_above,
    captureData.screenshots.desktop_below,
    captureData.screenshots.mobile_above,
    captureData.screenshotsUncropped.desktop_above,
    captureData.screenshotsUncropped.desktop_below,
    captureData.screenshotsUncropped.mobile_above,
    captureData.html,
    captureData.httpStatusCode,
    scoreJson,
    conversionScore,
    contactsJson ? FS_SENTINEL : null,
  ];

  const result = await run(
    `INSERT INTO sites (
      domain,
      landing_page_url,
      keyword,
      screenshot_above_desktop,
      screenshot_below_desktop,
      screenshot_above_mobile,
      screenshot_above_desktop_uncropped,
      screenshot_below_desktop_uncropped,
      screenshot_above_mobile_uncropped,
      html_dom,
      http_status_code,
      conversion_score_json,
      conversion_score,
      contacts_json,
      processing_status,
      created_at,
      updated_at
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, 'scored', NOW(), NOW())
    ON CONFLICT (domain) DO UPDATE SET
      landing_page_url = $15,
      keyword = $16,
      screenshot_above_desktop = $17,
      screenshot_below_desktop = $18,
      screenshot_above_mobile = $19,
      screenshot_above_desktop_uncropped = $20,
      screenshot_below_desktop_uncropped = $21,
      screenshot_above_mobile_uncropped = $22,
      html_dom = $23,
      http_status_code = $24,
      conversion_score_json = $25,
      conversion_score = $26,
      contacts_json = $27,
      processing_status = 'scored',
      updated_at = NOW()
    RETURNING id`,
    [domain, ...columnValues, ...columnValues]
  );

  if (!contactsJson) {
    return;
  }

  // Write contacts to filesystem (sentinel already set in DB above)
  // NOTE(review): the statement uses RETURNING id, but the id is read from
  // `lastInsertRowid`; confirm what run() actually returns. The lookup by
  // domain below covers the case where that property is absent.
  if (result.lastInsertRowid) {
    setContactsJson(result.lastInsertRowid, contactsJson);
    return;
  }

  const siteRow = await getOne('SELECT id FROM sites WHERE domain = $1', [domain]);
  if (siteRow) {
    setContactsJson(siteRow.id, contactsJson);
  }
}

/**
 * Generate summary report and print it to stdout.
 *
 * @param {{total: number, processed: number, succeeded: number,
 *   failed: number, grades: Object<string, number>}} results - Counters
 *   produced by processSiteBatch().
 * @param {string} keyword - Keyword the run was started with.
 * @returns {void}
 */
function generateSummary(results, keyword) {
  const divider = '='.repeat(60);

  // Share of successfully scored sites, one decimal place. Guards against
  // 0 / 0, which would otherwise print "NaN%" when no site succeeded.
  const percentOfSucceeded = count =>
    results.succeeded > 0 ? ((count / results.succeeded) * 100).toFixed(1) : '0.0';

  console.log(`\n${divider}`);
  console.log('POC SUMMARY');
  console.log(divider);
  console.log(`\nKeyword: ${keyword}`);
  console.log(`Total sites: ${results.total}`);
  console.log(`Processed: ${results.processed}`);
  console.log(`Succeeded: ${results.succeeded}`);
  console.log(`Failed: ${results.failed}`);

  console.log('\n📊 Grade Distribution:');
  const gradeEntries = Object.entries(results.grades);
  // Most frequent grade first.
  [...gradeEntries]
    .sort((a, b) => b[1] - a[1])
    .forEach(([grade, count]) => {
      console.log(` ${grade}: ${count} (${percentOfSucceeded(count)}%)`);
    });

  // Calculate low-scoring sites (C or below)
  const lowScoreCount = gradeEntries
    .filter(([grade]) => LOW_GRADES.includes(grade))
    .reduce((sum, [, count]) => sum + count, 0);

  const lowScorePct = percentOfSucceeded(lowScoreCount);

  console.log(`\n🎯 Low-scoring sites (C or below): ${lowScoreCount} (${lowScorePct}%)`);
  console.log(`💰 Potential market: ${lowScoreCount} sites need optimization`);

  console.log('\n📝 Next steps:');
  console.log(' 1. Review results in DBeaver: Open db/sites.db');
  console.log(' 2. Analyze grade distribution');
  console.log(' 3. Plan MVP outreach for low scorers');

  console.log(`\n${divider}\n`);
}

/**
 * Parse command-line arguments and run
 */
const args = process.argv.slice(2);

if (args.length === 0) {
  console.log('Usage: npm run poc "<keyword>" [limit]');
  console.log('Example: npm run poc "plumber seattle" 10');
  process.exit(1);
}

const keyword = args[0];
// Explicit radix; anything that is not a positive integer (missing, NaN, 0,
// negative) falls back to the default.
const parsedLimit = Number.parseInt(args[1], 10);
const limit = Number.isInteger(parsedLimit) && parsedLimit > 0 ? parsedLimit : DEFAULT_LIMIT;

runPOC(keyword, limit);