/ src / poc.js
poc.js
  1  #!/usr/bin/env node
  2  
  3  /**
  4   * POC Orchestration
  5   * Main entry point for proof of concept - scans sites and scores them
  6   */
  7  
  8  import Logger from './utils/logger.js';
  9  import { processBatch, extractDomain } from './utils/error-handler.js';
 10  import { scrapeSERP } from './scrape.js';
 11  import { launchBrowser, createStealthContext, captureScreenshots } from './capture.js';
 12  import { scoreWebsite, extractGrade, extractScore } from './score.js';
 13  import { setContactsJson } from './utils/contacts-storage.js';
 14  import './utils/load-env.js';
 15  import { run, getOne, getAll, query, withTransaction, closePool, getPool } from './utils/db.js';
 16  
// Placeholder value written to the contacts_json column when a site has
// contacts; the actual contacts JSON is handed to setContactsJson() instead
// of being stored inline in the row (see storeSiteData).
const FS_SENTINEL = '{"_fs":true}';

// Logger shared by every function in this file, constructed with the name 'POC'.
const logger = new Logger('POC');
 20  
 21  /**
 22   * Main POC function
 23   */
 24  async function runPOC(keyword, limit = 10) {
 25    logger.info(`Starting POC for keyword: "${keyword}" (limit: ${limit})`);
 26  
 27    try {
 28      logger.success('Database connected');
 29  
 30      // Step 1: Scrape SERP
 31      logger.info('Step 1: Scraping SERP...');
 32      const serpResults = await scrapeSERP(keyword, limit);
 33      logger.success(`Found ${serpResults.length} sites from SERP`);
 34  
 35      // Step 1.5: Deduplicate against existing domains in database
 36      const existingDomains = await getExistingDomains();
 37      const newSites = serpResults.filter(site => {
 38        const domain = extractDomain(site.url);
 39        return !existingDomains.has(domain);
 40      });
 41  
 42      if (newSites.length < serpResults.length) {
 43        const skipped = serpResults.length - newSites.length;
 44        logger.info(`Skipping ${skipped} sites already in database`);
 45      }
 46  
 47      if (newSites.length === 0) {
 48        logger.info('All sites already processed. Nothing to do.');
 49        return;
 50      }
 51  
 52      logger.success(`Processing ${newSites.length} new sites`);
 53  
 54      // Step 2: Launch browser
 55      logger.info('Step 2: Launching browser...');
 56      const browser = await launchBrowser({ headless: false, slowMo: 100 });
 57      const context = await createStealthContext(browser);
 58  
 59      // Step 3: Process sites
 60      logger.info('Step 3: Processing sites...');
 61      const results = await processSiteBatch(context, newSites, keyword);
 62  
 63      // Step 4: Close browser
 64      await context.close();
 65      await browser.close();
 66      logger.info('Browser closed');
 67  
 68      // Step 5: Generate summary
 69      generateSummary(results, keyword);
 70  
 71      logger.success('POC Complete!');
 72    } catch (error) {
 73      logger.error('POC failed', error);
 74      process.exit(1);
 75    }
 76  }
 77  
 78  /**
 79   * Process a batch of sites
 80   */
 81  async function processSiteBatch(context, sites, keyword) {
 82    const results = {
 83      total: sites.length,
 84      processed: 0,
 85      succeeded: 0,
 86      failed: 0,
 87      grades: {},
 88    };
 89  
 90    // Process sites with concurrency control
 91    const { results: siteResults, errors } = await processBatch(
 92      sites,
 93      // eslint-disable-next-line require-await -- Calls async function
 94      async site => processSingleSite(context, site, keyword),
 95      {
 96        concurrency: 5, // Lower concurrency for headed browser
 97        onProgress: (completed, total) => {
 98          logger.progress(completed, total, `Processing ${completed}/${total} sites`);
 99        },
100        onError: error => {
101          logger.error('Site processing error', error);
102          results.failed++;
103        },
104      }
105    );
106  
107    results.processed = siteResults.length;
108    results.succeeded = siteResults.filter(r => r.success).length;
109    results.failed = errors.length;
110  
111    // Count grades
112    siteResults.forEach(result => {
113      if (result.grade) {
114        results.grades[result.grade] = (results.grades[result.grade] || 0) + 1;
115      }
116    });
117  
118    return results;
119  }
120  
/**
 * Process a single site: mark it as processing, capture screenshots, score
 * the capture and store the result.
 *
 * Failures inside the pipeline are logged, recorded with status 'failed', and
 * reported through the return value rather than thrown.
 *
 * @param {object} context - Browser context passed to captureScreenshots.
 * @param {{url: string}} site - Site to process.
 * @param {string} keyword - Keyword the site was found for.
 * @returns {Promise<{success: boolean, domain: string, grade?: *, score?: *, error?: string, httpError?: boolean}>}
 */
async function processSingleSite(context, site, keyword) {
  const domain = extractDomain(site.url);

  try {
    logger.info(`Processing: ${domain}`);

    // Update status to processing
    await updateSiteStatus(domain, site.url, keyword, 'processing');

    // Step 1: Capture screenshots
    const captureResults = await captureScreenshots(context, site.url, domain);

    if (captureResults.error) {
      throw new Error(`Capture failed: ${captureResults.error}`);
    }

    // Check if site returned 4xx/5xx error - set aside for later retry
    if (captureResults.httpStatusCode && captureResults.httpStatusCode >= 400) {
      const errorMsg = `HTTP ${captureResults.httpStatusCode} - Site returned error status, set aside for later retry`;
      logger.warn(errorMsg);
      await updateSiteStatus(domain, site.url, keyword, 'failed', errorMsg);

      return {
        success: false,
        domain,
        error: errorMsg,
        httpError: true,
      };
    }

    // Step 2: Score website
    const scoreResults = await scoreWebsite(captureResults);
    const grade = extractGrade(scoreResults);
    const score = extractScore(scoreResults);

    // Step 3: Store in database
    await storeSiteData(domain, site.url, keyword, captureResults, scoreResults);

    logger.success(`${domain}: ${grade} (${score})`);

    return {
      success: true,
      domain,
      grade,
      score,
    };
  } catch (error) {
    logger.error(`Failed to process ${domain}`, error);

    // Store error in database. If that write fails too, log it instead of
    // throwing, so the original error is still what the caller sees.
    try {
      await updateSiteStatus(domain, site.url, keyword, 'failed', error.message);
    } catch (statusError) {
      logger.error(`Could not record failure for ${domain}`, statusError);
    }

    return {
      success: false,
      domain,
      error: error.message,
    };
  }
}
183  
/**
 * Load every domain currently stored in the `sites` table.
 *
 * @returns {Promise<Set<*>>} Set of the `domain` values of all rows.
 */
async function getExistingDomains() {
  const known = new Set();
  for (const row of await getAll('SELECT domain FROM sites')) {
    known.add(row.domain);
  }
  return known;
}
191  
/**
 * Upsert the processing status of a site.
 *
 * Inserts a new row for `domain`; when the domain already exists only
 * `processing_status`, `error_log` and `updated_at` are overwritten.
 *
 * @param {string} domain - Conflict key of the `sites` row.
 * @param {string} url - Landing page URL (used on insert only).
 * @param {string} keyword - Keyword (used on insert only).
 * @param {string} status - New processing status.
 * @param {string|null} [errorLog=null] - Error text to store, if any.
 * @returns {Promise<void>}
 */
async function updateSiteStatus(domain, url, keyword, status, errorLog = null) {
  const upsertSql = `INSERT INTO sites (domain, landing_page_url, keyword, processing_status, error_log, created_at)
     VALUES ($1, $2, $3, $4, $5, NOW())
     ON CONFLICT (domain) DO UPDATE SET
       processing_status = $6,
       error_log = $7,
       updated_at = NOW()`;

  // $1-$5 feed the INSERT, $6-$7 repeat status/error for the UPDATE branch.
  const insertValues = [domain, url, keyword, status, errorLog];
  const updateValues = [status, errorLog];

  await run(upsertSql, [...insertValues, ...updateValues]);
}
206  
/**
 * Store complete site data (screenshots, HTML, score) and mark the site as
 * 'scored', inserting a new row or overwriting the existing one for `domain`.
 *
 * Contacts are never stored inline: when `scoreData.contact_details` is a
 * non-empty object or array, the row gets FS_SENTINEL in `contacts_json` and
 * the serialized contacts are passed to setContactsJson with the row id.
 *
 * @param {string} domain - Conflict key of the `sites` row.
 * @param {string} url - Landing page URL.
 * @param {string} keyword - Keyword the site was found for.
 * @param {object} captureData - Capture result; reads `screenshots`,
 *   `screenshotsUncropped`, `html` and `httpStatusCode`.
 * @param {object} scoreData - Scoring result; stored as JSON in full.
 * @returns {Promise<void>}
 */
async function storeSiteData(domain, url, keyword, captureData, scoreData) {
  // Extract contact details if available - never save empty results
  let contactsJson = null;
  const contacts = scoreData.contact_details;
  if (contacts && typeof contacts === 'object') {
    // Only save if it's a non-empty object or array with items
    const hasContent = Array.isArray(contacts)
      ? contacts.length > 0
      : Object.keys(contacts).length > 0;
    if (hasContent) {
      contactsJson = JSON.stringify(contacts);
    }
  }

  const scoreJson = JSON.stringify(scoreData);
  // `??` rather than `||`: a conversion score of 0 is a real value and must
  // not be stored as NULL.
  const conversionScore = scoreData?.overall_calculation?.conversion_score ?? null;

  // Values shared by the INSERT ($2-$14) and the ON CONFLICT UPDATE ($15-$27).
  const columnValues = [
    url,
    keyword,
    captureData.screenshots.desktop_above,
    captureData.screenshots.desktop_below,
    captureData.screenshots.mobile_above,
    captureData.screenshotsUncropped.desktop_above,
    captureData.screenshotsUncropped.desktop_below,
    captureData.screenshotsUncropped.mobile_above,
    captureData.html,
    captureData.httpStatusCode,
    scoreJson,
    conversionScore,
    contactsJson ? FS_SENTINEL : null,
  ];

  const result = await run(
    `INSERT INTO sites (
      domain,
      landing_page_url,
      keyword,
      screenshot_above_desktop,
      screenshot_below_desktop,
      screenshot_above_mobile,
      screenshot_above_desktop_uncropped,
      screenshot_below_desktop_uncropped,
      screenshot_above_mobile_uncropped,
      html_dom,
      http_status_code,
      conversion_score_json,
      conversion_score,
      contacts_json,
      processing_status,
      created_at,
      updated_at
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, 'scored', NOW(), NOW())
    ON CONFLICT (domain) DO UPDATE SET
      landing_page_url = $15,
      keyword = $16,
      screenshot_above_desktop = $17,
      screenshot_below_desktop = $18,
      screenshot_above_mobile = $19,
      screenshot_above_desktop_uncropped = $20,
      screenshot_below_desktop_uncropped = $21,
      screenshot_above_mobile_uncropped = $22,
      html_dom = $23,
      http_status_code = $24,
      conversion_score_json = $25,
      conversion_score = $26,
      contacts_json = $27,
      processing_status = 'scored',
      updated_at = NOW()
    RETURNING id`,
    [domain, ...columnValues, ...columnValues]
  );

  // Write contacts to filesystem (sentinel already set in DB above)
  if (contactsJson && result.lastInsertRowid) {
    setContactsJson(result.lastInsertRowid, contactsJson);
  } else if (contactsJson) {
    // No row id on the result: look the row up by domain instead.
    const siteRow = await getOne('SELECT id FROM sites WHERE domain = $1', [domain]);
    if (siteRow) setContactsJson(siteRow.id, contactsJson);
  }
}
306  
/**
 * Generate summary report: print batch totals, the grade distribution and the
 * share of low-scoring sites to the console.
 *
 * @param {{total: number, processed: number, succeeded: number, failed: number, grades: Object<string, number>}} results
 *   Tallies produced by processSiteBatch.
 * @param {string} keyword - Keyword the batch was run for.
 * @returns {void}
 */
function generateSummary(results, keyword) {
  // Share of successfully scored sites, as a one-decimal percentage. Reports
  // 0.0 when nothing succeeded so the output never contains "NaN%".
  const pctOfSucceeded = count =>
    (results.succeeded > 0 ? (count / results.succeeded) * 100 : 0).toFixed(1);

  const gradeEntries = Object.entries(results.grades);

  console.log(`\n${'='.repeat(60)}`);
  console.log('POC SUMMARY');
  console.log('='.repeat(60));
  console.log(`\nKeyword: ${keyword}`);
  console.log(`Total sites: ${results.total}`);
  console.log(`Processed: ${results.processed}`);
  console.log(`Succeeded: ${results.succeeded}`);
  console.log(`Failed: ${results.failed}`);

  console.log('\n📊 Grade Distribution:');
  // Most frequent grade first.
  [...gradeEntries]
    .sort((a, b) => b[1] - a[1])
    .forEach(([grade, count]) => {
      console.log(`   ${grade}: ${count} (${pctOfSucceeded(count)}%)`);
    });

  // Calculate low-scoring sites (every grade from C+ down to F)
  const lowGrades = ['C+', 'C', 'C-', 'D+', 'D', 'D-', 'F'];
  const lowScoreCount = gradeEntries
    .filter(([grade]) => lowGrades.includes(grade))
    .reduce((sum, [, count]) => sum + count, 0);

  const lowScorePct = pctOfSucceeded(lowScoreCount);

  console.log(`\n🎯 Low-scoring sites (C or below): ${lowScoreCount} (${lowScorePct}%)`);
  console.log(`💰 Potential market: ${lowScoreCount} sites need optimization`);

  console.log('\n📋 Next steps:');
  console.log('   1. Review results in DBeaver: Open db/sites.db');
  console.log('   2. Analyze grade distribution');
  console.log('   3. Plan MVP outreach for low scorers');

  console.log(`\n${'='.repeat(60)}\n`);
}
346  
/**
 * Parse command-line arguments and run.
 *
 * Usage: npm run poc "<keyword>" [limit]
 * The limit falls back to 10 when it is missing or not a positive integer.
 */
const args = process.argv.slice(2);

if (args.length === 0) {
  console.log('Usage: npm run poc "<keyword>" [limit]');
  console.log('Example: npm run poc "plumber seattle" 10');
  process.exit(1);
}

const keyword = args[0];
// Explicit radix, and reject zero/negative/non-numeric limits instead of
// passing them on to the scraper.
const parsedLimit = Number.parseInt(args[1], 10);
const limit = Number.isInteger(parsedLimit) && parsedLimit > 0 ? parsedLimit : 10;

// runPOC logs and handles its own failures, so the promise is not awaited here.
runPOC(keyword, limit);