/ src / utils / error-categories.js
error-categories.js
  1  /**
  2   * Error Categorization Module
  3   * Regex-based categorization of site and outreach error messages.
  4   * Groups errors into terminal (won't succeed on retry) and retriable.
  5   */
  6  
  7  import { getAll, getOne } from './db.js';
  8  
  9  // ──────────────────────────────────────────────────────────────────────────────
 10  // Pattern tables
 11  // ──────────────────────────────────────────────────────────────────────────────
 12  
 13  const SITE_TERMINAL_PATTERNS = [
 14    { pattern: /Social media platform/i, label: 'Social media' },
 15    { pattern: /Business directory/i, label: 'Business directory' },
 16    { pattern: /Not a local business/i, label: 'Not a local business' },
 17    { pattern: /Home service franchise/i, label: 'Franchise' },
 18    { pattern: /Government/i, label: 'Government domain' },
 19    { pattern: /Education domain/i, label: 'Education domain' },
 20    { pattern: /Non-commercial/i, label: 'Non-commercial' },
 21    { pattern: /Cross-border duplicate/i, label: 'Cross-border duplicate' },
 22    { pattern: /Duplicate domain/i, label: 'Duplicate domain' },
 23    { pattern: /Country mismatch/i, label: 'Country mismatch' },
 24    { pattern: /No email template/i, label: 'No template' },
 25    { pattern: /HTTP 404.*[Cc]annot capture/i, label: 'HTTP 404 (page gone)' },
 26    { pattern: /Max recapture attempts/i, label: 'Max recapture retries' },
 27    { pattern: /Ignored: Regulated industry \(.*\)/i, label: 'Regulated Industry Exclusion' },
 28    {
 29      pattern: /Max retries \(\d+\) exceeded: No contacts or key_pages found/i,
 30      label: 'No Contacts Found',
 31    },
 32    { pattern: /Ignored: Regulated industry \(\w+\)/i, label: 'Regulated Industry Exclusion' },
 33    {
 34      pattern: /Ignored: (Regulated industry|Law firm|Legal site detected).*/i,
 35      label: 'Restricted Industry',
 36    },
 37    {
 38      pattern:
 39        /(Page|The page) (not found|does not exist|you're looking for doesn't exist|appears to be an error page|is a security verification screen|indicates an error).*/i,
 40      label: 'Page Not Found',
 41    },
 42    {
 43      pattern:
 44        /Max retries \(\d+\) exceeded: (url\.toLowerCase is not a function|HTTP \d+ - Cannot capture assets for error response)/i,
 45      label: 'Capture Processing Error',
 46    },
 47    { pattern: /The page with the name '.*' could not be found/i, label: 'Page Not Found' },
 48    {
 49      pattern:
 50        /Max retries \(\d+\) exceeded: LLM daily budget exceeded: \$\d+\.\d+ spent today \(budget: \$\d+\)/i,
 51      label: 'LLM Daily Budget Exceeded',
 52    },
 53    {
 54      pattern: /The page displays a 'no posts matched your criteria' message/i,
 55      label: 'No Posts Found',
 56    },
 57    {
 58      pattern:
 59        /Access Denied|Access to the page (is forbidden|was denied)|access denied.*HTTP Error 403/i,
 60      label: 'Access Denied',
 61    },
 62    { pattern: /False-positive error page/i, label: 'Error page (false positive)' },
 63    { pattern: /CHECK constraint failed/i, label: 'DB constraint error' },
 64    { pattern: /^(Franchise site|Social media:\s+\S+)$/i, label: 'Site Filter - Franchise/Social Media' },
 65    { pattern: /Max retries.*could not determine data type of parameter/i, label: 'Database Parameter Type Mismatch' },
 66    { pattern: /^Access forbidden$/i, label: 'HTTP 403 Forbidden' },
 67    { pattern: /^Permanent error: 410$/i, label: 'HTTP 410 Gone' },
 68  ];
 69  
 70  const SITE_RETRIABLE_PATTERNS = [
 71    { pattern: /HTTP 403.*[Cc]annot capture/i, label: 'HTTP 403 (access denied)' },
 72    { pattern: /userDataDir|launchPersistentContext/i, label: 'Browser launch conflict' },
 73    { pattern: /status code 400/i, label: 'API bad request' },
 74    { pattern: /status code 401/i, label: 'API auth error' },
 75    { pattern: /status code 429/i, label: 'Rate limited' },
 76    { pattern: /Incomplete LLM response/i, label: 'Incomplete LLM' },
 77    { pattern: /Timed out|Timeout/i, label: 'Timeout' },
 78    { pattern: /ECONNRESET|ETIMEDOUT|ENOTFOUND/i, label: 'Network error' },
 79    { pattern: /navigating and changing/i, label: 'Page navigating' },
 80    { pattern: /browser has been closed/i, label: 'Browser crashed' },
 81    { pattern: /EACCES/i, label: 'Permission denied' },
 82    { pattern: /screenshot_path is NULL/i, label: 'Missing screenshot' },
 83    { pattern: /database is locked/i, label: 'DB lock' },
 84    { pattern: /chromium-nice ENOENT/i, label: 'Chromium not found' },
 85    { pattern: /HTTP (5\d\d|null)/i, label: 'Server error' },
 86    {
 87      pattern:
 88        /Max retries \(3\) exceeded: (No contacts or key_pages found|Country code is required|Failed to parse JSON response)/i,
 89      label: 'Max Retries Extraction Failure',
 90    },
 91    {
 92      pattern:
 93        /Max retries \(\d+\) exceeded: page\.goto: (net::ERR_TOO_MANY_REDIRECTS|net::ERR_TIMED_OUT|net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH).*/i,
 94      label: 'Page Navigation Error',
 95    },
 96    {
 97      pattern: /(Website is currently not reachable|No content available for evaluation|directory)/i,
 98      label: 'Site Unreachable',
 99    },
100    {
101      pattern:
102        /Max retries \(\d+\) exceeded: page\.goto: (net::ERR_SSL_UNRECOGNIZED_NAME_ALERT|net::ERR_SSL_PROTOCOL_ERROR) at https?:/i,
103      label: 'SSL Connection Error',
104    },
105    {
106      pattern: /Max retries \(\d+\) exceeded: page\.goto: net::ERR_NAME_NOT_RESOLVED/i,
107      label: 'DNS / domain not found',
108    },
109    {
110      pattern:
111        /Max retries \(\d+\) exceeded: page\.goto: net::ERR_CERT_(DATE_INVALID|COMMON_NAME_INVALID|AUTHORITY_INVALID)/i,
112      label: 'Invalid SSL certificate',
113    },
114    {
115      pattern:
116        /Max retries \(\d+\) exceeded: page\.goto: net::ERR_(CONNECTION_REFUSED|CONNECTION_CLOSED|CONNECTION_RESET|SOCKET_NOT_CONNECTED|ADDRESS_UNREACHABLE|ABORTED)/i,
117      label: 'Connection refused / closed',
118    },
119    {
120      pattern:
121        /Max retries \(\d+\) exceeded: page\.goto: net::ERR_HTTP(2_PROTOCOL_ERROR|_RESPONSE_CODE_FAILURE)/i,
122      label: 'HTTP protocol error',
123    },
124    {
125      pattern: /Max retries \(\d+\) exceeded: page\.goto: (Download is starting|net::ERR_ABORTED)/i,
126      label: 'URL is a file download',
127    },
128    {
129      pattern: /Max retries \(\d+\) exceeded: page\.goto: Protocol error/i,
130      label: 'Browser protocol error',
131    },
132    {
133      pattern: /Max retries \(\d+\) exceeded: page\.addStyleTag:.*Content Security Policy/i,
134      label: 'CSP blocks screenshot',
135    },
136    {
137      pattern: /Max retries \(\d+\) exceeded: page\.(addStyleTag|evaluate): TypeError/i,
138      label: 'Page JS error',
139    },
140    {
141      pattern:
142        /Max retries \(3\) exceeded: page\.goto: (net::ERR_INVALID_AUTH_CREDENTIALS|Page crashed|HTTP \d{3}:)/i,
143      label: 'Page Load Failure',
144    },
145    { pattern: /Max retries \(3\) exceeded: HTTP (700|405):/i, label: 'HTTP Max Retries Exceeded' },
146    {
147      pattern: /Max retries \(\d+\) exceeded: page\.goto: net::ERR_CONNECTION_TIMED_OUT/i,
148      label: 'Connection timed out',
149    },
150    {
151      pattern: /Max retries \(\d+\) exceeded: page\.content: Target crashed/i,
152      label: 'Browser crashed (target)',
153    },
154    {
155      pattern: /Max retries \(\d+\) exceeded: Request failed with status code 5\d\d/i,
156      label: 'Server error (5xx)',
157    },
158  ];
159  
160  const OUTREACH_TERMINAL_PATTERNS = [
161    { pattern: /opted out/i, label: 'Opted out' },
162    { pattern: /Fake.*phone|invalid.*phone/i, label: 'Invalid phone' },
163    { pattern: /gdpr_blocked/i, label: 'GDPR blocked' },
164    { pattern: /Cloudflare.*blocked/i, label: 'Cloudflare blocked' },
165    { pattern: /ZeroBounce/i, label: 'Invalid email (ZeroBounce)' },
166    { pattern: /gov_blocked/i, label: 'Gov domain blocked' },
167    { pattern: /no_message_button/i, label: 'No DM button' },
168    { pattern: /'To' number: \+\d+, is not a valid mobile number/i, label: 'Invalid Phone Number' },
169    {
170      pattern: /Permission to send an SMS has not been enabled for the region.*/i,
171      label: 'SMS Region Blocked',
172    },
173    {
174      pattern: /No Twilio number configured for country \w+ and TWILIO_PHONE_NUMBER not set/i,
175      label: 'Twilio Number Configuration Missing',
176    },
177    { pattern: /'To' number: \+\d+, is not a valid mobile number/i, label: 'Invalid Phone Number' },
178    { pattern: /\+\d{10,12}X{3,4} is not a valid mobile number/i, label: 'Invalid Phone Number' },
179    {
180      pattern: /'To' number: \+\d{11}XXXX, is not a valid mobile number/i,
181      label: 'Invalid Mobile Number',
182    },
183    {
184      pattern: /'To' number cannot be a Short Code: \+\d{3}XXXX/i,
185      label: 'Invalid Short Code Number',
186    },
187    {
188      pattern: /'From' \+\d{11} is not a Twilio phone number or Short Code country mismatch/i,
189      label: 'Invalid Twilio Phone Number',
190    },
191    {
192      pattern: /'To' number: \+61\d{9}XXXX, is not a valid mobile number/i,
193      label: 'Invalid Mobile Number',
194    },
195    { pattern: /\+?\d{10,12}XXXX is not a valid mobile number/i, label: 'Invalid Phone Number' },
196    { pattern: /\+\d+\d{4}XXXX, is not a valid mobile number/i, label: 'Invalid Phone Number' },
197    {
198      pattern: /'To' number cannot be a Short Code: \+61\d{2}XXXX/i,
199      label: 'Invalid Short Code Phone Number',
200    },
201    {
202      pattern: /'To' number cannot be a Short Code: \+\d{4}XXXX/i,
203      label: 'Invalid Short Code Phone Number',
204    },
205    {
206      pattern: /'To' number cannot be a Short Code: \+\d{8,12}/i,
207      label: 'Invalid Short Code Number',
208    },
209    {
210      pattern: /'To' number cannot be a Short Code: \+\d{8,12}/i,
211      label: 'Invalid Short Code Number',
212    },
213    {
214      pattern: /'To' number cannot be a Short Code: \+61(490|472)XXXX/i,
215      label: 'Invalid Short Code Number',
216    },
217    {
218      pattern: /Invalid E\.164: starts with \+0 \(\+0\d+\)/i,
219      label: 'Invalid Phone Number Zero Prefix',
220    },
221    {
222      pattern: /Short code detected — not a mobile number \(\+\d{8}\)/i,
223      label: 'Short Code Phone Number',
224    },
225    { pattern: /Outreach #\d+ is for \w+, not \w+/i, label: 'Channel mismatch' },
226    { pattern: /Contact email is a placeholder/i, label: 'Placeholder email' },
227    { pattern: /Toll-free number — cannot receive SMS/i, label: 'Toll-free number' },
228    { pattern: /cannot be a Short Code/i, label: 'Short code number' },
229    {
230      pattern: /(placeholder|test).*(email|address).*(permanently|exclude)/i,
231      label: 'Placeholder/Test Email Permanently Excluded',
232    },
233    { pattern: /email.*on.*exclusion list/i, label: 'Email Exclusion List' },
234    { pattern: /Too short to be a real phone number \(\+\d+\)/i, label: 'Invalid Phone Number - Too Short' },
235    { pattern: /\+1\d{3}555\d{4}/i, label: 'US 555 Placeholder Number' },
236    { pattern: /^Authentication Error\s*-\s*invalid username$/i, label: 'Auth Error - Invalid Username' },
237    { pattern: /'From' phone number provided.*is not a valid message-capable Twilio phone number/i, label: 'Twilio Invalid Sender Number' },
238    { pattern: /^US 555 placeholder number/i, label: 'Test Number Detection - 555' },
239  ];
240  
241  const OUTREACH_RETRIABLE_PATTERNS = [
242    {
243      pattern: /under.?construction|coming soon|site maintenance|parked domain/i,
244      label: 'Under construction',
245    },
246    { pattern: /business hours/i, label: 'Business hours block' },
247    { pattern: /status.?code 429|rate limit/i, label: 'Rate limited' },
248    { pattern: /Timeout/i, label: 'Form timeout' },
249    { pattern: /net::ERR_NAME_NOT_RESOLVED/i, label: 'DNS failure' },
250    { pattern: /per.recipient cooldown/i, label: 'Per-recipient cooldown' },
251    { pattern: /Breaker is open/i, label: 'Circuit breaker open' },
252    { pattern: /landline/i, label: 'Landline number' },
253    {
254      pattern: /browser.*crash|Target page.*closed|browser has been closed/i,
255      label: 'Browser crash',
256    },
257    { pattern: /ECONNRESET|ETIMEDOUT|ENOTFOUND/i, label: 'Network error' },
258    {
259      pattern:
260        /Invalid Resend API response: .*"statusCode":429,"message":"You have reached your daily email sending quota"/i,
261      label: 'Daily Email Quota Exceeded',
262    },
263    {
264      pattern: /Form page failed to load.*net::ERR_NAME_NOT_RESOLVED/i,
265      label: 'Form page DNS failure',
266    },
267    {
268      pattern: /Form page failed to load/i,
269      label: 'Form page load failure',
270    },
271    {
272      pattern: /Invalid Resend API response:.+"statusCode":429,.+"daily_quota_exceeded"/i,
273      label: 'Email Sending Quota Exceeded',
274    },
275    {
276      pattern: /Invalid Resend API response:.*"statusCode"\s*:\s*429/i,
277      label: 'Resend Rate Limited',
278    },
279    { pattern: /Timed out after \d+ms/i, label: 'Request Timeout' },
280    { pattern: /zb_unknown:/i, label: 'ZeroBounce unknown (re-check pending)' },
281    {
282      pattern: /No local phone number for country \w+ — parked/i,
283      label: 'No local phone number (parked)',
284    },
285  ];
286  
287  /**
288   * Suggested retry_at offsets (in seconds) for known retriable error patterns.
289   * Used by outreach send functions to set retry_at when transitioning to retry_later.
290   *
291   * Rationale:
292   *   Rate limited (429)      → 1 hour  (most APIs reset hourly)
293   *   Form timeout            → 30 min  (transient load spike)
294   *   DNS failure             → 24 hours (DNS outage / propagation)
295   *   Business hours block    → 8 hours  (conservative; covers most time zones)
296   *   Per-recipient cooldown  → 72 hours (full cooldown window)
297   *   Browser crash           → 15 min   (process restart window)
298   *   Network error           → 1 hour   (general network recovery)
299   *   Unknown retriable       → 1 hour   (safe default)
300   */
301  export const OUTREACH_RETRY_INTERVALS = [
302    {
303      pattern: /under.?construction|coming soon|site maintenance|parked domain/i,
304      seconds: 604800,
305      label: 'Under construction',
306    }, // 7 days
307    { pattern: /status.?code 429|rate limit/i, seconds: 3600, label: 'Rate limited' },
308    { pattern: /Timeout/i, seconds: 1800, label: 'Form timeout' },
309    { pattern: /net::ERR_NAME_NOT_RESOLVED/i, seconds: 86400, label: 'DNS failure' },
310    { pattern: /business hours/i, seconds: 28800, label: 'Business hours' },
311    { pattern: /per.recipient cooldown/i, seconds: 259200, label: 'Per-recipient cooldown' },
312    {
313      pattern: /browser.*crash|Target page.*closed|browser has been closed/i,
314      seconds: 900,
315      label: 'Browser crash',
316    },
317    { pattern: /ECONNRESET|ETIMEDOUT|ENOTFOUND/i, seconds: 3600, label: 'Network error' },
318  ];
319  
320  /**
321   * Compute retry_at datetime string for a retriable error message.
322   * Returns an ISO-8601 string (UTC) for use in SQLite DATETIME fields.
323   * Falls back to 1 hour if no pattern matches.
324   *
325   * @param {string} errorMessage
326   * @returns {string} ISO datetime string
327   */
328  export function computeRetryAt(errorMessage) {
329    const msg = errorMessage || '';
330    for (const { pattern, seconds } of OUTREACH_RETRY_INTERVALS) {
331      if (pattern.test(msg)) {
332        return new Date(Date.now() + seconds * 1000).toISOString();
333      }
334    }
335    // Default: 1 hour
336    return new Date(Date.now() + 3600 * 1000).toISOString();
337  }
338  
339  /**
340   * Determine whether an outreach error is retriable (should become retry_later)
341   * or terminal (should become failed).
342   *
343   * @param {string|null} errorMessage
344   * @returns {boolean} true if error is retriable
345   */
346  export function isOutreachRetriable(errorMessage) {
347    const { group } = categorizeError(errorMessage, 'outreach');
348    return group === 'retriable';
349  }
350  
351  // markOutreachResult() was removed — it used SQLite .prepare().run() and had no active
352  // callers. Each outreach module (email.js, sms.js, form.js, linkedin.js, x.js) has an
353  // inline async markOutreachResultAsync() that uses the PG run() helper from db.js.
354  
355  // ──────────────────────────────────────────────────────────────────────────────
356  // Core categorization function
357  // ──────────────────────────────────────────────────────────────────────────────
358  
359  /**
360   * Categorize an error message into a group and label.
361   * @param {string|null} errorMessage
362   * @param {'site'|'outreach'} context
363   * @returns {{ group: 'terminal'|'retriable'|'unknown', label: string }}
364   */
365  export function categorizeError(errorMessage, context = 'site') {
366    if (!errorMessage || errorMessage.trim() === '') {
367      return { group: 'retriable', label: 'Unknown (no error stored)' };
368    }
369  
370    const terminalPatterns =
371      context === 'outreach' ? OUTREACH_TERMINAL_PATTERNS : SITE_TERMINAL_PATTERNS;
372    const retriablePatterns =
373      context === 'outreach' ? OUTREACH_RETRIABLE_PATTERNS : SITE_RETRIABLE_PATTERNS;
374  
375    for (const { pattern, label } of terminalPatterns) {
376      if (pattern.test(errorMessage)) {
377        return { group: 'terminal', label };
378      }
379    }
380  
381    for (const { pattern, label } of retriablePatterns) {
382      if (pattern.test(errorMessage)) {
383        return { group: 'retriable', label };
384      }
385    }
386  
387    return { group: 'unknown', label: 'Unknown' };
388  }
389  
390  // ──────────────────────────────────────────────────────────────────────────────
391  // Tree builders
392  // ──────────────────────────────────────────────────────────────────────────────
393  
394  /**
395   * Pipeline stage order for display
396   */
397  const SITE_STATUS_ORDER = [
398    'found',
399    'ignored',
400    'assets_captured',
401    'prog_scored',
402    'semantic_scored',
403    'vision_scored',
404    'enriched',
405    'enriched_regex',
406    'enriched_llm',
407    'proposals_drafted',
408    'outreach_partial',
409    'outreach_sent',
410    'high_score',
411    'failing',
412  ];
413  
414  const OUTREACH_STATUS_ORDER = [
415    'pending',
416    'approved',
417    'scheduled',
418    'sent',
419    'delivered',
420    'opened',
421    'clicked',
422    'replied',
423    'rework',
424    'retry_later',
425    'failed',
426    'bounced',
427    'gdpr_blocked',
428    'rejected',
429  ];
430  
431  /**
432   * Pipeline stages in order (index = pipeline depth).
433   * For a given status, all statuses at higher indexes are "downstream" —
434   * sites that passed through this status but have since advanced further.
435   */
436  const PIPELINE_STAGE_DEPTH = [
437    'found',
438    'ignored',
439    'assets_captured',
440    'prog_scored',
441    'semantic_scored',
442    'vision_scored',
443    'enriched',
444    'enriched_regex',
445    'enriched_llm',
446    'proposals_drafted',
447    'outreach_partial',
448    'outreach_sent',
449  ];
450  
451  /**
452   * Build status tree for sites with error breakdowns and throughput.
453   * @returns {Promise<Array>} status tree
454   */
455  export async function buildStatusTree() {
456    // Split into separate queries to avoid full table scan with conditional COUNTs.
457  
458    // 1. Totals by status (uses idx_sites_status)
459    const totalRows = await getAll('SELECT status, COUNT(*) AS total FROM sites GROUP BY status');
460  
461    // 2. Deltas: sites that entered this status recently AND are still here.
462    //    Uses site_status transition log joined to sites current status.
463    //    Avoids the inflation from sites.updated_at which gets bumped by updateLastOutreachAt()
464    //    and other non-status-change updates (would show 2.3x actual throughput for outreach_sent).
465    // Rewritten to avoid COUNT(DISTINCT) full JOIN on 1.3M×1.2M rows.
466    // Subquery filters site_status by time (uses idx_site_status_created_at) first,
467    // then joins the small result set to sites for current-status confirmation.
468    const delta24hRows = await getAll(
469      `SELECT recent.status, COUNT(*) AS delta
470       FROM (
471         SELECT DISTINCT site_id, status FROM site_status
472         WHERE created_at > NOW() - INTERVAL '24 hours'
473       ) recent
474       JOIN sites s ON recent.site_id = s.id AND s.status = recent.status
475       GROUP BY recent.status`
476    );
477    const delta1hRows = await getAll(
478      `SELECT recent.status, COUNT(*) AS delta
479       FROM (
480         SELECT DISTINCT site_id, status FROM site_status
481         WHERE created_at > NOW() - INTERVAL '1 hour'
482       ) recent
483       JOIN sites s ON recent.site_id = s.id AND s.status = recent.status
484       GROUP BY recent.status`
485    );
486  
487    // Merge into topLevel shape
488    const d24map = Object.fromEntries(delta24hRows.map(r => [r.status, Number(r.delta)]));
489    const d1map = Object.fromEntries(delta1hRows.map(r => [r.status, Number(r.delta)]));
490    const topLevel = totalRows.map(r => ({
491      status: r.status,
492      total: Number(r.total),
493      delta_24h: d24map[r.status] || 0,
494      delta_1h: d1map[r.status] || 0,
495    }));
496  
497    // Build a lookup of total per status for cumulative calculation
498    const totalByStatus = {};
499    for (const row of topLevel) {
500      totalByStatus[row.status] = row.total;
501    }
502  
503    // Throughput from site_status transition log (split queries)
504    const thruTotal = await getAll('SELECT status, COUNT(*) AS total FROM site_status GROUP BY status');
505    const thru24h = await getAll(
506      `SELECT status, COUNT(*) AS cnt
507       FROM site_status WHERE created_at > NOW() - INTERVAL '24 hours'
508       GROUP BY status`
509    );
510    const thru1h = await getAll(
511      `SELECT status, COUNT(*) AS cnt
512       FROM site_status WHERE created_at > NOW() - INTERVAL '1 hour'
513       GROUP BY status`
514    );
515  
516    const thru24map = Object.fromEntries(thru24h.map(r => [r.status, Number(r.cnt)]));
517    const thru1map = Object.fromEntries(thru1h.map(r => [r.status, Number(r.cnt)]));
518  
519    const throughputByStatus = {};
520    for (const row of thruTotal) {
521      throughputByStatus[row.status] = {
522        thru_24h: thru24map[row.status] || 0,
523        thru_1h: thru1map[row.status] || 0,
524      };
525    }
526  
527    // Error breakdown for failing only (ignore has 800K+ rows dominated by known categories
528    // like "Social media platform" / cross-border duplicates — not actionable in status view).
529    // Fetch raw rows and group in JS to avoid slow PG substr() GROUP BY on large table.
530    const ERR_PREFIX = 200;
531    const failingRaw = await getAll(`SELECT error_message FROM sites WHERE status = 'failing'`);
532    const failingMap = {};
533    for (const r of failingRaw) {
534      const key = (r.error_message || '').substring(0, ERR_PREFIX);
535      failingMap[key] = (failingMap[key] || 0) + 1;
536    }
537    const errorRows = Object.entries(failingMap).map(([error_message, total]) => ({
538      status: 'failing',
539      error_message,
540      total,
541    }));
542  
543    // Group error rows by status
544    const errorsByStatus = {};
545    for (const row of errorRows) {
546      if (!errorsByStatus[row.status]) errorsByStatus[row.status] = [];
547      errorsByStatus[row.status].push(row);
548    }
549  
550    // Build result in pipeline order
551    const statusMap = {};
552    for (const row of topLevel) {
553      statusMap[row.status] = row;
554    }
555  
556    const tree = [];
557    const seen = new Set();
558  
559    for (const status of SITE_STATUS_ORDER) {
560      if (statusMap[status]) {
561        seen.add(status);
562        const row = statusMap[status];
563  
564        // Cumulative = this status + all downstream pipeline stages
565        const myDepth = PIPELINE_STAGE_DEPTH.indexOf(status);
566        let cumulative = null;
567        if (myDepth !== -1) {
568          cumulative = row.total;
569          for (let d = myDepth + 1; d < PIPELINE_STAGE_DEPTH.length; d++) {
570            cumulative += totalByStatus[PIPELINE_STAGE_DEPTH[d]] || 0;
571          }
572          // Only show cumulative when it differs from total (i.e. there are downstream sites)
573          if (cumulative === row.total) cumulative = null;
574        }
575  
576        const thru = throughputByStatus[status] || { thru_24h: 0, thru_1h: 0 };
577        const entry = {
578          status,
579          total: row.total,
580          cumulative,
581          delta_24h: row.delta_24h,
582          delta_1h: row.delta_1h,
583          thru_24h: thru.thru_24h,
584          thru_1h: thru.thru_1h,
585          children: null,
586        };
587  
588        if (errorsByStatus[status]) {
589          entry.children = buildErrorChildren(errorsByStatus[status], 'site');
590        }
591  
592        tree.push(entry);
593      }
594    }
595  
596    // Any statuses not in our order list go at the end
597    for (const row of topLevel) {
598      if (!seen.has(row.status)) {
599        const thru = throughputByStatus[row.status] || { thru_24h: 0, thru_1h: 0 };
600        tree.push({
601          status: row.status,
602          total: row.total,
603          cumulative: null,
604          delta_24h: row.delta_24h,
605          delta_1h: row.delta_1h,
606          thru_24h: thru.thru_24h,
607          thru_1h: thru.thru_1h,
608          children: null,
609        });
610      }
611    }
612  
613    return tree;
614  }
615  
616  /**
617   * Build status tree for outbound messages with error breakdowns.
618   * Combines approval_status and delivery_status into a synthetic display status:
619   *   - If delivery_status is set, use it
620   *   - Otherwise, use approval_status
621   * @returns {Promise<Array>} outreach status tree
622   */
623  export async function buildOutreachTree() {
624    const STATUS_EXPR = `COALESCE(delivery_status, approval_status)`;
625    const BASE_WHERE = `direction = 'outbound'`;
626  
627    // Split queries to avoid full table scans with conditional COUNTs.
628  
629    // 1. Totals
630    const totalRows = await getAll(
631      `SELECT ${STATUS_EXPR} AS status, COUNT(*) AS total
632       FROM messages WHERE ${BASE_WHERE}
633       GROUP BY status`
634    );
635    // Use updated_at for all delta queries — tracks the most recent status change.
636    const d24Rows = await getAll(
637      `SELECT ${STATUS_EXPR} AS status, COUNT(*) AS delta
638       FROM messages
639       WHERE ${BASE_WHERE} AND updated_at > NOW() - INTERVAL '24 hours'
640       GROUP BY status`
641    );
642    const d1Rows = await getAll(
643      `SELECT ${STATUS_EXPR} AS status, COUNT(*) AS delta
644       FROM messages
645       WHERE ${BASE_WHERE} AND updated_at > NOW() - INTERVAL '1 hour'
646       GROUP BY status`
647    );
648    const d24map = Object.fromEntries(d24Rows.map(r => [r.status, Number(r.delta)]));
649    const d1map = Object.fromEntries(d1Rows.map(r => [r.status, Number(r.delta)]));
650    const topLevel = totalRows.map(r => ({
651      status: r.status,
652      total: Number(r.total),
653      delta_24h: d24map[r.status] || 0,
654      delta_1h: d1map[r.status] || 0,
655    }));
656  
657    // 2. Channel breakdown for pending, rework, approved + sent (split queries)
658    const chanStatuses = `('pending', 'rework', 'approved', 'sent', 'delivered', 'opened', 'clicked', 'replied')`;
659    const chanTotals = await getAll(
660      `SELECT ${STATUS_EXPR} AS status, contact_method, COUNT(*) AS total
661       FROM messages WHERE ${BASE_WHERE} AND ${STATUS_EXPR} IN ${chanStatuses}
662       GROUP BY status, contact_method`
663    );
664    const chanD24 = await getAll(
665      `SELECT ${STATUS_EXPR} AS status, contact_method, COUNT(*) AS delta
666       FROM messages
667       WHERE ${BASE_WHERE} AND ${STATUS_EXPR} IN ${chanStatuses}
668         AND updated_at > NOW() - INTERVAL '24 hours'
669       GROUP BY status, contact_method`
670    );
671    const chanD1 = await getAll(
672      `SELECT ${STATUS_EXPR} AS status, contact_method, COUNT(*) AS delta
673       FROM messages
674       WHERE ${BASE_WHERE} AND ${STATUS_EXPR} IN ${chanStatuses}
675         AND updated_at > NOW() - INTERVAL '1 hour'
676       GROUP BY status, contact_method`
677    );
678    const chanD24map = Object.fromEntries(
679      chanD24.map(r => [`${r.status}|${r.contact_method}`, Number(r.delta)])
680    );
681    const chanD1map = Object.fromEntries(
682      chanD1.map(r => [`${r.status}|${r.contact_method}`, Number(r.delta)])
683    );
684    const channelRows = chanTotals.map(r => ({
685      status: r.status,
686      contact_method: r.contact_method,
687      total: Number(r.total),
688      delta_24h: chanD24map[`${r.status}|${r.contact_method}`] || 0,
689      delta_1h: chanD1map[`${r.status}|${r.contact_method}`] || 0,
690    }));
691  
692    // 3. Error breakdown for failed + retry_later (split queries)
693    const errTotals = await getAll(
694      `SELECT ${STATUS_EXPR} AS status, error_message, COUNT(*) AS total
695       FROM messages
696       WHERE ${BASE_WHERE} AND ${STATUS_EXPR} IN ('failed', 'retry_later')
697       GROUP BY status, error_message`
698    );
699    // delta_24h/delta_1h per error label are not displayed — skip extra queries
700    const errorRows = errTotals.map(r => ({
701      status: r.status,
702      error_message: r.error_message,
703      total: Number(r.total),
704    }));
705  
706    // 4. retry_later: count due now vs still waiting
707    const retryDueRow = await getOne(
708      `SELECT
709         COUNT(*) AS due_now,
710         MIN(retry_at) AS next_retry_at
711       FROM messages
712       WHERE ${BASE_WHERE} AND delivery_status = 'retry_later'`
713    ) || { due_now: 0, next_retry_at: null };
714    const retryStats = {
715      due_now: Number(retryDueRow.due_now),
716      next_retry_at: retryDueRow.next_retry_at,
717    };
718  
719    // Group channel rows by status
720    const channelsByStatus = {};
721    for (const row of channelRows) {
722      if (!channelsByStatus[row.status]) channelsByStatus[row.status] = [];
723      channelsByStatus[row.status].push(row);
724    }
725  
726    const statusMap = {};
727    for (const row of topLevel) {
728      statusMap[row.status] = row;
729    }
730  
731    const tree = [];
732    const seen = new Set();
733  
734    for (const status of OUTREACH_STATUS_ORDER) {
735      if (statusMap[status]) {
736        seen.add(status);
737        const row = statusMap[status];
738        const entry = {
739          status,
740          total: row.total,
741          delta_24h: row.delta_24h,
742          delta_1h: row.delta_1h,
743          children: null,
744        };
745  
746        if (channelsByStatus[status]) {
747          // Channel breakdown (sorted by total desc)
748          entry.children = {
749            type: 'channels',
750            rows: channelsByStatus[status]
751              .sort((a, b) => b.total - a.total)
752              .map(r => ({
753                label: r.contact_method,
754                total: r.total,
755                delta_24h: r.delta_24h,
756                delta_1h: r.delta_1h,
757              })),
758          };
759        } else if (status === 'failed') {
760          entry.children = buildErrorChildren(
761            errorRows.filter(r => r.status === 'failed'),
762            'outreach'
763          );
764        } else if (status === 'retry_later') {
765          entry.retryStats = retryStats;
766          entry.children = buildErrorChildren(
767            errorRows.filter(r => r.status === 'retry_later'),
768            'outreach'
769          );
770        }
771  
772        tree.push(entry);
773      }
774    }
775  
776    // Any statuses not in our order list
777    for (const row of topLevel) {
778      if (!seen.has(row.status)) {
779        tree.push({
780          status: row.status,
781          total: row.total,
782          delta_24h: row.delta_24h,
783          delta_1h: row.delta_1h,
784          children: null,
785        });
786      }
787    }
788  
789    return tree;
790  }
791  
792  // ──────────────────────────────────────────────────────────────────────────────
793  // Helpers
794  // ──────────────────────────────────────────────────────────────────────────────
795  
796  /**
797   * Collapse and categorize error rows into retriable/terminal/unknown groups.
798   * Cross-border duplicates are folded into a single entry.
799   * @param {Array} rows - DB rows with error_message, total, delta_24h, delta_1h
800   * @param {'site'|'outreach'} context
801   * @returns {{ type: 'errors', retriable: Array, terminal: Array, unknown: Array }}
802   */
803  function buildErrorChildren(rows, context) {
804    const retriable = {};
805    const terminal = {};
806    const unknown = {};
807  
808    for (const row of rows) {
809      const { group, label } = categorizeError(row.error_message, context);
810      const bucket = group === 'terminal' ? terminal : group === 'retriable' ? retriable : unknown;
811  
812      if (!bucket[label]) {
813        bucket[label] = { label, total: 0, delta_24h: 0, delta_1h: 0 };
814      }
815      bucket[label].total += row.total;
816      bucket[label].delta_24h += row.delta_24h;
817      bucket[label].delta_1h += row.delta_1h;
818    }
819  
820    const toSortedArray = obj => Object.values(obj).sort((a, b) => b.total - a.total);
821  
822    return {
823      type: 'errors',
824      retriable: toSortedArray(retriable),
825      terminal: toSortedArray(terminal),
826      unknown: toSortedArray(unknown),
827    };
828  }
829  
830  // ──────────────────────────────────────────────────────────────────────────────
831  // Conversations tree
832  // ──────────────────────────────────────────────────────────────────────────────
833  
834  const SENTIMENT_ORDER = ['positive', 'neutral', 'negative', 'objection'];
835  
836  /**
837   * Build inbound messages summary grouped by sentiment, with intent breakdown.
838   * @returns {Promise<Array>} sentiment rows with intent children
839   */
840  export async function buildConversationsTree() {
841    const topLevel = await getAll(
842      `SELECT
843         COALESCE(sentiment, 'unknown') AS sentiment,
844         COUNT(*) AS total,
845         COUNT(CASE WHEN created_at > NOW() - INTERVAL '24 hours' THEN 1 END) AS delta_24h,
846         COUNT(CASE WHEN created_at > NOW() - INTERVAL '1 hour' THEN 1 END) AS delta_1h
847       FROM messages
848       WHERE direction = 'inbound'
849       GROUP BY sentiment`
850    );
851  
852    const intentRows = await getAll(
853      `SELECT
854         COALESCE(sentiment, 'unknown') AS sentiment,
855         COALESCE(intent, 'unknown') AS intent,
856         COUNT(*) AS total,
857         COUNT(CASE WHEN created_at > NOW() - INTERVAL '24 hours' THEN 1 END) AS delta_24h,
858         COUNT(CASE WHEN created_at > NOW() - INTERVAL '1 hour' THEN 1 END) AS delta_1h
859       FROM messages
860       WHERE direction = 'inbound'
861       GROUP BY sentiment, intent`
862    );
863  
864    // PG COUNT() returns bigint strings — coerce to Number throughout
865    const intentsBySentiment = {};
866    for (const row of intentRows) {
867      if (!intentsBySentiment[row.sentiment]) intentsBySentiment[row.sentiment] = [];
868      intentsBySentiment[row.sentiment].push({
869        ...row,
870        total: Number(row.total),
871        delta_24h: Number(row.delta_24h),
872        delta_1h: Number(row.delta_1h),
873      });
874    }
875  
876    const statusMap = {};
877    for (const row of topLevel) {
878      statusMap[row.sentiment] = {
879        ...row,
880        total: Number(row.total),
881        delta_24h: Number(row.delta_24h),
882        delta_1h: Number(row.delta_1h),
883      };
884    }
885  
886    const tree = [];
887    const seen = new Set();
888  
889    for (const sentiment of SENTIMENT_ORDER) {
890      if (statusMap[sentiment]) {
891        seen.add(sentiment);
892        const row = statusMap[sentiment];
893        const intents = (intentsBySentiment[sentiment] || [])
894          .sort((a, b) => b.total - a.total)
895          .map(r => ({
896            label: r.intent,
897            total: r.total,
898            delta_24h: r.delta_24h,
899            delta_1h: r.delta_1h,
900          }));
901        tree.push({
902          sentiment,
903          total: row.total,
904          delta_24h: row.delta_24h,
905          delta_1h: row.delta_1h,
906          intents,
907        });
908      }
909    }
910  
911    // Any sentiments not in our order list
912    for (const row of topLevel) {
913      if (!seen.has(row.sentiment)) {
914        tree.push({
915          sentiment: row.sentiment,
916          total: row.total,
917          delta_24h: row.delta_24h,
918          delta_1h: row.delta_1h,
919          intents: (intentsBySentiment[row.sentiment] || [])
920            .sort((a, b) => b.total - a.total)
921            .map(r => ({
922              label: r.intent,
923              total: r.total,
924              delta_24h: r.delta_24h,
925              delta_1h: r.delta_1h,
926            })),
927        });
928      }
929    }
930  
931    return tree;
932  }