/ src / cron / classify-unknown-errors.js
classify-unknown-errors.js
  1  /**
  2   * Classify Unknown Errors Cron Job
  3   *
  4   * Runs every 4 hours. This job itself executes two phases (1 and 3); Phase 2 is delegated:
  5   *
  6   * Phase 1 — Retry reclassification:
  7   *   Re-evaluates every failing site and failed outreach against the current
  8   *   categorizeError() patterns. Any that now match a retriable pattern are
  9   *   promoted back to the retry queue (sites → found, messages → retry_later).
 10   *   This picks up regressions fixed since the record was written, and benefits
 11   *   immediately from new patterns added to error-categories.js.
 12   *
 13   * Phase 2 — LLM classification (DELEGATED):
 14   *   Moved to claude-orchestrator.sh --type classify_errors (claude -p haiku).
 15   *   This cron job no longer makes LLM calls.
 16   *
 17   * Phase 3 — Auto-apply proposals:
 18   *   Reads pending error_pattern_proposals from DB (populated by orchestrator),
 19   *   appends them to error-categories.js, then re-runs Phase 1 with fresh patterns.
 20   *
 21   * Registered in cron_jobs table as 'classifyUnknownErrors' (every 4 hours).
 22   */
 23  
 24  import { run, getAll, withTransaction } from '../utils/db.js';
 25  import { join, dirname } from 'path';
 26  import { fileURLToPath } from 'url';
 27  import Logger from '../utils/logger.js';
 28  import { categorizeError, computeRetryAt } from '../utils/error-categories.js';
 29  import '../utils/load-env.js';
 30  
 31  const __filename = fileURLToPath(import.meta.url);
 32  const __dirname = dirname(__filename);
 33  const projectRoot = join(__dirname, '../..');
 34  
 35  const logger = new Logger('ClassifyUnknownErrors');
 36  
 37  /**
 38   * Phase 1: Re-evaluate all failing sites and failed/retry_later messages
 39   * against current categorizeError() patterns. Promote retriable ones back
 40   * to the retry queue so they get another attempt.
 41   *
 42   * Sites:     failing → found  (pipeline picks them up again)
 43   * Messages:  failed → retry_later  (outreach stage retries them)
 44   *
 45   * @returns {{ sites_retried: number, outreaches_retried: number }}
 46   */
 47  async function retryReclassified() {
 48    // Sites: re-check every failing site AND ignore sites with no error (retriable unknown)
 49    const failingSites = await getAll(
 50      `SELECT id, error_message, status FROM sites
 51       WHERE status = 'failing'
 52          OR (status = 'ignored' AND (error_message IS NULL OR error_message = ''))`
 53    );
 54  
 55    // Log NULL error_message cases before processing — helps identify root cause
 56    const nullErrSites = failingSites.filter(s => s.error_message === null || s.error_message === '');
 57    if (nullErrSites.length > 0) {
 58      logger.warn(
 59        `${nullErrSites.length} sites have NULL/empty error_message (status: ${[...new Set(nullErrSites.map(s => s.status))].join(', ')}). ` +
 60          `Site IDs: ${nullErrSites
 61            .map(s => s.id)
 62            .slice(0, 20)
 63            .join(', ')}${nullErrSites.length > 20 ? '...' : ''}`
 64      );
 65    }
 66  
 67    let sitesRetried = 0;
 68    await withTransaction(async (client) => {
 69      for (const site of failingSites) {
 70        const { group } = categorizeError(site.error_message, 'site');
 71        if (group === 'retriable') {
 72          await client.query(
 73            `UPDATE sites SET status = 'found', error_message = NULL, updated_at = CURRENT_TIMESTAMP WHERE id = $1`,
 74            [site.id]
 75          );
 76          sitesRetried++;
 77        }
 78      }
 79    });
 80  
 81    // Outreaches: re-check every failed and retry_later outreach
 82    const failedOutreaches = await getAll(
 83      `SELECT id, error_message, delivery_status FROM messages WHERE direction = 'outbound' AND delivery_status IN ('failed', 'retry_later')`
 84    );
 85  
 86    // Log NULL error_message outreach cases
 87    const nullErrOutreaches = failedOutreaches.filter(
 88      o => o.error_message === null || o.error_message === ''
 89    );
 90    if (nullErrOutreaches.length > 0) {
 91      logger.warn(
 92        `${nullErrOutreaches.length} outreaches have NULL/empty error_message with delivery_status='failed'/'retry_later'. ` +
 93          `Message IDs: ${nullErrOutreaches
 94            .map(o => o.id)
 95            .slice(0, 20)
 96            .join(', ')}${nullErrOutreaches.length > 20 ? '...' : ''}`
 97      );
 98    }
 99  
100    let outreachesRetried = 0;
101    await withTransaction(async (client) => {
102      for (const o of failedOutreaches) {
103        const { group } = categorizeError(o.error_message, 'outreach');
104        if (group === 'retriable') {
105          await client.query(
106            `UPDATE messages SET delivery_status = 'retry_later', retry_at = $1, updated_at = CURRENT_TIMESTAMP WHERE id = $2`,
107            [computeRetryAt(o.error_message), o.id]
108          );
109          outreachesRetried++;
110        }
111      }
112    });
113  
114    // Log any remaining unknowns after reclassification (helps track new unmatched patterns)
115    const remainingSiteUnknowns = failingSites.filter(
116      s => categorizeError(s.error_message, 'site').group === 'unknown'
117    );
118    if (remainingSiteUnknowns.length > 0) {
119      logger.warn(
120        `${remainingSiteUnknowns.length} failing sites still have unmatched error patterns after reclassification:`
121      );
122      const byPrefix = {};
123      for (const s of remainingSiteUnknowns) {
124        const key = (s.error_message || '').substring(0, 80);
125        byPrefix[key] = (byPrefix[key] || 0) + 1;
126      }
127      for (const [prefix, cnt] of Object.entries(byPrefix).slice(0, 10)) {
128        logger.warn(`  (${cnt}x) ${prefix}`);
129      }
130    }
131  
132    const remainingOutUnknowns = failedOutreaches.filter(
133      o => categorizeError(o.error_message, 'outreach').group === 'unknown'
134    );
135    if (remainingOutUnknowns.length > 0) {
136      logger.warn(
137        `${remainingOutUnknowns.length} failed outreaches still have unmatched error patterns after reclassification:`
138      );
139      const byPrefix = {};
140      for (const o of remainingOutUnknowns) {
141        const key = (o.error_message || '').substring(0, 80);
142        byPrefix[key] = (byPrefix[key] || 0) + 1;
143      }
144      for (const [prefix, cnt] of Object.entries(byPrefix).slice(0, 10)) {
145        logger.warn(`  (${cnt}x) ${prefix}`);
146      }
147    }
148  
149    return { sites_retried: sitesRetried, outreaches_retried: outreachesRetried };
150  }
151  
/**
 * Script piped to a fresh `node --input-type=module` process (cwd = project root)
 * after new patterns were written. A separate process is used so that
 * error-categories.js is read from disk again instead of coming from this
 * process's module cache. It prints one JSON line with the retry counters.
 *
 * Unlike Phase 1, the site query here only covers status='failing'.
 */
const PHASE3_RETRY_SCRIPT = `
import { getAll, run, withTransaction } from './src/utils/db.js';
import { categorizeError, computeRetryAt } from './src/utils/error-categories.js';
const sites = await getAll("SELECT id, error_message FROM sites WHERE status='failing'");
let sr = 0;
await withTransaction(async (client) => {
  for(const s of sites){
    if(categorizeError(s.error_message,'site').group==='retriable'){
      await client.query("UPDATE sites SET status='found', error_message=NULL, updated_at=CURRENT_TIMESTAMP WHERE id=$1",[s.id]);
      sr++;
    }
  }
});
const outreaches = await getAll("SELECT id, error_message FROM messages WHERE direction='outbound' AND delivery_status IN ('failed','retry_later')");
let or2 = 0;
await withTransaction(async (client) => {
  for(const o of outreaches){
    if(categorizeError(o.error_message,'outreach').group==='retriable'){
      await client.query("UPDATE messages SET delivery_status='retry_later', retry_at=$1, updated_at=CURRENT_TIMESTAMP WHERE id=$2",[computeRetryAt(o.error_message), o.id]);
      or2++;
    }
  }
});
console.log(JSON.stringify({sites_retried:sr,outreaches_retried:or2}));
`;

/**
 * Maps a proposal to the name of the pattern array it belongs to.
 * Anything whose context is not 'site' goes to the OUTREACH_* arrays, and
 * anything whose group_name is not 'terminal' goes to the *_RETRIABLE_* arrays.
 *
 * @param {{ context: string, group_name: string }} proposal
 * @returns {string}
 */
function targetArrayName(proposal) {
  const scope = proposal.context === 'site' ? 'SITE' : 'OUTREACH';
  const kind = proposal.group_name === 'terminal' ? 'TERMINAL' : 'RETRIABLE';
  return `${scope}_${kind}_PATTERNS`;
}

/**
 * Renders a label as a JavaScript string literal. Ordinary labels keep the
 * single-quoted form; labels containing a quote, backslash or control/line
 * separator character are emitted via JSON.stringify so the generated file
 * always stays syntactically valid.
 *
 * @param {unknown} label
 * @returns {string}
 */
function toStringLiteral(label) {
  const text = String(label);
  return /['\\\u0000-\u001f\u2028\u2029]/.test(text) ? JSON.stringify(text) : `'${text}'`;
}

/**
 * Builds the source line for one proposal, or returns null when the pattern is
 * not a non-empty, valid regular expression.
 *
 * RegExp#source is used rather than the raw text because it escapes forward
 * slashes and line terminators, which would otherwise end the `/…/i` literal early.
 *
 * @param {{ pattern: unknown, label: unknown }} proposal
 * @returns {string | null}
 */
function buildPatternEntry(proposal) {
  // An empty pattern would be written as `//i` (a comment) and would match everything.
  if (typeof proposal.pattern !== 'string' || proposal.pattern === '') return null;

  let source;
  try {
    ({ source } = new RegExp(proposal.pattern));
  } catch {
    return null;
  }
  return `  { pattern: /${source}/i, label: ${toStringLiteral(proposal.label)} },`;
}

/**
 * Inserts `entry` as the last element of `const <arrayName> = [ … ];` in `src`.
 *
 * NOTE(review): the closing bracket is found with a lazy regex, so an existing
 * entry that itself contains the text `];` outside a `[...]` class could be
 * mistaken for the end of the array.
 *
 * @param {string} src - current text of error-categories.js
 * @param {string} arrayName
 * @param {string} entry
 * @returns {string | null} updated source, or null when the array was not found
 */
function appendEntryToArray(src, arrayName, entry) {
  const closingBracket = new RegExp(
    `(const ${arrayName} = \\[(?:[^\\]]|\\[[^\\]]*\\])*?)(\\];)`
  );
  if (!closingBracket.test(src)) return null;
  // A replacer function is used so `$1`, `$&`, `$'` … inside the entry are kept literally.
  return src.replace(closingBracket, (_match, head, tail) => `${head}${entry}\n${tail}`);
}

/**
 * Phase 3: Auto-apply pending proposals to error-categories.js.
 *
 * Reads all pending rows from error_pattern_proposals, appends each valid one
 * to the matching pattern array in src/utils/error-categories.js, writes the
 * file, and marks exactly the proposals that were written as approved.
 * Proposals with an invalid pattern, or whose target array cannot be located,
 * are skipped with a warning and stay pending.
 *
 * Afterwards the retry pass is re-run in a child Node process so it uses the
 * patterns as they now exist on disk. A failure of that child process is logged
 * and reported as zero retries; it does not throw.
 *
 * NOTE(review): the file is written before the proposals are marked approved;
 * if the UPDATE fails, the same patterns would be appended again on the next run.
 *
 * @returns {Promise<{ applied: number, sites_retried: number, outreaches_retried: number }>}
 */
async function applyProposals() {
  const pending = await getAll(
    `SELECT id, pattern, label, group_name, context
     FROM error_pattern_proposals WHERE status = 'pending'`
  );

  if (pending.length === 0) return { applied: 0, sites_retried: 0, outreaches_retried: 0 };

  const fs = await import('fs');
  const categoriesPath = join(projectRoot, 'src/utils/error-categories.js');
  let src = await fs.promises.readFile(categoriesPath, 'utf8');

  // Track the IDs actually written; skipped proposals must not be approved.
  const appliedIds = [];
  for (const proposal of pending) {
    const entry = buildPatternEntry(proposal);
    if (entry === null) {
      logger.warn(`Skipping proposal #${proposal.id} — invalid regex: ${proposal.pattern}`);
      continue;
    }

    const arr = targetArrayName(proposal);
    const updated = appendEntryToArray(src, arr, entry);
    if (updated === null) {
      logger.warn(`Could not locate ${arr} in error-categories.js — skipping #${proposal.id}`);
      continue;
    }

    src = updated;
    appliedIds.push(proposal.id);
  }

  const applied = appliedIds.length;
  if (applied > 0) {
    await fs.promises.writeFile(categoriesPath, src, 'utf8');
    logger.info(`Wrote ${applied} new patterns to error-categories.js`);

    await run(
      `UPDATE error_pattern_proposals
       SET status = 'approved', reviewed_at = CURRENT_TIMESTAMP, reviewed_by = 'cron:classifyUnknownErrors'
       WHERE id = ANY($1::int[])`,
      [appliedIds]
    );
  }

  // Re-run the retry pass in a subprocess so Node re-reads error-categories.js from disk.
  const { execSync } = await import('child_process');
  try {
    const out = execSync(`node --input-type=module`, {
      input: PHASE3_RETRY_SCRIPT,
      cwd: projectRoot,
      encoding: 'utf8',
      env: { ...process.env },
      timeout: 60000,
    });
    // The counters are on the last stdout line; earlier lines may be other output.
    const result = JSON.parse(out.trim().split('\n').pop());
    logger.info(
      `Phase 3 retry (fresh patterns): ${result.sites_retried} sites → found, ${result.outreaches_retried} outreaches → retry_later`
    );
    return { applied, ...result };
  } catch (err) {
    logger.warn(`Phase 3 retry subprocess failed: ${err.message}`);
    return { applied, sites_retried: 0, outreaches_retried: 0 };
  }
}
264  
/**
 * Job entry point. Runs Phase 1 (retry reclassification) and then Phase 3
 * (auto-apply pending pattern proposals), logging a one-line summary after each.
 * Phase 2 (LLM error classification) is not performed here; it is delegated to
 * claude-orchestrator.sh --type classify_errors.
 *
 * @returns {Promise<object>} the Phase 1 counters (sites_retried, outreaches_retried)
 *   plus patterns_applied, phase3_sites_retried and phase3_outreaches_retried.
 */
export async function classifyUnknownErrors() {
  logger.info('Starting error reclassification (phases 1 + 3)...');

  // Phase 1: re-check existing failures against the patterns currently loaded.
  const phase1 = await retryReclassified();
  const { sites_retried: phase1Sites, outreaches_retried: phase1Outreaches } = phase1;
  logger.info(
    `Phase 1 retry: ${phase1Sites} sites → found, ${phase1Outreaches} outreaches → retry_later`
  );

  // Phase 3: write pending proposals (produced by the orchestrator batch) into the pattern file.
  const phase3 = await applyProposals();
  const patternsApplied = phase3.applied;
  logger.info(`Phase 3 apply: ${patternsApplied} patterns written to error-categories.js`);

  const summary = Object.assign({}, phase1, {
    patterns_applied: patternsApplied,
    phase3_sites_retried: phase3.sites_retried,
    phase3_outreaches_retried: phase3.outreaches_retried,
  });
  return summary;
}
289  
// CLI mode: when this file is the script Node was started with, run the job once,
// print the outcome and exit with 0 on success or 1 on failure.
if (process.argv[1] === fileURLToPath(import.meta.url)) {
  const runFromCommandLine = async () => {
    try {
      const result = await classifyUnknownErrors();
      console.log('Done:', result);
      process.exit(0);
    } catch (err) {
      console.error(err);
      process.exit(1);
    }
  };
  runFromCommandLine();
}