classify-unknown-errors.js
/**
 * Classify Unknown Errors — cron job
 *
 * Scheduled every 4 hours; registered in the cron_jobs table under the name
 * 'classifyUnknownErrors'.
 *
 * Phase 1 — Retry reclassification
 *   Every failing site and failed outreach is run through the current
 *   categorizeError() patterns again. Records that now match a retriable
 *   pattern go back to the retry queue (sites → found, messages →
 *   retry_later). This recovers records written before a regression was fixed
 *   and takes effect as soon as new patterns land in error-categories.js.
 *
 * Phase 2 — LLM classification (DELEGATED)
 *   Now performed by claude-orchestrator.sh --type classify_errors
 *   (claude -p haiku). This job makes no LLM calls.
 *
 * Phase 3 — Auto-apply proposals
 *   Pending rows in error_pattern_proposals (written by the orchestrator) are
 *   appended to error-categories.js, after which the Phase 1 retry is repeated
 *   against the fresh patterns.
 */

import { run, getAll, withTransaction } from '../utils/db.js';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import Logger from '../utils/logger.js';
import { categorizeError, computeRetryAt } from '../utils/error-categories.js';
import '../utils/load-env.js';

const logger = new Logger('ClassifyUnknownErrors');

// ES modules have no built-in __filename/__dirname; derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Two directory levels above this file.
const projectRoot = join(__dirname, '..', '..');
/**
 * Phase 1: re-check stored failures against the current categorizeError()
 * patterns and put every record that now classifies as retriable back into
 * the retry queue.
 *
 *   Sites:    failing (or ignored with a NULL/empty error_message) → found
 *   Messages: outbound failed / retry_later → retry_later with a new retry_at
 *
 * Each record is classified exactly once; that single result drives both the
 * UPDATE and the "still unmatched" report logged at the end. Site updates and
 * message updates are issued through two separate withTransaction() calls.
 *
 * @returns {Promise<{ sites_retried: number, outreaches_retried: number }>}
 *   How many sites and outbound messages were promoted.
 */
async function retryReclassified() {
  const hasNoError = (row) => row.error_message === null || row.error_message === '';

  // First 20 ids, with an ellipsis when the list was truncated.
  const idPreview = (rows) =>
    `${rows
      .map((row) => row.id)
      .slice(0, 20)
      .join(', ')}${rows.length > 20 ? '...' : ''}`;

  // Classify every row once and keep only the two groups this job acts on.
  const partition = (rows, context) => {
    const retriable = [];
    const unknown = [];
    for (const row of rows) {
      const { group } = categorizeError(row.error_message, context);
      if (group === 'retriable') {
        retriable.push(row);
      } else if (group === 'unknown') {
        unknown.push(row);
      }
    }
    return { retriable, unknown };
  };

  // Warn about rows no pattern matched, grouped by the first 80 characters of
  // the message. A Map is used because the keys are arbitrary error text: a
  // plain object would mis-count keys like "constructor" and reorder
  // integer-like keys. The ten most frequent prefixes are listed.
  const reportUnknowns = (rows, subject) => {
    if (rows.length === 0) return;
    logger.warn(
      `${rows.length} ${subject} still have unmatched error patterns after reclassification:`
    );
    const tally = new Map();
    for (const row of rows) {
      const prefix = (row.error_message || '').substring(0, 80);
      tally.set(prefix, (tally.get(prefix) || 0) + 1);
    }
    const mostFrequent = [...tally].sort(([, a], [, b]) => b - a).slice(0, 10);
    for (const [prefix, count] of mostFrequent) {
      logger.warn(` (${count}x) ${prefix}`);
    }
  };

  // ── Sites ────────────────────────────────────────────────────────────────
  // Every failing site, plus ignored sites that carry no error message.
  const failingSites = await getAll(
    `SELECT id, error_message, status FROM sites
      WHERE status = 'failing'
         OR (status = 'ignored' AND (error_message IS NULL OR error_message = ''))`
  );

  // Surface NULL/empty error_message rows before touching them — helps
  // identify where they come from.
  const sitesWithoutError = failingSites.filter(hasNoError);
  if (sitesWithoutError.length > 0) {
    const statuses = [...new Set(sitesWithoutError.map((site) => site.status))].join(', ');
    logger.warn(
      `${sitesWithoutError.length} sites have NULL/empty error_message (status: ${statuses}). ` +
        `Site IDs: ${idPreview(sitesWithoutError)}`
    );
  }

  const siteGroups = partition(failingSites, 'site');
  await withTransaction(async (client) => {
    for (const site of siteGroups.retriable) {
      await client.query(
        `UPDATE sites SET status = 'found', error_message = NULL, updated_at = CURRENT_TIMESTAMP WHERE id = $1`,
        [site.id]
      );
    }
  });

  // ── Outbound messages ────────────────────────────────────────────────────
  // NOTE(review): rows already in retry_later are selected as well, so their
  // retry_at is recomputed on every run — confirm that is intended.
  const failedOutreaches = await getAll(
    `SELECT id, error_message, delivery_status FROM messages WHERE direction = 'outbound' AND delivery_status IN ('failed', 'retry_later')`
  );

  const outreachesWithoutError = failedOutreaches.filter(hasNoError);
  if (outreachesWithoutError.length > 0) {
    logger.warn(
      `${outreachesWithoutError.length} outreaches have NULL/empty error_message with delivery_status='failed'/'retry_later'. ` +
        `Message IDs: ${idPreview(outreachesWithoutError)}`
    );
  }

  const outreachGroups = partition(failedOutreaches, 'outreach');
  await withTransaction(async (client) => {
    for (const outreach of outreachGroups.retriable) {
      await client.query(
        `UPDATE messages SET delivery_status = 'retry_later', retry_at = $1, updated_at = CURRENT_TIMESTAMP WHERE id = $2`,
        [computeRetryAt(outreach.error_message), outreach.id]
      );
    }
  });

  // Whatever is still unmatched is a candidate for a new pattern.
  reportUnknowns(siteGroups.unknown, 'failing sites');
  reportUnknowns(outreachGroups.unknown, 'failed outreaches');

  return {
    sites_retried: siteGroups.retriable.length,
    outreaches_retried: outreachGroups.retriable.length,
  };
}
/**
 * Phase 3: auto-apply pending proposals to error-categories.js.
 *
 * Reads every row of error_pattern_proposals with status 'pending', appends
 * each usable one to the matching pattern array in
 * src/utils/error-categories.js, writes the file, and marks exactly the
 * proposals that were written as approved. The retry pass is then repeated in
 * a child Node process so that the patterns are re-read from disk.
 *
 * A proposal is skipped (and stays pending) when its pattern is empty, is not
 * a string, does not compile as a regular expression, or when the target
 * array cannot be located in the file. When nothing was written, the child
 * process is not started and zero counts are returned.
 *
 * @returns {Promise<{ applied: number, sites_retried: number, outreaches_retried: number }>}
 *   Number of patterns written, and the rows promoted by the follow-up retry
 *   (both 0 if the child process fails).
 */
async function applyProposals() {
  const pending = await getAll(
    `SELECT id, pattern, label, group_name, context
       FROM error_pattern_proposals WHERE status = 'pending'`
  );

  if (pending.length === 0) return { applied: 0, sites_retried: 0, outreaches_retried: 0 };

  const { promises: fsp } = await import('fs');
  const categoriesPath = join(projectRoot, 'src/utils/error-categories.js');
  let src = await fsp.readFile(categoriesPath, 'utf8');

  // Name of the const array in error-categories.js that a proposal belongs to.
  const arrayName = ({ context, group_name: groupName }) => {
    const scope = context === 'site' ? 'SITE' : 'OUTREACH';
    const kind = groupName === 'terminal' ? 'TERMINAL' : 'RETRIABLE';
    return `${scope}_${kind}_PATTERNS`;
  };

  // Ids of the proposals actually appended. Skipped proposals can sit anywhere
  // in `pending`, so a count alone cannot identify which rows to approve.
  const appliedIds = [];

  for (const proposal of pending) {
    // An empty pattern would be emitted as `//i`, which starts a line comment
    // and corrupts the generated file.
    if (typeof proposal.pattern !== 'string' || proposal.pattern === '') {
      logger.warn(`Skipping proposal #${proposal.id} — empty or non-string pattern`);
      continue;
    }

    // Validate regex before writing
    let compiled;
    try {
      compiled = new RegExp(proposal.pattern);
    } catch {
      logger.warn(`Skipping proposal #${proposal.id} — invalid regex: ${proposal.pattern}`);
      continue;
    }

    const arr = arrayName(proposal);

    // RegExp#source escapes "/" and line terminators, so the text is safe
    // between the slashes of a regex literal. JSON.stringify yields a valid,
    // fully escaped JS string literal for the label.
    const entry = `  { pattern: /${compiled.source}/i, label: ${JSON.stringify(String(proposal.label))} },`;

    const closingBracket = new RegExp(`(const ${arr} = \\[(?:[^\\]]|\\[[^\\]]*\\])*?)(\\];)`);
    if (!closingBracket.test(src)) {
      logger.warn(`Could not locate ${arr} in error-categories.js — skipping #${proposal.id}`);
      continue;
    }

    // A replacer function is used so "$1", "$&", "$'" etc. inside the pattern
    // are inserted literally rather than expanded as replacement tokens.
    src = src.replace(closingBracket, (_match, head, tail) => `${head}${entry}\n${tail}`);
    appliedIds.push(proposal.id);
  }

  const applied = appliedIds.length;

  // Nothing was written, so the patterns on disk are unchanged and a second
  // retry pass has nothing new to match.
  if (applied === 0) return { applied: 0, sites_retried: 0, outreaches_retried: 0 };

  await fsp.writeFile(categoriesPath, src, 'utf8');
  logger.info(`Wrote ${applied} new patterns to error-categories.js`);

  // Mark applied proposals as approved
  await run(
    `UPDATE error_pattern_proposals
        SET status = 'approved', reviewed_at = CURRENT_TIMESTAMP, reviewed_by = 'cron:classifyUnknownErrors'
      WHERE id = ANY($1::int[])`,
    [appliedIds]
  );

  // Script for the child process. It repeats the Phase 1 updates, except that
  // the site query only selects status='failing'. Its last stdout line is a
  // JSON object with the two counters.
  const retryScript = `
    import { getAll, run, withTransaction } from './src/utils/db.js';
    import { categorizeError, computeRetryAt } from './src/utils/error-categories.js';
    const sites = await getAll("SELECT id, error_message FROM sites WHERE status='failing'");
    let sr = 0;
    await withTransaction(async (client) => {
      for(const s of sites){
        if(categorizeError(s.error_message,'site').group==='retriable'){
          await client.query("UPDATE sites SET status='found', error_message=NULL, updated_at=CURRENT_TIMESTAMP WHERE id=$1",[s.id]);
          sr++;
        }
      }
    });
    const outreaches = await getAll("SELECT id, error_message FROM messages WHERE direction='outbound' AND delivery_status IN ('failed','retry_later')");
    let or2 = 0;
    await withTransaction(async (client) => {
      for(const o of outreaches){
        if(categorizeError(o.error_message,'outreach').group==='retriable'){
          await client.query("UPDATE messages SET delivery_status='retry_later', retry_at=$1, updated_at=CURRENT_TIMESTAMP WHERE id=$2",[computeRetryAt(o.error_message), o.id]);
          or2++;
        }
      }
    });
    console.log(JSON.stringify({sites_retried:sr,outreaches_retried:or2}));
  `;

  // Re-run retry with freshly-loaded patterns. This process already holds the
  // old error-categories.js in its module cache, so a separate Node process is
  // used to re-read the file from disk. process.execPath runs the same Node
  // binary without going through a shell or relying on PATH.
  const { execFileSync } = await import('child_process');
  try {
    const out = execFileSync(process.execPath, ['--input-type=module'], {
      input: retryScript,
      cwd: projectRoot, // the script's relative imports resolve against the project root
      encoding: 'utf8',
      env: { ...process.env },
      timeout: 60000,
    });
    // Only the last line is parsed, so earlier stdout output from the imported
    // modules is ignored.
    const result = JSON.parse(out.trim().split('\n').pop());
    logger.info(
      `Phase 3 retry (fresh patterns): ${result.sites_retried} sites → found, ${result.outreaches_retried} outreaches → retry_later`
    );
    return { applied, ...result };
  } catch (err) {
    // Best effort: the patterns are already written and approved; the next
    // scheduled run will pick them up in Phase 1.
    logger.warn(`Phase 3 retry subprocess failed: ${err.message}`);
    return { applied, sites_retried: 0, outreaches_retried: 0 };
  }
}
/**
 * Job entry point: runs Phase 1 and then Phase 3.
 * Phase 2 (LLM error classification) is not performed here; it is delegated
 * to claude-orchestrator.sh --type classify_errors.
 *
 * @returns {Promise<{
 *   sites_retried: number,
 *   outreaches_retried: number,
 *   patterns_applied: number,
 *   phase3_sites_retried: number,
 *   phase3_outreaches_retried: number
 * }>} Phase 1 counters, followed by the Phase 3 counters.
 */
export async function classifyUnknownErrors() {
  logger.info('Starting error reclassification (phases 1 + 3)...');

  // Phase 1 — promote records that the current patterns classify as retriable.
  const phase1 = await retryReclassified();
  logger.info(
    `Phase 1 retry: ${phase1.sites_retried} sites → found, ${phase1.outreaches_retried} outreaches → retry_later`
  );

  // Phase 3 — write pending proposals (produced by the orchestrator's
  // classify_errors batch) into error-categories.js.
  const {
    applied,
    sites_retried: phase3Sites,
    outreaches_retried: phase3Outreaches,
  } = await applyProposals();
  logger.info(`Phase 3 apply: ${applied} patterns written to error-categories.js`);

  return {
    ...phase1,
    patterns_applied: applied,
    phase3_sites_retried: phase3Sites,
    phase3_outreaches_retried: phase3Outreaches,
  };
}

// Executed only when this file is the script passed to node, not when imported.
if (fileURLToPath(import.meta.url) === process.argv[1]) {
  const reportAndExit = (result) => {
    console.log('Done:', result);
    process.exit(0);
  };
  const failAndExit = (err) => {
    console.error(err);
    process.exit(1);
  };

  classifyUnknownErrors().then(reportAndExit).catch(failAndExit);
}