retry-handler.js
/**
 * Retry Handler Utility
 * Centralized retry logic for all pipeline stages
 */

import Logger from './logger.js';
import { run, getOne } from './db.js';

const logger = new Logger('RetryHandler');

// Retry limits by stage
const RETRY_LIMITS = {
  assets: 3, // Expensive: browser automation
  scoring: 3, // Expensive: LLM vision API
  rescoring: 3, // Expensive: LLM vision API
  enrichment: 3, // Expensive: browser + LLM
  proposals: 5, // Less expensive: just LLM text
  serps: 5, // API calls
};

/**
 * Detect if an error originated from the circuit breaker being open.
 * opossum throws with message "Breaker is open" when the circuit is OPEN.
 */
function isCircuitBreakerError(errorMessage) {
  return /breaker is open|circuit.*open|EOPENBREAKER/i.test(errorMessage);
}

/**
 * Detect HTTP errors that warrant a long-term retry (site may recover within weeks).
 * 403 = access denied (Cloudflare/geo-block may lift)
 * 404 = page gone (small businesses sometimes restore pages)
 * Returns retry interval in days, or null if not a long-term-retryable error.
 */
function httpLongTermRetryDays(errorMessage) {
  if (/HTTP 403.*Cannot capture/i.test(errorMessage)) return 7;
  if (/HTTP 404.*Cannot capture/i.test(errorMessage)) return 7;
  return null;
}

// Max long-term HTTP retries before giving up (8 weeks at 7d each)
const HTTP_MAX_LONG_RETRIES = 8;

/**
 * Record a failure and determine whether the site should be marked as failing.
 * @param {number} siteId - Site ID
 * @param {string} stageName - Stage name (assets, scoring, etc.)
 * @param {Error|string} error - Error object or message
 * @param {string} currentStatus - Current site status (kept unchanged while retrying)
 * @returns {Promise<boolean>} True if marked as failing, false if still retrying
 */
export async function recordFailure(siteId, stageName, error, currentStatus) {
  const errorMessage = error instanceof Error ? error.message : String(error);

  // Get current retry count
  const site = await getOne('SELECT retry_count FROM sites WHERE id = $1', [siteId]);
  const currentRetry = site?.retry_count || 0;
  const newRetryCount = currentRetry + 1;

  // Get retry limit for this stage
  const maxRetries = RETRY_LIMITS[stageName] || 5;

  if (newRetryCount >= maxRetries) {
    if (isCircuitBreakerError(errorMessage)) {
      // Circuit breaker error: keep site at current stage, retry in 1 hour.
      // Do NOT reset retry_count — preserve accumulated failure count so sites
      // that also have real errors (e.g. Incomplete LLM response) still hit the cap.
      await run(
        `UPDATE sites
         SET status = $1,
             error_message = $2,
             last_retry_at = NOW(),
             recapture_at = NOW() + INTERVAL '1 hour'
         WHERE id = $3`,
        [currentStatus, `Circuit breaker retry in 1h: ${errorMessage}`, siteId]
      );

      logger.warn(
        ` Site ${siteId} circuit breaker retry scheduled in 1 hour at ${stageName} stage`
      );
      return false;
    }

    // HTTP 403/404: schedule a long-term retry instead of permanent failure
    const retryDays = httpLongTermRetryDays(errorMessage);
    if (retryDays !== null) {
      const site2 = await getOne('SELECT recapture_count FROM sites WHERE id = $1', [siteId]);
      const httpRetries = (site2?.recapture_count || 0) + 1;

      if (httpRetries <= HTTP_MAX_LONG_RETRIES) {
        // Note: retryDays is interpolated into the SQL rather than bound as a
        // parameter; safe here because it comes from httpLongTermRetryDays()
        // (a hardcoded 7), never from user input.
        await run(
          `UPDATE sites
           SET status = 'found',
               error_message = $1,
               retry_count = 0,
               recapture_count = $2,
               last_retry_at = NOW(),
               recapture_at = NOW() + INTERVAL '${retryDays} days'
           WHERE id = $3`,
          [
            `HTTP retry ${httpRetries}/${HTTP_MAX_LONG_RETRIES}: ${errorMessage}`,
            httpRetries,
            siteId,
          ]
        );
        logger.warn(
          ` Site ${siteId} HTTP error — long-term retry ${httpRetries}/${HTTP_MAX_LONG_RETRIES} in ${retryDays}d`
        );
        return false;
      }
      // Exhausted all long-term retries — fall through to permanent failing
    }

    // Non-circuit-breaker error: mark as failing
    await run(
      `UPDATE sites
       SET status = 'failing',
           error_message = $1,
           retry_count = $2,
           last_retry_at = NOW()
       WHERE id = $3`,
      [`Max retries (${maxRetries}) exceeded: ${errorMessage}`, newRetryCount, siteId]
    );

    logger.error(
      ` Site ${siteId} marked as FAILING after ${newRetryCount} retries at ${stageName} stage`
    );
    return true;
  } else {
    // Keep retrying: increment counter but keep current status
    await run(
      `UPDATE sites
       SET status = $1,
           error_message = $2,
           retry_count = $3,
           last_retry_at = NOW()
       WHERE id = $4`,
      [currentStatus, errorMessage, newRetryCount, siteId]
    );

    logger.info(
      ` Site ${siteId} retry ${newRetryCount}/${maxRetries} at ${stageName} stage: ${errorMessage}`
    );
    return false;
  }
}

/**
 * Reset retry counter on successful stage completion
 * @param {number} siteId - Site ID
 * @returns {Promise<void>}
 */
export async function resetRetries(siteId) {
  await run(
    `UPDATE sites
     SET retry_count = 0,
         last_retry_at = NULL,
         error_message = NULL
     WHERE id = $1`,
    [siteId]
  );
}

/**
 * Get retry statistics for monitoring
 * @returns {Promise<object>} Retry statistics
 */
export async function getRetryStats() {
  const stats = await getOne(
    `SELECT
       COUNT(CASE WHEN status = 'failing' THEN 1 END) AS failing_sites,
       COUNT(CASE WHEN retry_count > 0 AND status != 'failing' THEN 1 END) AS retrying_sites,
       AVG(CASE WHEN retry_count > 0 THEN retry_count END) AS avg_retry_count,
       MAX(retry_count) AS max_retry_count
     FROM sites
     WHERE status NOT IN ('ignored', 'high_score', 'dead_letter')`,
    []
  );

  return stats;
}
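
/*
 * Usage sketch (illustrative only, not part of the original module). It shows
 * the intended calling contract for this utility: recordFailure() in the
 * catch path and resetRetries() on success, so retry_count only accumulates
 * across consecutive failures. The wrapper name and its `work` callback are
 * assumptions made for the example, not names from this codebase.
 */
async function runStageWithRetries(siteId, stageName, currentStatus, work) {
  try {
    await work(siteId); // the stage's actual work (capture, scoring, ...)
    await resetRetries(siteId); // success clears the failure counter
    return true;
  } catch (error) {
    // recordFailure() returns true only when the site was marked 'failing';
    // false means another attempt was scheduled (normal retry, circuit-breaker
    // hold, or long-term HTTP 403/404 retry) and a worker can pick it up later.
    await recordFailure(siteId, stageName, error, currentStatus);
    return false;
  }
}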