// src/utils/retry-handler.js
  1  /**
  2   * Retry Handler Utility
  3   * Centralized retry logic for all pipeline stages
  4   */
  5  
  6  import Logger from './logger.js';
  7  import { run, getOne } from './db.js';
  8  
// Module-scoped logger shared by every function in this file. It is constructed
// with the label 'RetryHandler' (presumably used by Logger to tag its output —
// confirm in ./logger.js).
const logger = new Logger('RetryHandler');
 10  
 11  // Retry limits by stage
 12  const RETRY_LIMITS = {
 13    assets: 3, // Expensive: browser automation
 14    scoring: 3, // Expensive: LLM vision API
 15    rescoring: 3, // Expensive: LLM vision API
 16    enrichment: 3, // Expensive: browser + LLM
 17    proposals: 5, // Less expensive: just LLM text
 18    serps: 5, // API calls
 19  };
 20  
 21  /**
 22   * Detect if an error originated from the circuit breaker being open.
 23   * opossum throws with message "Breaker is open" when the circuit is OPEN.
 24   */
 25  function isCircuitBreakerError(errorMessage) {
 26    return /breaker is open|circuit.*open|EOPENBREAKER/i.test(errorMessage);
 27  }
 28  
 29  /**
 30   * Detect HTTP errors that warrant a long-term retry (site may recover within weeks).
 31   * 403 = access denied (Cloudflare/geo-block may lift)
 32   * 404 = page gone (small businesses sometimes restore pages)
 33   * Returns retry interval in days, or null if not a long-term-retryable error.
 34   */
 35  function httpLongTermRetryDays(errorMessage) {
 36    if (/HTTP 403.*Cannot capture/i.test(errorMessage)) return 7;
 37    if (/HTTP 404.*Cannot capture/i.test(errorMessage)) return 7;
 38    return null;
 39  }
 40  
 41  // Max long-term HTTP retries before giving up (8 weeks at 7d each)
 42  const HTTP_MAX_LONG_RETRIES = 8;
 43  
 44  /**
 45   * Record a failure and determine if site should be marked as failing
 46   * @param {number} siteId - Site ID
 47   * @param {string} stageName - Stage name (assets, scoring, etc.)
 48   * @param {Error|string} error - Error object or message
 49   * @param {string} currentStatus - Current site status (to keep unchanged if retrying)
 50   * @returns {Promise<boolean>} True if marked as failing, false if still retrying
 51   */
 52  export async function recordFailure(siteId, stageName, error, currentStatus) {
 53    const errorMessage = error instanceof Error ? error.message : String(error);
 54  
 55    // Get current retry count
 56    const site = await getOne('SELECT retry_count FROM sites WHERE id = $1', [siteId]);
 57    const currentRetry = site?.retry_count || 0;
 58    const newRetryCount = currentRetry + 1;
 59  
 60    // Get retry limit for this stage
 61    const maxRetries = RETRY_LIMITS[stageName] || 5;
 62  
 63    if (newRetryCount >= maxRetries) {
 64      if (isCircuitBreakerError(errorMessage)) {
 65        // Circuit breaker error: keep site at current stage, retry in 1 hour.
 66        // Do NOT reset retry_count — preserve accumulated failure count so sites
 67        // that also have real errors (e.g. Incomplete LLM response) still hit the cap.
 68        await run(
 69          `UPDATE sites
 70           SET status = $1,
 71               error_message = $2,
 72               last_retry_at = NOW(),
 73               recapture_at = NOW() + INTERVAL '1 hour'
 74           WHERE id = $3`,
 75          [currentStatus, `Circuit breaker retry in 1h: ${errorMessage}`, siteId]
 76        );
 77  
 78        logger.warn(
 79          `  Site ${siteId} circuit breaker retry scheduled in 1 hour at ${stageName} stage`
 80        );
 81        return false;
 82      }
 83  
 84      // HTTP 403/404: schedule a long-term retry instead of permanent failure
 85      const retryDays = httpLongTermRetryDays(errorMessage);
 86      if (retryDays !== null) {
 87        const site2 = await getOne('SELECT recapture_count FROM sites WHERE id = $1', [siteId]);
 88        const httpRetries = (site2?.recapture_count || 0) + 1;
 89  
 90        if (httpRetries <= HTTP_MAX_LONG_RETRIES) {
 91          await run(
 92            `UPDATE sites
 93             SET status = 'found',
 94                 error_message = $1,
 95                 retry_count = 0,
 96                 recapture_count = $2,
 97                 last_retry_at = NOW(),
 98                 recapture_at = NOW() + INTERVAL '${retryDays} days'
 99             WHERE id = $3`,
100            [
101              `HTTP retry ${httpRetries}/${HTTP_MAX_LONG_RETRIES}: ${errorMessage}`,
102              httpRetries,
103              siteId,
104            ]
105          );
106          logger.warn(
107            `  Site ${siteId} HTTP error — long-term retry ${httpRetries}/${HTTP_MAX_LONG_RETRIES} in ${retryDays}d`
108          );
109          return false;
110        }
111        // Exhausted all long-term retries — fall through to permanent failing
112      }
113  
114      // Non-circuit-breaker error: mark as failing
115      await run(
116        `UPDATE sites
117         SET status = 'failing',
118             error_message = $1,
119             retry_count = $2,
120             last_retry_at = NOW()
121         WHERE id = $3`,
122        [`Max retries (${maxRetries}) exceeded: ${errorMessage}`, newRetryCount, siteId]
123      );
124  
125      logger.error(
126        `  Site ${siteId} marked as FAILING after ${newRetryCount} retries at ${stageName} stage`
127      );
128      return true;
129    } else {
130      // Keep retrying - increment counter but keep current status
131      await run(
132        `UPDATE sites
133         SET status = $1,
134             error_message = $2,
135             retry_count = $3,
136             last_retry_at = NOW()
137         WHERE id = $4`,
138        [currentStatus, errorMessage, newRetryCount, siteId]
139      );
140  
141      logger.info(
142        `  Site ${siteId} retry ${newRetryCount}/${maxRetries} at ${stageName} stage: ${errorMessage}`
143      );
144      return false;
145    }
146  }
147  
/**
 * Clear a site's retry bookkeeping after a stage completes successfully:
 * retry_count goes back to 0 and last_retry_at / error_message are set to NULL.
 * @param {number} siteId - Site ID
 * @returns {Promise<void>}
 */
export async function resetRetries(siteId) {
  const clearRetryStateSql = `UPDATE sites
     SET retry_count = 0,
         last_retry_at = NULL,
         error_message = NULL
     WHERE id = $1`;
  const bindings = [siteId];

  await run(clearRetryStateSql, bindings);
}
163  
/**
 * Fetch aggregate retry figures for monitoring.
 *
 * Sites whose status is 'ignored', 'high_score' or 'dead_letter' are excluded.
 * The returned row carries:
 *  - failing_sites: sites with status 'failing'
 *  - retrying_sites: sites with retry_count > 0 that are not 'failing'
 *  - avg_retry_count: average retry_count over sites with retry_count > 0
 *  - max_retry_count: highest retry_count
 * @returns {Promise<object>} Retry statistics
 */
export async function getRetryStats() {
  const retryStatsSql = `SELECT
       COUNT(CASE WHEN status = 'failing' THEN 1 END) AS failing_sites,
       COUNT(CASE WHEN retry_count > 0 AND status != 'failing' THEN 1 END) AS retrying_sites,
       AVG(CASE WHEN retry_count > 0 THEN retry_count END) AS avg_retry_count,
       MAX(retry_count) AS max_retry_count
     FROM sites
     WHERE status NOT IN ('ignored', 'high_score', 'dead_letter')`;

  // The query takes no bind parameters.
  return getOne(retryStatsSql, []);
}