/ src / utils / outreach-guard.js
outreach-guard.js
  1  /**
  2   * Outreach Reputation Guard
  3   *
  4   * In-memory sliding window that tracks repeated errors per channel.
  5   * When the same error message occurs 25+ times within 2 hours on a single
  6   * channel, that channel is halted to protect Twilio/Resend sender reputation.
  7   *
  8   * This guard sits ABOVE the opossum circuit breakers:
  9   * - opossum handles: rate limits (429/503), network failures
 10   * - This guard handles: application-level errors that repeat at volume
 11   *   (bad phone formats, business hours blocks, auth failures, etc.)
 12   *
 13   * Persistence: in-memory only. Resets on pipeline restart — the assumption
 14   * is that a restart implies the underlying issue has been fixed.
 15   *
 16   * Usage:
 17   *   import { recordOutreachError, shouldHaltChannel } from './outreach-guard.js';
 18   *
 19   *   // Before sending:
 20   *   if (shouldHaltChannel('sms')) return { skipped: true, reason: 'channel_halted' };
 21   *
 22   *   // On failure:
 23   *   recordOutreachError('sms', error.message);
 24   */
 25  
 26  const WINDOW_MS = 2 * 60 * 60 * 1000; // 2 hours
 27  const THRESHOLD = 25; // same error 25 times → halt channel
 28  
 29  // Map: "channel:errorMessage" → [{ timestamp: number }, ...]
 30  const errorWindows = new Map();
 31  
 32  // Map: channel → { error: string, count: number, halted_at: number }
 33  const haltedChannels = new Map();
 34  
 35  /**
 36   * Normalize error message for grouping.
 37   * Strips variable parts (IDs, URLs, phone numbers) so semantically identical
 38   * errors collapse to the same key.
 39   */
 40  function normalizeError(msg) {
 41    if (!msg) return 'unknown';
 42    return msg
 43      .replace(/\+\d{7,15}/g, '+PHONE') // phone numbers
 44      .replace(/https?:\/\/\S+/g, 'URL') // URLs
 45      .replace(/\b[0-9a-f-]{8,}\b/gi, 'ID') // UUIDs / hex IDs
 46      .replace(/outreach #\d+/gi, 'outreach #N') // outreach IDs
 47      .replace(/\d{4,}/g, 'N') // long numbers
 48      .slice(0, 200)
 49      .trim();
 50  }
 51  
 52  /**
 53   * Record a channel failure. If the same error has occurred THRESHOLD times
 54   * within WINDOW_MS, the channel is halted.
 55   *
 56   * @param {'sms'|'email'|'form'|'x'|'linkedin'} channel
 57   * @param {string} errorMessage
 58   */
 59  export function recordOutreachError(channel, errorMessage) {
 60    const normalized = normalizeError(errorMessage);
 61    const key = `${channel}:${normalized}`;
 62    const now = Date.now();
 63    const cutoff = now - WINDOW_MS;
 64  
 65    // Evict old entries
 66    const entries = (errorWindows.get(key) || []).filter(e => e.timestamp > cutoff);
 67    entries.push({ timestamp: now });
 68    errorWindows.set(key, entries);
 69  
 70    if (entries.length >= THRESHOLD && !haltedChannels.has(channel)) {
 71      haltedChannels.set(channel, {
 72        error: normalized,
 73        count: entries.length,
 74        halted_at: now,
 75      });
 76      // Log to console (logger not imported to keep this module lightweight)
 77      console.warn(
 78        `[OutreachGuard] ⛔ ${channel.toUpperCase()} channel halted: ` +
 79          `${entries.length}× "${normalized.slice(0, 100)}" in last 2h`
 80      );
 81    }
 82  }
 83  
 84  /**
 85   * Check if a channel is currently halted.
 86   *
 87   * @param {string} channel
 88   * @returns {boolean}
 89   */
 90  export function shouldHaltChannel(channel) {
 91    if (!haltedChannels.has(channel)) return false;
 92  
 93    const info = haltedChannels.get(channel);
 94    const key = `${channel}:${info.error}`;
 95    const now = Date.now();
 96    const cutoff = now - WINDOW_MS;
 97  
 98    // Auto-clear if the window has passed and no new occurrences
 99    const recentCount = (errorWindows.get(key) || []).filter(e => e.timestamp > cutoff).length;
100    if (recentCount < THRESHOLD) {
101      haltedChannels.delete(channel);
102      console.info(
103        `[OutreachGuard] ✅ ${channel.toUpperCase()} channel auto-cleared (window expired)`
104      );
105      return false;
106    }
107  
108    return true;
109  }
110  
/**
 * List the channels that are halted right now.
 *
 * Each recorded halt is re-checked through shouldHaltChannel, so halts whose
 * window has lapsed are cleared as a side effect and left out of the result.
 *
 * @returns {Object} Plain object mapping channel → { error, count, halted_at }
 */
export function getHaltedChannels() {
  const active = {};
  haltedChannels.forEach((info, channel) => {
    if (shouldHaltChannel(channel)) {
      active[channel] = info;
    }
  });
  return active;
}
125  
/**
 * Manually lift the halt on a channel (e.g. after fixing the underlying issue).
 *
 * Removes the channel's halt record and every error window whose key starts
 * with `${channel}:`.
 *
 * @param {string} channel
 */
export function clearHalt(channel) {
  haltedChannels.delete(channel);

  // Snapshot the matching keys first, then drop each window.
  const prefix = `${channel}:`;
  const staleKeys = [...errorWindows.keys()].filter((key) => key.startsWith(prefix));
  staleKeys.forEach((key) => {
    errorWindows.delete(key);
  });
}
140  
/**
 * Summarize error frequency (for status CLI display).
 *
 * For every channel with at least one occurrence inside the current window,
 * reports the error with the highest in-window count. On a tie the error
 * encountered first during iteration is kept.
 *
 * @returns {Array<{ channel, error, count }>}
 */
export function getChannelErrorStats() {
  const oldestAllowed = Date.now() - WINDOW_MS;
  const topByChannel = {};

  errorWindows.forEach((entries, key) => {
    const count = entries.filter((entry) => entry.timestamp > oldestAllowed).length;
    if (count === 0) return;

    // Keys are `${channel}:${error}`; split on the first colon only, since the
    // error text may itself contain colons.
    const separator = key.indexOf(':');
    const channel = key.slice(0, separator);
    const error = key.slice(separator + 1);

    const best = topByChannel[channel];
    if (!best || count > best.count) {
      topByChannel[channel] = { channel, error, count };
    }
  });

  return Object.values(topByChannel);
}