outreach-guard.js
/**
 * Outreach Reputation Guard
 *
 * In-memory sliding window that tracks repeated errors per channel.
 * When the same error message occurs 25+ times within 2 hours on a single
 * channel, that channel is halted to protect Twilio/Resend sender reputation.
 *
 * This guard sits ABOVE the opossum circuit breakers:
 * - opossum handles: rate limits (429/503), network failures
 * - This guard handles: application-level errors that repeat at volume
 *   (bad phone formats, business hours blocks, auth failures, etc.)
 *
 * Persistence: in-memory only. Resets on pipeline restart — the assumption
 * is that a restart implies the underlying issue has been fixed.
 *
 * Memory: windows whose entries have all aged out are swept periodically so
 * that one-off error strings do not accumulate in a long-running process.
 *
 * Usage:
 *   import { recordOutreachError, shouldHaltChannel } from './outreach-guard.js';
 *
 *   // Before sending:
 *   if (shouldHaltChannel('sms')) return { skipped: true, reason: 'channel_halted' };
 *
 *   // On failure:
 *   recordOutreachError('sms', error.message);
 */

const WINDOW_MS = 2 * 60 * 60 * 1000; // 2 hours
const THRESHOLD = 25; // same error 25 times → halt channel
const PRUNE_INTERVAL_MS = 5 * 60 * 1000; // sweep expired windows at most this often

// Map: "channel:errorMessage" → [{ timestamp: number }, ...]
const errorWindows = new Map();

// Map: channel → { error: string, count: number, halted_at: number }
const haltedChannels = new Map();

// Date.now() value of the last expired-window sweep (0 = never swept).
let lastPruneAt = 0;

/**
 * Normalize error message for grouping.
 * Strips variable parts (IDs, URLs, phone numbers) so semantically identical
 * errors collapse to the same key.
 *
 * Non-string input is coerced rather than rejected (an Error contributes its
 * `.message`), because this runs inside failure handlers and must not throw.
 *
 * @param {*} msg Raw error message (normally a string).
 * @returns {string} Normalized message, or 'unknown' when nothing usable remains.
 */
function normalizeError(msg) {
  if (!msg) return 'unknown';

  let text;
  if (typeof msg === 'string') {
    text = msg;
  } else if (typeof msg.message === 'string') {
    text = msg.message;
  } else {
    text = String(msg);
  }

  const normalized = text
    .replace(/\+\d{7,15}/g, '+PHONE') // phone numbers
    .replace(/https?:\/\/\S+/g, 'URL') // URLs
    // Outreach IDs must be handled before the generic hex-ID rule; otherwise
    // IDs of 8+ digits become "outreach #ID" while shorter ones become
    // "outreach #N", splitting one error across two grouping keys.
    .replace(/outreach #\d+/gi, 'outreach #N') // outreach IDs
    .replace(/\b[0-9a-f-]{8,}\b/gi, 'ID') // UUIDs / hex IDs
    .replace(/\d{4,}/g, 'N') // long numbers
    .slice(0, 200)
    .trim();

  // A whitespace-only message would otherwise produce an empty grouping key.
  return normalized || 'unknown';
}

/**
 * Count the entries stored under `key` that are newer than `cutoff`.
 *
 * @param {string} key "channel:normalizedError" window key.
 * @param {number} cutoff Epoch ms; entries at or before this are ignored.
 * @returns {number}
 */
function countRecent(key, cutoff) {
  const entries = errorWindows.get(key);
  if (!entries) return 0;
  return entries.filter((e) => e.timestamp > cutoff).length;
}

/**
 * Drop expired entries from every window and delete windows left empty.
 *
 * @param {number} cutoff Epoch ms; entries at or before this are discarded.
 */
function pruneExpiredWindows(cutoff) {
  for (const [key, entries] of errorWindows) {
    const recent = entries.filter((e) => e.timestamp > cutoff);
    if (recent.length === 0) {
      errorWindows.delete(key);
    } else if (recent.length !== entries.length) {
      errorWindows.set(key, recent);
    }
  }
}

/**
 * Mark a channel as halted and log the reason.
 *
 * @param {string} channel
 * @param {string} error Normalized error message that tripped the guard.
 * @param {number} count Occurrences of `error` inside the current window.
 * @param {number} now Epoch ms recorded as `halted_at`.
 */
function haltChannel(channel, error, count, now) {
  haltedChannels.set(channel, { error, count, halted_at: now });
  // Log to console (logger not imported to keep this module lightweight)
  console.warn(
    `[OutreachGuard] ⛔ ${channel.toUpperCase()} channel halted: ` +
      `${count}× "${error.slice(0, 100)}" in last 2h`
  );
}

/**
 * Record a channel failure. If the same error has occurred THRESHOLD times
 * within WINDOW_MS, the channel is halted.
 *
 * @param {'sms'|'email'|'form'|'x'|'linkedin'} channel
 * @param {string} errorMessage
 */
export function recordOutreachError(channel, errorMessage) {
  const normalized = normalizeError(errorMessage);
  const key = `${channel}:${normalized}`;
  const now = Date.now();
  const cutoff = now - WINDOW_MS;

  // Throttled sweep of all windows. Without it, a key that is never recorded
  // again keeps its entries forever. The `now < lastPruneAt` arm re-arms the
  // sweep if the wall clock is moved backwards.
  if (now < lastPruneAt || now - lastPruneAt >= PRUNE_INTERVAL_MS) {
    pruneExpiredWindows(cutoff);
    lastPruneAt = now;
  }

  // Evict old entries for this key, then append the new occurrence
  const entries = (errorWindows.get(key) || []).filter((e) => e.timestamp > cutoff);
  entries.push({ timestamp: now });
  errorWindows.set(key, entries);

  if (entries.length >= THRESHOLD && !haltedChannels.has(channel)) {
    haltChannel(channel, normalized, entries.length, now);
  }
}

/**
 * Check if a channel is currently halted.
 *
 * A halt is lifted only when NO error on the channel is at or above THRESHOLD
 * inside the window. If the error that originally tripped the guard has aged
 * out but a different error has since crossed the threshold, the halt is
 * re-pointed at that error instead of being cleared.
 *
 * @param {string} channel
 * @returns {boolean}
 */
export function shouldHaltChannel(channel) {
  const info = haltedChannels.get(channel);
  if (info === undefined) return false;

  const now = Date.now();
  const cutoff = now - WINDOW_MS;

  if (countRecent(`${channel}:${info.error}`, cutoff) >= THRESHOLD) {
    return true;
  }

  // The recorded error has aged out. Errors recorded while the channel was
  // already halted never updated the halt record, so check them before clearing.
  const prefix = `${channel}:`;
  for (const [key, entries] of errorWindows) {
    if (!key.startsWith(prefix)) continue;
    const recentCount = entries.filter((e) => e.timestamp > cutoff).length;
    if (recentCount >= THRESHOLD) {
      haltChannel(channel, key.slice(prefix.length), recentCount, now);
      return true;
    }
  }

  // Auto-clear: the window has passed and nothing else is over the threshold
  haltedChannels.delete(channel);
  console.info(
    `[OutreachGuard] ✅ ${channel.toUpperCase()} channel auto-cleared (window expired)`
  );
  return false;
}

/**
 * Get all currently halted channels.
 *
 * Calls shouldHaltChannel() for each recorded halt, so expired halts are
 * auto-cleared (and logged) as a side effect.
 *
 * @returns {Object} Map of channel → { error, count, halted_at }
 */
export function getHaltedChannels() {
  const result = {};
  // Iterate a snapshot of the keys: shouldHaltChannel() may delete or replace
  // records while we walk them.
  for (const channel of [...haltedChannels.keys()]) {
    if (shouldHaltChannel(channel)) {
      // Re-read the record; it may have been re-pointed at a newer error.
      result[channel] = haltedChannels.get(channel);
    }
  }
  return result;
}

/**
 * Manually clear a halted channel (e.g. after fixing the underlying issue).
 *
 * @param {string} channel
 */
export function clearHalt(channel) {
  haltedChannels.delete(channel);
  // Also clear error window entries for this channel
  const prefix = `${channel}:`;
  for (const key of errorWindows.keys()) {
    if (key.startsWith(prefix)) {
      errorWindows.delete(key);
    }
  }
}

/**
 * Get error frequency stats (for status CLI display).
 * Returns the top error per channel within the current window.
 *
 * @returns {Array<{ channel, error, count }>}
 */
export function getChannelErrorStats() {
  const cutoff = Date.now() - WINDOW_MS;
  // Map rather than a plain object so arbitrary channel names are safe keys.
  const topByChannel = new Map();

  for (const [key, entries] of errorWindows) {
    const count = entries.filter((e) => e.timestamp > cutoff).length;
    if (count === 0) continue;

    // Split on the first colon only: the error text may itself contain colons.
    const colonIdx = key.indexOf(':');
    const channel = key.slice(0, colonIdx);
    const error = key.slice(colonIdx + 1);

    const best = topByChannel.get(channel);
    if (best === undefined || count > best.count) {
      topByChannel.set(channel, { channel, error, count });
    }
  }

  return [...topByChannel.values()];
}