llm-sanitizer.js
1 /** 2 * LLM Prompt Injection Sanitizer 3 * 4 * Defends against prompt injection by: 5 * 1. Stripping dangerous HTML elements and injection markers from untrusted content 6 * 2. Wrapping untrusted data in XML boundary tags that system prompts reference 7 * 3. Detecting jailbreak attempts in inbound messages 8 */ 9 10 import Logger from './logger.js'; 11 12 const logger = new Logger('LLMSanitizer'); 13 14 // ─── Injection marker patterns ────────────────────────────────────────────── 15 // Fake system/instruction delimiters that attackers embed in content 16 const INJECTION_MARKERS = [ 17 /\[SYSTEM\]/gi, 18 /\[INST\]/gi, 19 /\[\/INST\]/gi, 20 /<<SYS>>/gi, 21 /<<\/SYS>>/gi, 22 /<\|im_start\|>/gi, 23 /<\|im_end\|>/gi, 24 /<\|endoftext\|>/gi, 25 /<\|system\|>/gi, 26 /<\/s>/gi, 27 /\[ASSISTANT\]/gi, 28 /\[USER\]/gi, 29 ]; 30 31 // ─── HTML sanitization ───────────────────────────────────────────────────── 32 // Remove elements that serve no analytical purpose and could carry injections 33 const DANGEROUS_HTML_RE = [ 34 /<script\b[^>]*>[\s\S]*?<\/script>/gi, 35 /<style\b[^>]*>[\s\S]*?<\/style>/gi, 36 /<!--[\s\S]*?-->/g, // HTML comments can hide instructions 37 /\sdata-prompt\s*=\s*"[^"]*"/gi, 38 /\sdata-instruction[s]?\s*=\s*"[^"]*"/gi, 39 /\son\w+\s*=\s*"[^"]*"/gi, // event handlers (onclick, onerror, etc.) 40 ]; 41 42 /** 43 * Strip dangerous HTML elements and injection markers from content 44 * intended for LLM analysis. Preserves structural HTML (tags, attributes) 45 * needed for scoring/enrichment. 46 * 47 * @param {string} html - Raw HTML content 48 * @returns {string} Sanitized HTML 49 */ 50 export function sanitizeHtmlForPrompt(html) { 51 if (!html || typeof html !== 'string') return html || ''; 52 53 let sanitized = html; 54 55 // Strip dangerous HTML elements 56 for (const re of DANGEROUS_HTML_RE) { 57 sanitized = sanitized.replace(re, ''); 58 } 59 60 // Strip injection markers 61 sanitized = stripInjectionMarkers(sanitized); 62 63 return sanitized; 64 } 65 66 /** 67 * Remove fake system/instruction delimiter tokens from text. 68 * 69 * @param {string} text 70 * @returns {string} 71 */ 72 export function stripInjectionMarkers(text) { 73 if (!text || typeof text !== 'string') return text || ''; 74 75 let cleaned = text; 76 for (const re of INJECTION_MARKERS) { 77 cleaned = cleaned.replace(re, ''); 78 } 79 return cleaned; 80 } 81 82 /** 83 * Wrap untrusted content in XML boundary tags. 84 * System prompts instruct the LLM to treat content inside these tags 85 * as opaque data — not as instructions. 86 * 87 * @param {string} text - Untrusted content 88 * @param {string} label - Content type label (e.g. 'website_html', 'prospect_reply') 89 * @returns {string} Wrapped content 90 */ 91 export function wrapUntrusted(text, label) { 92 return `<untrusted_content type="${label}">\n${text}\n</untrusted_content>`; 93 } 94 95 // ─── Jailbreak detection ──────────────────────────────────────────────────── 96 97 const HIGH_SEVERITY_PATTERNS = [ 98 /ignore\s+(?:all\s+)?(?:previous|prior|above|earlier|your)\s+(?:instructions|prompts?|rules|context|guidelines|directions)/i, 99 /disregard\s+(?:all|any|the|your)\s+(?:previous|prior|above|earlier)\s+(?:instructions|prompts?|rules|context|guidelines)/i, 100 /override\s+(?:your|all|the)\s+(?:instructions|rules|guidelines|prompts?)/i, 101 /you\s+are\s+now\s+(?:a|an|my|the)\b/i, 102 /from\s+now\s+on\s+you\s+(?:are|will|must|should)\b/i, 103 /new\s+(?:instructions|rules|task|objective|goal|system\s+prompt)\s*:/i, 104 /respond\s+(?:with|as\s+if)\s+(?:you\s+are|you're)\b/i, 105 /pretend\s+(?:that\s+)?you\s+(?:are|were)\b/i, 106 /\[(?:SYSTEM|INST|SYS)\]/i, 107 /<\|(?:im_start|im_end|endoftext)\|>/i, 108 /enter\s+(?:a\s+)?(?:new|different)\s+(?:mode|persona|character)/i, 109 /(?:system|admin|root)\s*:\s*(?:override|execute|run|enable)/i, 110 ]; 111 112 const MEDIUM_SEVERITY_PATTERNS = [ 113 /(?:system|assistant)\s+prompt/i, 114 /what\s+(?:are|were)\s+your\s+(?:instructions|rules|guidelines|system\s+prompt)/i, 115 /repeat\s+(?:your|the)\s+(?:system|initial|original)\s+(?:prompt|message|instructions)/i, 116 /\bDAN\b.*\bDo\s+Anything\s+Now\b/i, 117 /reveal\s+(?:your|the)\s+(?:system|hidden|secret)\s+(?:prompt|instructions|message)/i, 118 ]; 119 120 /** 121 * Detect potential jailbreak/prompt injection attempts in text. 122 * 123 * @param {string} text - Message content to check 124 * @returns {{ detected: boolean, severity: 'none'|'low'|'medium'|'high', patterns: string[] }} 125 */ 126 export function detectJailbreak(text) { 127 if (!text || typeof text !== 'string') { 128 return { detected: false, severity: 'none', patterns: [] }; 129 } 130 131 const matched = []; 132 133 for (const re of HIGH_SEVERITY_PATTERNS) { 134 if (re.test(text)) { 135 matched.push(`HIGH: ${re.source}`); 136 } 137 } 138 139 if (matched.length > 0) { 140 logger.warn( 141 `Jailbreak detected (HIGH) in message: "${text.substring(0, 80)}..." — patterns: ${matched.join(', ')}` 142 ); 143 return { detected: true, severity: 'high', patterns: matched }; 144 } 145 146 for (const re of MEDIUM_SEVERITY_PATTERNS) { 147 if (re.test(text)) { 148 matched.push(`MEDIUM: ${re.source}`); 149 } 150 } 151 152 if (matched.length > 0) { 153 logger.warn( 154 `Jailbreak detected (MEDIUM) in message: "${text.substring(0, 80)}..." — patterns: ${matched.join(', ')}` 155 ); 156 return { detected: true, severity: 'medium', patterns: matched }; 157 } 158 159 return { detected: false, severity: 'none', patterns: [] }; 160 } 161 162 // ─── Output sanitization ───────────────────────────────────────────────────── 163 // Strips dangerous content from LLM-generated text before it reaches outreach 164 // emails/SMS. This is the OUTPUT path (post-LLM, pre-send). 165 166 /** 167 * Dangerous patterns that must never appear in outreach messages. 168 * These could originate from prompt injection via scraped website content. 169 */ 170 const OUTPUT_DANGEROUS_PATTERNS = [ 171 // Script tags (any casing, with or without attributes) 172 /<script\b[^>]*>[\s\S]*?<\/script>/gi, 173 // Standalone script open/close tags (malformed) 174 /<\/?script[^>]*>/gi, 175 // javascript: URLs (in href, src, action, or standalone) 176 /javascript\s*:/gi, 177 // data: URLs that could execute (text/html, application/x-javascript, etc.) 178 /data\s*:\s*(?:text\/html|application\/(?:x-)?javascript)[^'"\s>]*/gi, 179 // Event handlers (onclick, onerror, onload, etc.) 180 /\bon\w+\s*=\s*["'][^"']*["']/gi, 181 // VBScript URLs (IE legacy but still a risk in some email clients) 182 /vbscript\s*:/gi, 183 // Expression CSS (IE vulnerability) 184 /expression\s*\(/gi, 185 // HTML comments that could hide content from preview but render in clients 186 /<!--[\s\S]*?-->/g, 187 ]; 188 189 /** 190 * URL allowlist for outreach messages. 191 * Only our own domains should appear as clickable links. 192 */ 193 const ALLOWED_URL_DOMAINS = [ 194 'auditandfix.com', 195 'www.auditandfix.com', 196 ]; 197 198 /** 199 * Match HTTP(S) URLs in text 200 */ 201 const URL_RE = /https?:\/\/[^\s"'<>)}\]]+/gi; 202 203 /** 204 * Sanitize LLM-generated text before it is used in outreach messages. 205 * Runs on the OUTPUT path (after LLM response, before email/SMS send). 206 * 207 * 1. Strips <script> tags, javascript: URLs, event handlers 208 * 2. Removes URLs that don't point to our allowed domains 209 * 3. Strips injection markers (belt-and-suspenders with input sanitization) 210 * 211 * @param {string} text - LLM-generated proposal/message text 212 * @param {string} [prospectDomain] - The prospect's own domain (allowed in references) 213 * @returns {{ sanitized: string, strippedUrls: string[], strippedPatterns: string[] }} 214 */ 215 export function sanitizeLlmOutput(text, prospectDomain = null) { 216 if (typeof text !== 'string' || !text) { 217 return { sanitized: '', strippedUrls: [], strippedPatterns: [] }; 218 } 219 220 let sanitized = text; 221 const strippedPatterns = []; 222 223 // 1. Strip dangerous HTML/JS patterns 224 for (const re of OUTPUT_DANGEROUS_PATTERNS) { 225 const matches = sanitized.match(re); 226 if (matches) { 227 for (const m of matches) { 228 strippedPatterns.push(m.substring(0, 80)); 229 } 230 sanitized = sanitized.replace(re, ''); 231 } 232 } 233 234 // 2. Strip injection markers (redundant with input path, but defence-in-depth) 235 sanitized = stripInjectionMarkers(sanitized); 236 237 // 3. Remove URLs that aren't our own domains or the prospect's domain 238 const allowedDomains = [...ALLOWED_URL_DOMAINS]; 239 if (prospectDomain) { 240 // Allow the prospect's own domain and www variant 241 const cleanDomain = prospectDomain.replace(/^www\./, ''); 242 allowedDomains.push(cleanDomain, `www.${cleanDomain}`); 243 } 244 245 const strippedUrls = []; 246 sanitized = sanitized.replace(URL_RE, (url) => { 247 try { 248 const parsed = new URL(url); 249 const hostname = parsed.hostname.toLowerCase(); 250 if (allowedDomains.some(d => hostname === d || hostname.endsWith(`.${d}`))) { 251 return url; // Keep allowed URLs 252 } 253 strippedUrls.push(url); 254 logger.warn(`[sanitize-output] Stripped unauthorized URL from LLM output: ${url}`); 255 return ''; // Remove unauthorized URL 256 } catch { 257 // Malformed URL — strip it 258 strippedUrls.push(url); 259 return ''; 260 } 261 }); 262 263 // 4. Clean up whitespace artifacts from removals 264 sanitized = sanitized.replace(/\n{3,}/g, '\n\n').trim(); 265 266 if (strippedPatterns.length > 0) { 267 logger.warn( 268 `[sanitize-output] Stripped ${strippedPatterns.length} dangerous patterns from LLM output` 269 ); 270 } 271 272 return { sanitized, strippedUrls, strippedPatterns }; 273 } 274 275 export default { sanitizeHtmlForPrompt, stripInjectionMarkers, wrapUntrusted, detectJailbreak, sanitizeLlmOutput };