Cradicle Explorer

/ src / utils / llm-sanitizer.js
llm-sanitizer.js
  1  /**
  2   * LLM Prompt Injection Sanitizer
  3   *
  4   * Defends against prompt injection by:
  5   * 1. Stripping dangerous HTML elements and injection markers from untrusted content
  6   * 2. Wrapping untrusted data in XML boundary tags that system prompts reference
  7   * 3. Detecting jailbreak attempts in inbound messages
  8   */
  9  
 10  import Logger from './logger.js';
 11  
 12  const logger = new Logger('LLMSanitizer');
 13  
 14  // ─── Injection marker patterns ──────────────────────────────────────────────
 15  // Fake system/instruction delimiters that attackers embed in content
 16  const INJECTION_MARKERS = [
 17    /\[SYSTEM\]/gi,
 18    /\[INST\]/gi,
 19    /\[\/INST\]/gi,
 20    /<<SYS>>/gi,
 21    /<<\/SYS>>/gi,
 22    /<\|im_start\|>/gi,
 23    /<\|im_end\|>/gi,
 24    /<\|endoftext\|>/gi,
 25    /<\|system\|>/gi,
 26    /<\/s>/gi,
 27    /\[ASSISTANT\]/gi,
 28    /\[USER\]/gi,
 29  ];
 30  
 31  // ─── HTML sanitization ─────────────────────────────────────────────────────
 32  // Remove elements that serve no analytical purpose and could carry injections
 33  const DANGEROUS_HTML_RE = [
 34    /<script\b[^>]*>[\s\S]*?<\/script>/gi,
 35    /<style\b[^>]*>[\s\S]*?<\/style>/gi,
 36    /<!--[\s\S]*?-->/g, // HTML comments can hide instructions
 37    /\sdata-prompt\s*=\s*"[^"]*"/gi,
 38    /\sdata-instruction[s]?\s*=\s*"[^"]*"/gi,
 39    /\son\w+\s*=\s*"[^"]*"/gi, // event handlers (onclick, onerror, etc.)
 40  ];
 41  
 42  /**
 43   * Strip dangerous HTML elements and injection markers from content
 44   * intended for LLM analysis. Preserves structural HTML (tags, attributes)
 45   * needed for scoring/enrichment.
 46   *
 47   * @param {string} html - Raw HTML content
 48   * @returns {string} Sanitized HTML
 49   */
 50  export function sanitizeHtmlForPrompt(html) {
 51    if (!html || typeof html !== 'string') return html || '';
 52  
 53    let sanitized = html;
 54  
 55    // Strip dangerous HTML elements
 56    for (const re of DANGEROUS_HTML_RE) {
 57      sanitized = sanitized.replace(re, '');
 58    }
 59  
 60    // Strip injection markers
 61    sanitized = stripInjectionMarkers(sanitized);
 62  
 63    return sanitized;
 64  }
 65  
 66  /**
 67   * Remove fake system/instruction delimiter tokens from text.
 68   *
 69   * @param {string} text
 70   * @returns {string}
 71   */
 72  export function stripInjectionMarkers(text) {
 73    if (!text || typeof text !== 'string') return text || '';
 74  
 75    let cleaned = text;
 76    for (const re of INJECTION_MARKERS) {
 77      cleaned = cleaned.replace(re, '');
 78    }
 79    return cleaned;
 80  }
 81  
 82  /**
 83   * Wrap untrusted content in XML boundary tags.
 84   * System prompts instruct the LLM to treat content inside these tags
 85   * as opaque data — not as instructions.
 86   *
 87   * @param {string} text - Untrusted content
 88   * @param {string} label - Content type label (e.g. 'website_html', 'prospect_reply')
 89   * @returns {string} Wrapped content
 90   */
 91  export function wrapUntrusted(text, label) {
 92    return `<untrusted_content type="${label}">\n${text}\n</untrusted_content>`;
 93  }
 94  
 95  // ─── Jailbreak detection ────────────────────────────────────────────────────
 96  
 97  const HIGH_SEVERITY_PATTERNS = [
 98    /ignore\s+(?:all\s+)?(?:previous|prior|above|earlier|your)\s+(?:instructions|prompts?|rules|context|guidelines|directions)/i,
 99    /disregard\s+(?:all|any|the|your)\s+(?:previous|prior|above|earlier)\s+(?:instructions|prompts?|rules|context|guidelines)/i,
100    /override\s+(?:your|all|the)\s+(?:instructions|rules|guidelines|prompts?)/i,
101    /you\s+are\s+now\s+(?:a|an|my|the)\b/i,
102    /from\s+now\s+on\s+you\s+(?:are|will|must|should)\b/i,
103    /new\s+(?:instructions|rules|task|objective|goal|system\s+prompt)\s*:/i,
104    /respond\s+(?:with|as\s+if)\s+(?:you\s+are|you're)\b/i,
105    /pretend\s+(?:that\s+)?you\s+(?:are|were)\b/i,
106    /\[(?:SYSTEM|INST|SYS)\]/i,
107    /<\|(?:im_start|im_end|endoftext)\|>/i,
108    /enter\s+(?:a\s+)?(?:new|different)\s+(?:mode|persona|character)/i,
109    /(?:system|admin|root)\s*:\s*(?:override|execute|run|enable)/i,
110  ];
111  
112  const MEDIUM_SEVERITY_PATTERNS = [
113    /(?:system|assistant)\s+prompt/i,
114    /what\s+(?:are|were)\s+your\s+(?:instructions|rules|guidelines|system\s+prompt)/i,
115    /repeat\s+(?:your|the)\s+(?:system|initial|original)\s+(?:prompt|message|instructions)/i,
116    /\bDAN\b.*\bDo\s+Anything\s+Now\b/i,
117    /reveal\s+(?:your|the)\s+(?:system|hidden|secret)\s+(?:prompt|instructions|message)/i,
118  ];
119  
120  /**
121   * Detect potential jailbreak/prompt injection attempts in text.
122   *
123   * @param {string} text - Message content to check
124   * @returns {{ detected: boolean, severity: 'none'|'low'|'medium'|'high', patterns: string[] }}
125   */
126  export function detectJailbreak(text) {
127    if (!text || typeof text !== 'string') {
128      return { detected: false, severity: 'none', patterns: [] };
129    }
130  
131    const matched = [];
132  
133    for (const re of HIGH_SEVERITY_PATTERNS) {
134      if (re.test(text)) {
135        matched.push(`HIGH: ${re.source}`);
136      }
137    }
138  
139    if (matched.length > 0) {
140      logger.warn(
141        `Jailbreak detected (HIGH) in message: "${text.substring(0, 80)}..." — patterns: ${matched.join(', ')}`
142      );
143      return { detected: true, severity: 'high', patterns: matched };
144    }
145  
146    for (const re of MEDIUM_SEVERITY_PATTERNS) {
147      if (re.test(text)) {
148        matched.push(`MEDIUM: ${re.source}`);
149      }
150    }
151  
152    if (matched.length > 0) {
153      logger.warn(
154        `Jailbreak detected (MEDIUM) in message: "${text.substring(0, 80)}..." — patterns: ${matched.join(', ')}`
155      );
156      return { detected: true, severity: 'medium', patterns: matched };
157    }
158  
159    return { detected: false, severity: 'none', patterns: [] };
160  }
161  
162  // ─── Output sanitization ─────────────────────────────────────────────────────
163  // Strips dangerous content from LLM-generated text before it reaches outreach
164  // emails/SMS. This is the OUTPUT path (post-LLM, pre-send).
165  
166  /**
167   * Dangerous patterns that must never appear in outreach messages.
168   * These could originate from prompt injection via scraped website content.
169   */
170  const OUTPUT_DANGEROUS_PATTERNS = [
171    // Script tags (any casing, with or without attributes)
172    /<script\b[^>]*>[\s\S]*?<\/script>/gi,
173    // Standalone script open/close tags (malformed)
174    /<\/?script[^>]*>/gi,
175    // javascript: URLs (in href, src, action, or standalone)
176    /javascript\s*:/gi,
177    // data: URLs that could execute (text/html, application/x-javascript, etc.)
178    /data\s*:\s*(?:text\/html|application\/(?:x-)?javascript)[^'"\s>]*/gi,
179    // Event handlers (onclick, onerror, onload, etc.)
180    /\bon\w+\s*=\s*["'][^"']*["']/gi,
181    // VBScript URLs (IE legacy but still a risk in some email clients)
182    /vbscript\s*:/gi,
183    // Expression CSS (IE vulnerability)
184    /expression\s*\(/gi,
185    // HTML comments that could hide content from preview but render in clients
186    /<!--[\s\S]*?-->/g,
187  ];
188  
189  /**
190   * URL allowlist for outreach messages.
191   * Only our own domains should appear as clickable links.
192   */
193  const ALLOWED_URL_DOMAINS = [
194    'auditandfix.com',
195    'www.auditandfix.com',
196  ];
197  
198  /**
199   * Match HTTP(S) URLs in text
200   */
201  const URL_RE = /https?:\/\/[^\s"'<>)}\]]+/gi;
202  
203  /**
204   * Sanitize LLM-generated text before it is used in outreach messages.
205   * Runs on the OUTPUT path (after LLM response, before email/SMS send).
206   *
207   * 1. Strips <script> tags, javascript: URLs, event handlers
208   * 2. Removes URLs that don't point to our allowed domains
209   * 3. Strips injection markers (belt-and-suspenders with input sanitization)
210   *
211   * @param {string} text - LLM-generated proposal/message text
212   * @param {string} [prospectDomain] - The prospect's own domain (allowed in references)
213   * @returns {{ sanitized: string, strippedUrls: string[], strippedPatterns: string[] }}
214   */
215  export function sanitizeLlmOutput(text, prospectDomain = null) {
216    if (typeof text !== 'string' || !text) {
217      return { sanitized: '', strippedUrls: [], strippedPatterns: [] };
218    }
219  
220    let sanitized = text;
221    const strippedPatterns = [];
222  
223    // 1. Strip dangerous HTML/JS patterns
224    for (const re of OUTPUT_DANGEROUS_PATTERNS) {
225      const matches = sanitized.match(re);
226      if (matches) {
227        for (const m of matches) {
228          strippedPatterns.push(m.substring(0, 80));
229        }
230        sanitized = sanitized.replace(re, '');
231      }
232    }
233  
234    // 2. Strip injection markers (redundant with input path, but defence-in-depth)
235    sanitized = stripInjectionMarkers(sanitized);
236  
237    // 3. Remove URLs that aren't our own domains or the prospect's domain
238    const allowedDomains = [...ALLOWED_URL_DOMAINS];
239    if (prospectDomain) {
240      // Allow the prospect's own domain and www variant
241      const cleanDomain = prospectDomain.replace(/^www\./, '');
242      allowedDomains.push(cleanDomain, `www.${cleanDomain}`);
243    }
244  
245    const strippedUrls = [];
246    sanitized = sanitized.replace(URL_RE, (url) => {
247      try {
248        const parsed = new URL(url);
249        const hostname = parsed.hostname.toLowerCase();
250        if (allowedDomains.some(d => hostname === d || hostname.endsWith(`.${d}`))) {
251          return url; // Keep allowed URLs
252        }
253        strippedUrls.push(url);
254        logger.warn(`[sanitize-output] Stripped unauthorized URL from LLM output: ${url}`);
255        return ''; // Remove unauthorized URL
256      } catch {
257        // Malformed URL — strip it
258        strippedUrls.push(url);
259        return '';
260      }
261    });
262  
263    // 4. Clean up whitespace artifacts from removals
264    sanitized = sanitized.replace(/\n{3,}/g, '\n\n').trim();
265  
266    if (strippedPatterns.length > 0) {
267      logger.warn(
268        `[sanitize-output] Stripped ${strippedPatterns.length} dangerous patterns from LLM output`
269      );
270    }
271  
272    return { sanitized, strippedUrls, strippedPatterns };
273  }
274  
275  export default { sanitizeHtmlForPrompt, stripInjectionMarkers, wrapUntrusted, detectJailbreak, sanitizeLlmOutput };