// untrusted-content.ts
import type { AppSettings, Message, MessageToolEvent } from '@/types'

/**
 * Heuristic signatures of prompt-injection attempts in untrusted text.
 *
 * Each entry pairs a stable `code` with a case-insensitive pattern and a
 * human-readable `note`; a finding is reported as `"<code>: <note>"`.
 *
 * Words inside a phrase are separated with `\s+` rather than a literal space so
 * that line-wrapped, tab-separated or double-spaced text (common in fetched web
 * content) cannot slip past the check.
 *
 * The patterns must not carry the `g` or `y` flag: they are shared across calls
 * and `RegExp.prototype.test` becomes stateful (via `lastIndex`) with those flags.
 */
const INJECTION_PATTERNS: Array<{ code: string; re: RegExp; note: string }> = [
  {
    code: 'ignore_instructions',
    re: /\bignore\s+(?:(?:all|any|the)\s+)?(?:previous|prior|above|system|developer)\s+instructions\b/i,
    note: 'tries to override existing instructions',
  },
  {
    code: 'reveal_prompt',
    re: /\b(?:reveal|show|print|dump)\b[\s\S]{0,40}\b(?:system|developer|hidden)\s+prompt\b/i,
    note: 'asks for hidden prompt data',
  },
  {
    code: 'credential_theft',
    re: /\b(?:api\s+key|token|password|secret|credential)s?\b[\s\S]{0,40}\b(?:send|share|reveal|print|dump|exfiltrat)/i,
    note: 'asks for secrets or credentials',
  },
  {
    code: 'tool_override',
    re: /\b(?:call|use|run)\b[\s\S]{0,40}\b(?:shell|terminal|browser|http_request|web_fetch|connector_message_tool)\b[\s\S]{0,40}\b(?:without|ignore)\b/i,
    note: 'tries to direct tool use by bypassing policy',
  },
  {
    code: 'workflow_override',
    re: /\b(?:act\s+as|pretend\s+to\s+be)\b[\s\S]{0,40}\b(?:system|developer|administrator|operator)\b/i,
    note: 'tries to impersonate a higher-priority role',
  },
]

/** Tool names (compared trimmed and lower-cased) whose output is inspected by `guardUntrustedToolEvents`. */
const WEB_TOOL_NAMES = new Set(['browser', 'web_search', 'web_fetch', 'http_request'])

/**
 * Coerces an arbitrary value into a guard mode.
 *
 * Strings are trimmed and lower-cased; only `'off'` and `'block'` are accepted
 * as-is. Every other value (including non-strings) yields `'warn'`.
 */
function normalizeMode(value: unknown): 'off' | 'warn' | 'block' {
  const normalized = typeof value === 'string' ? value.trim().toLowerCase() : ''
  if (normalized === 'off' || normalized === 'block') return normalized
  return 'warn'
}

/** Joins at most the first two findings with `'; '` for use in a one-line label. */
function summarizeFindings(findings: string[]): string {
  return findings.slice(0, 2).join('; ')
}

/**
 * Reads the guard mode from `settings.untrustedContentGuardMode`.
 *
 * @param settings - Application settings; may be partial, `null` or omitted.
 * @returns The normalized mode; `'warn'` when the setting is absent or unrecognized.
 */
export function getUntrustedContentGuardMode(settings?: Partial<AppSettings> | null): 'off' | 'warn' | 'block' {
  return normalizeMode(settings?.untrustedContentGuardMode)
}

/**
 * Tests `text` against every entry of `INJECTION_PATTERNS`.
 *
 * @param text - Text to inspect.
 * @returns `findings` holds one `"<code>: <note>"` string per matching pattern,
 *   in pattern order; `suspicious` is `true` when at least one pattern matched.
 */
export function inspectUntrustedText(text: string): { suspicious: boolean; findings: string[] } {
  const findings = INJECTION_PATTERNS
    .filter((pattern) => pattern.re.test(text))
    .map((pattern) => `${pattern.code}: ${pattern.note}`)
  return {
    suspicious: findings.length > 0,
    findings,
  }
}

/**
 * Applies the guard policy to a single piece of text.
 *
 * - Blank text, `trusted` content, or mode `'off'`: the text is returned untouched.
 * - No pattern matches: the text is returned untouched.
 * - Mode `'block'`: the text is replaced by a `[Blocked untrusted … content]`
 *   label followed by the findings summary, and `blocked` is `true`.
 * - Any other mode (default `'warn'`): a warning label is prepended and the
 *   original text is kept after it.
 *
 * @param params.text - Text to guard; falsy values are treated as `''`.
 * @param params.source - Short description of the origin, interpolated into the label.
 * @param params.mode - Guard mode; defaults to `'warn'`.
 * @param params.trusted - When truthy, inspection is skipped.
 * @returns The (possibly rewritten) text, whether it was blocked, and all findings.
 */
export function guardUntrustedText(params: {
  text: string
  source: string
  mode?: 'off' | 'warn' | 'block'
  trusted?: boolean
}): { text: string; blocked: boolean; findings: string[] } {
  const text = String(params.text || '')
  const mode = params.mode ?? 'warn'
  if (!text.trim() || params.trusted || mode === 'off') {
    return { text, blocked: false, findings: [] }
  }

  const inspection = inspectUntrustedText(text)
  if (!inspection.suspicious) {
    return { text, blocked: false, findings: [] }
  }

  // Only the first two findings go into the label; the full list is still returned.
  const summary = summarizeFindings(inspection.findings)
  if (mode === 'block') {
    return {
      text: `[Blocked untrusted ${params.source} content]\n${summary}`,
      blocked: true,
      findings: inspection.findings,
    }
  }

  return {
    text: `[Untrusted ${params.source} content warning: ${summary}]\n${text}`,
    blocked: false,
    findings: inspection.findings,
  }
}

/**
 * Guards the output of web-facing tool events.
 *
 * Only events whose trimmed, lower-cased `name` is in `WEB_TOOL_NAMES` are
 * inspected, and only when `output` is a string. Events without findings are
 * returned as the same object; flagged events are shallow-copied with the
 * guarded output, and `error` is set to `true` when the output was blocked.
 *
 * @param params.toolEvents - Events to inspect; the array itself is not mutated.
 * @param params.mode - Guard mode; defaults to `'warn'`.
 * @returns The input array unchanged when mode is `'off'` or the array is empty,
 *   otherwise a new array.
 */
export function guardUntrustedToolEvents(params: {
  toolEvents: MessageToolEvent[]
  mode?: 'off' | 'warn' | 'block'
}): MessageToolEvent[] {
  const mode = params.mode ?? 'warn'
  if (mode === 'off' || !params.toolEvents.length) return params.toolEvents

  return params.toolEvents.map((event) => {
    if (!WEB_TOOL_NAMES.has((event.name || '').trim().toLowerCase())) return event
    const guarded = guardUntrustedText({
      text: typeof event.output === 'string' ? event.output : '',
      source: `tool result from ${event.name}`,
      mode,
      trusted: false,
    })
    if (!guarded.findings.length) return event
    return {
      ...event,
      output: guarded.text,
      error: guarded.blocked ? true : event.error,
    }
  })
}

/**
 * Returns a copy of `message` with its text and web-tool outputs guarded.
 *
 * `trusted` applies to the message text only: tool events are always inspected
 * as untrusted, because `guardUntrustedToolEvents` does not receive the flag.
 *
 * @param params.message - Message to guard; it is not mutated.
 * @param params.mode - Guard mode forwarded to both the text and tool-event guards.
 * @param params.trusted - When truthy, the message text is passed through as-is.
 * @param params.source - Origin description used in the text label.
 * @returns A shallow copy of the message with guarded `text` and `toolEvents`.
 */
export function guardUntrustedMessage(params: {
  message: Message
  mode?: 'off' | 'warn' | 'block'
  trusted?: boolean
  source: string
}): Message {
  const guardedText = guardUntrustedText({
    text: params.message.text,
    source: params.source,
    mode: params.mode,
    trusted: params.trusted,
  })
  return {
    ...params.message,
    text: guardedText.text,
    toolEvents: Array.isArray(params.message.toolEvents)
      ? guardUntrustedToolEvents({ toolEvents: params.message.toolEvents, mode: params.mode })
      : params.message.toolEvents,
  }
}