Cradicle Explorer

/ src / lib / server / untrusted-content.ts
untrusted-content.ts
  1  import type { AppSettings, Message, MessageToolEvent } from '@/types'
  2  
  3  const INJECTION_PATTERNS: Array<{ code: string; re: RegExp; note: string }> = [
  4    { code: 'ignore_instructions', re: /\bignore (?:all |any |the )?(?:previous|prior|above|system|developer) instructions\b/i, note: 'tries to override existing instructions' },
  5    { code: 'reveal_prompt', re: /\b(?:reveal|show|print|dump)\b[\s\S]{0,40}\b(?:system prompt|developer prompt|hidden prompt)\b/i, note: 'asks for hidden prompt data' },
  6    { code: 'credential_theft', re: /\b(?:api key|token|password|secret|credential)s?\b[\s\S]{0,40}\b(?:send|share|reveal|print|dump|exfiltrat)/i, note: 'asks for secrets or credentials' },
  7    { code: 'tool_override', re: /\b(?:call|use|run)\b[\s\S]{0,40}\b(?:shell|terminal|browser|http_request|web_fetch|connector_message_tool)\b[\s\S]{0,40}\b(?:without|ignore)\b/i, note: 'tries to direct tool use by bypassing policy' },
  8    { code: 'workflow_override', re: /\b(?:act as|pretend to be)\b[\s\S]{0,40}\b(?:system|developer|administrator|operator)\b/i, note: 'tries to impersonate a higher-priority role' },
  9  ]
 10  
 11  const WEB_TOOL_NAMES = new Set(['browser', 'web_search', 'web_fetch', 'http_request'])
 12  
 13  function normalizeMode(value: unknown): 'off' | 'warn' | 'block' {
 14    const normalized = typeof value === 'string' ? value.trim().toLowerCase() : ''
 15    if (normalized === 'off' || normalized === 'block') return normalized
 16    return 'warn'
 17  }
 18  
 19  function summarizeFindings(findings: string[]): string {
 20    return findings.slice(0, 2).join('; ')
 21  }
 22  
 23  export function getUntrustedContentGuardMode(settings?: Partial<AppSettings> | null): 'off' | 'warn' | 'block' {
 24    return normalizeMode(settings?.untrustedContentGuardMode)
 25  }
 26  
 27  export function inspectUntrustedText(text: string): { suspicious: boolean; findings: string[] } {
 28    const findings = INJECTION_PATTERNS
 29      .filter((pattern) => pattern.re.test(text))
 30      .map((pattern) => `${pattern.code}: ${pattern.note}`)
 31    return {
 32      suspicious: findings.length > 0,
 33      findings,
 34    }
 35  }
 36  
 37  export function guardUntrustedText(params: {
 38    text: string
 39    source: string
 40    mode?: 'off' | 'warn' | 'block'
 41    trusted?: boolean
 42  }): { text: string; blocked: boolean; findings: string[] } {
 43    const text = String(params.text || '')
 44    const mode = params.mode || 'warn'
 45    if (!text.trim() || params.trusted || mode === 'off') {
 46      return { text, blocked: false, findings: [] }
 47    }
 48  
 49    const inspection = inspectUntrustedText(text)
 50    if (!inspection.suspicious) {
 51      return { text, blocked: false, findings: [] }
 52    }
 53  
 54    const summary = summarizeFindings(inspection.findings)
 55    if (mode === 'block') {
 56      return {
 57        text: `[Blocked untrusted ${params.source} content]\n${summary}`,
 58        blocked: true,
 59        findings: inspection.findings,
 60      }
 61    }
 62  
 63    return {
 64      text: `[Untrusted ${params.source} content warning: ${summary}]\n${text}`,
 65      blocked: false,
 66      findings: inspection.findings,
 67    }
 68  }
 69  
 70  export function guardUntrustedToolEvents(params: {
 71    toolEvents: MessageToolEvent[]
 72    mode?: 'off' | 'warn' | 'block'
 73  }): MessageToolEvent[] {
 74    const mode = params.mode || 'warn'
 75    if (mode === 'off' || !params.toolEvents.length) return params.toolEvents
 76  
 77    return params.toolEvents.map((event) => {
 78      if (!WEB_TOOL_NAMES.has((event.name || '').trim().toLowerCase())) return event
 79      const guarded = guardUntrustedText({
 80        text: typeof event.output === 'string' ? event.output : '',
 81        source: `tool result from ${event.name}`,
 82        mode,
 83        trusted: false,
 84      })
 85      if (!guarded.findings.length) return event
 86      return {
 87        ...event,
 88        output: guarded.text,
 89        error: guarded.blocked ? true : event.error,
 90      }
 91    })
 92  }
 93  
 94  export function guardUntrustedMessage(params: {
 95    message: Message
 96    mode?: 'off' | 'warn' | 'block'
 97    trusted?: boolean
 98    source: string
 99  }): Message {
100    const guardedText = guardUntrustedText({
101      text: params.message.text,
102      source: params.source,
103      mode: params.mode,
104      trusted: params.trusted,
105    })
106    return {
107      ...params.message,
108      text: guardedText.text,
109      toolEvents: Array.isArray(params.message.toolEvents)
110        ? guardUntrustedToolEvents({ toolEvents: params.message.toolEvents, mode: params.mode })
111        : params.message.toolEvents,
112    }
113  }