/ tools / WebFetchTool / utils.ts
utils.ts
  1  import axios, { type AxiosResponse } from 'axios'
  2  import { LRUCache } from 'lru-cache'
  3  import {
  4    type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  5    logEvent,
  6  } from '../../services/analytics/index.js'
  7  import { queryHaiku } from '../../services/api/claude.js'
  8  import { AbortError } from '../../utils/errors.js'
  9  import { getWebFetchUserAgent } from '../../utils/http.js'
 10  import { logError } from '../../utils/log.js'
 11  import {
 12    isBinaryContentType,
 13    persistBinaryContent,
 14  } from '../../utils/mcpOutputStorage.js'
 15  import { getSettings_DEPRECATED } from '../../utils/settings/settings.js'
 16  import { asSystemPrompt } from '../../utils/systemPromptType.js'
 17  import { isPreapprovedHost } from './preapproved.js'
 18  import { makeSecondaryModelPrompt } from './prompt.js'
 19  
 20  // Custom error classes for domain blocking
 21  class DomainBlockedError extends Error {
 22    constructor(domain: string) {
 23      super(`Claude Code is unable to fetch from ${domain}`)
 24      this.name = 'DomainBlockedError'
 25    }
 26  }
 27  
 28  class DomainCheckFailedError extends Error {
 29    constructor(domain: string) {
 30      super(
 31        `Unable to verify if domain ${domain} is safe to fetch. This may be due to network restrictions or enterprise security policies blocking claude.ai.`,
 32      )
 33      this.name = 'DomainCheckFailedError'
 34    }
 35  }
 36  
 37  class EgressBlockedError extends Error {
 38    constructor(public readonly domain: string) {
 39      super(
 40        JSON.stringify({
 41          error_type: 'EGRESS_BLOCKED',
 42          domain,
 43          message: `Access to ${domain} is blocked by the network egress proxy.`,
 44        }),
 45      )
 46      this.name = 'EgressBlockedError'
 47    }
 48  }
 49  
 50  // Cache for storing fetched URL content
 51  type CacheEntry = {
 52    bytes: number
 53    code: number
 54    codeText: string
 55    content: string
 56    contentType: string
 57    persistedPath?: string
 58    persistedSize?: number
 59  }
 60  
 61  // Cache with 15-minute TTL and 50MB size limit
 62  // LRUCache handles automatic expiration and eviction
 63  const CACHE_TTL_MS = 15 * 60 * 1000 // 15 minutes
 64  const MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024 // 50MB
 65  
 66  const URL_CACHE = new LRUCache<string, CacheEntry>({
 67    maxSize: MAX_CACHE_SIZE_BYTES,
 68    ttl: CACHE_TTL_MS,
 69  })
 70  
 71  // Separate cache for preflight domain checks. URL_CACHE is URL-keyed, so
 72  // fetching two paths on the same domain triggers two identical preflight
 73  // HTTP round-trips to api.anthropic.com. This hostname-keyed cache avoids
 74  // that. Only 'allowed' is cached — blocked/failed re-check on next attempt.
 75  const DOMAIN_CHECK_CACHE = new LRUCache<string, true>({
 76    max: 128,
 77    ttl: 5 * 60 * 1000, // 5 minutes — shorter than URL_CACHE TTL
 78  })
 79  
 80  export function clearWebFetchCache(): void {
 81    URL_CACHE.clear()
 82    DOMAIN_CHECK_CACHE.clear()
 83  }
 84  
 85  // Lazy singleton — defers the turndown → @mixmark-io/domino import (~1.4MB
 86  // retained heap) until the first HTML fetch, and reuses one instance across
 87  // calls (construction builds 15 rule objects; .turndown() is stateless).
 88  // @types/turndown ships only `export =` (no .d.mts), so TS types the import
 89  // as the class itself while Bun wraps CJS in { default } — hence the cast.
 90  type TurndownCtor = typeof import('turndown')
 91  let turndownServicePromise: Promise<InstanceType<TurndownCtor>> | undefined
 92  function getTurndownService(): Promise<InstanceType<TurndownCtor>> {
 93    return (turndownServicePromise ??= import('turndown').then(m => {
 94      const Turndown = (m as unknown as { default: TurndownCtor }).default
 95      return new Turndown()
 96    }))
 97  }
 98  
 99  // PSR requested limiting the length of URLs to 250 to lower the potential
100  // for a data exfiltration. However, this is too restrictive for some customers'
101  // legitimate use cases, such as JWT-signed URLs (e.g., cloud service signed URLs)
102  // that can be much longer. We already require user approval for each domain,
103  // which provides a primary security boundary. In addition, Claude Code has
104  // other data exfil channels, and this one does not seem relatively high risk,
105  // so I'm removing that length restriction. -ab
106  const MAX_URL_LENGTH = 2000
107  
108  // Per PSR:
109  // "Implement resource consumption controls because setting limits on CPU,
110  // memory, and network usage for the Web Fetch tool can prevent a single
111  // request or user from overwhelming the system."
112  const MAX_HTTP_CONTENT_LENGTH = 10 * 1024 * 1024
113  
114  // Timeout for the main HTTP fetch request (60 seconds).
115  // Prevents hanging indefinitely on slow/unresponsive servers.
116  const FETCH_TIMEOUT_MS = 60_000
117  
118  // Timeout for the domain blocklist preflight check (10 seconds).
119  const DOMAIN_CHECK_TIMEOUT_MS = 10_000
120  
121  // Cap same-host redirect hops. Without this a malicious server can return
122  // a redirect loop (/a → /b → /a …) and the per-request FETCH_TIMEOUT_MS
123  // resets on every hop, hanging the tool until user interrupt. 10 matches
124  // common client defaults (axios=5, follow-redirects=21, Chrome=20).
125  const MAX_REDIRECTS = 10
126  
127  // Truncate to not spend too many tokens
128  export const MAX_MARKDOWN_LENGTH = 100_000
129  
/**
 * Reports whether `url` points at a preapproved host/path combination.
 *
 * @param url - Candidate URL; it does not need to be valid.
 * @returns The verdict of `isPreapprovedHost` for the URL's hostname and
 *   pathname, or `false` when the URL cannot be parsed or the lookup throws.
 */
export function isPreapprovedUrl(url: string): boolean {
  try {
    const { hostname, pathname } = new URL(url)
    return isPreapprovedHost(hostname, pathname)
  } catch {
    // Any failure is treated as "not preapproved" rather than surfaced.
    return false
  }
}
138  
/**
 * Cheap syntactic pre-filter for URLs handed to the fetch tool.
 *
 * A URL is rejected when it is longer than MAX_URL_LENGTH, cannot be parsed,
 * carries a username or password, or has a hostname with fewer than two
 * non-empty dot-separated labels.
 *
 * @param url - The URL string to validate.
 * @returns `true` when the URL passes every check above, `false` otherwise.
 */
export function validateURL(url: string): boolean {
  if (url.length > MAX_URL_LENGTH) {
    return false
  }

  let parsed: URL
  try {
    parsed = new URL(url)
  } catch {
    return false
  }

  // The protocol is not checked here: http is upgraded to https when the
  // request is made.

  // Since cookies and internal domains are not supported, URLs that embed a
  // username or password are rejected as well, even though they seem
  // exceedingly unlikely.
  if (parsed.username || parsed.password) {
    return false
  }

  // Initial filter against privileged, company-internal URLs: require a
  // hostname that looks publicly resolvable, i.e. one with at least two
  // labels. Empty labels are ignored so that a trailing dot cannot make a
  // single-label host such as "localhost." count as two.
  const labels = parsed.hostname.split('.').filter(label => label.length > 0)
  return labels.length >= 2
}
170  
/** Outcome of the preflight domain check. */
type DomainCheckResult =
  | { status: 'allowed' }
  | { status: 'blocked' }
  | { status: 'check_failed'; error: Error }

/**
 * Asks the domain_info endpoint whether `domain` may be fetched.
 *
 * An 'allowed' verdict is remembered in DOMAIN_CHECK_CACHE and returned from
 * there on later calls without a request. 'blocked' and 'check_failed' are
 * never cached, so they are re-checked on the next attempt.
 *
 * This function does not throw: request failures are logged via `logError`
 * and reported as `check_failed`.
 *
 * @param domain - Hostname to check.
 * @returns 'allowed' when the endpoint answers 200 with `can_fetch === true`,
 *   'blocked' for any other 200 answer, and 'check_failed' (with the
 *   underlying error) when the request fails or returns another status.
 */
export async function checkDomainBlocklist(
  domain: string,
): Promise<DomainCheckResult> {
  if (DOMAIN_CHECK_CACHE.has(domain)) {
    return { status: 'allowed' }
  }
  try {
    const response = await axios.get(
      `https://api.anthropic.com/api/web/domain_info?domain=${encodeURIComponent(domain)}`,
      { timeout: DOMAIN_CHECK_TIMEOUT_MS },
    )
    if (response.status !== 200) {
      // Non-200 status that did not make the request throw.
      return {
        status: 'check_failed',
        error: new Error(`Domain check returned status ${response.status}`),
      }
    }
    if (response.data.can_fetch === true) {
      DOMAIN_CHECK_CACHE.set(domain, true)
      return { status: 'allowed' }
    }
    return { status: 'blocked' }
  } catch (e) {
    logError(e)
    // A caught value is not guaranteed to be an Error; wrap anything else so
    // the `error: Error` contract of the result actually holds.
    const error = e instanceof Error ? e : new Error(String(e))
    return { status: 'check_failed', error }
  }
}
204  
/**
 * Decides whether a redirect may be followed automatically.
 *
 * A redirect is permitted only when the target keeps the original protocol
 * and port, carries no username or password, and has the same hostname once
 * a leading "www." is ignored on both sides. Path and query may change
 * freely.
 *
 * @param originalUrl - URL that produced the redirect.
 * @param redirectUrl - Absolute URL the redirect points to.
 * @returns `true` when the redirect satisfies every rule above; `false`
 *   otherwise, including when either URL cannot be parsed.
 */
export function isPermittedRedirect(
  originalUrl: string,
  redirectUrl: string,
): boolean {
  let source: URL
  let target: URL
  try {
    source = new URL(originalUrl)
    target = new URL(redirectUrl)
  } catch {
    return false
  }

  const sameProtocolAndPort =
    target.protocol === source.protocol && target.port === source.port
  const targetHasCredentials = target.username !== '' || target.password !== ''
  if (!sameProtocolAndPort || targetHasCredentials) {
    return false
  }

  // example.com <-> www.example.com is allowed in either direction, so
  // compare hostnames with any leading "www." removed.
  const bareHostname = (parsed: URL): string =>
    parsed.hostname.replace(/^www\./, '')
  return bareHostname(source) === bareHostname(target)
}
244  
/** Describes a redirect that was received but deliberately not followed. */
type RedirectInfo = {
  type: 'redirect'
  originalUrl: string
  redirectUrl: string
  statusCode: number
}

/**
 * Fetches `url` with redirect handling done by hand instead of by axios.
 * A 301/302/307/308 response is followed (recursively) only when
 * `redirectChecker` approves the hop; otherwise the redirect is reported
 * back to the caller as a `RedirectInfo`.
 *
 * Per PSR:
 * "Do not automatically follow redirects because following redirects could
 * allow for an attacker to exploit an open redirect vulnerability in a
 * trusted domain to force a user to make a request to a malicious domain
 * unknowingly"
 *
 * @param url - Absolute URL to request.
 * @param signal - Abort signal passed to every request in the chain.
 * @param redirectChecker - Called with the URL of the current hop and the
 *   resolved redirect target; returns whether the hop may be followed.
 * @param depth - Number of hops already followed; callers normally omit it.
 * @returns The axios response of the final request, or a `RedirectInfo`
 *   for the first redirect that `redirectChecker` rejected.
 * @throws Error when `depth` exceeds MAX_REDIRECTS or a redirect has no
 *   Location header.
 * @throws EgressBlockedError when a 403 response carries
 *   `X-Proxy-Error: blocked-by-allowlist`.
 * @throws Any other request error, unchanged.
 */
export async function getWithPermittedRedirects(
  url: string,
  signal: AbortSignal,
  redirectChecker: (originalUrl: string, redirectUrl: string) => boolean,
  depth = 0,
): Promise<AxiosResponse<ArrayBuffer> | RedirectInfo> {
  if (depth > MAX_REDIRECTS) {
    throw new Error(`Too many redirects (exceeded ${MAX_REDIRECTS})`)
  }

  try {
    return await axios.get(url, {
      signal,
      timeout: FETCH_TIMEOUT_MS,
      // Redirects surface as errors below so each hop can be vetted.
      maxRedirects: 0,
      responseType: 'arraybuffer',
      maxContentLength: MAX_HTTP_CONTENT_LENGTH,
      headers: {
        Accept: 'text/markdown, text/html, */*',
        'User-Agent': getWebFetchUserAgent(),
      },
    })
  } catch (error) {
    // Only HTTP-level failures (an axios error that carries a response) get
    // special treatment; everything else is passed straight through.
    if (!axios.isAxiosError(error) || !error.response) {
      throw error
    }
    const { status, headers } = error.response

    if ([301, 302, 307, 308].includes(status)) {
      const location = headers.location
      if (!location) {
        throw new Error('Redirect missing Location header')
      }

      // Location may be relative; resolve it against the URL just requested.
      const redirectUrl = new URL(location, url).toString()

      if (!redirectChecker(url, redirectUrl)) {
        // Not allowed to follow: hand the redirect back to the caller.
        return {
          type: 'redirect',
          originalUrl: url,
          redirectUrl,
          statusCode: status,
        }
      }
      return getWithPermittedRedirects(
        redirectUrl,
        signal,
        redirectChecker,
        depth + 1,
      )
    }

    // Egress proxy block: the proxy answers 403 with
    // X-Proxy-Error: blocked-by-allowlist when egress is restricted.
    if (status === 403 && headers['x-proxy-error'] === 'blocked-by-allowlist') {
      throw new EgressBlockedError(new URL(url).hostname)
    }

    throw error
  }
}
330  
/** Type guard: true when `response` is a RedirectInfo rather than an axios response. */
function isRedirectInfo(
  response: AxiosResponse<ArrayBuffer> | RedirectInfo,
): response is RedirectInfo {
  if (!('type' in response)) {
    return false
  }
  return response.type === 'redirect'
}
336  
/** Result of a successful fetch: the processed body plus response metadata. */
export type FetchedContent = {
  /** Markdown for HTML responses, the decoded body otherwise. */
  content: string
  /** Value of the response's content-type header ('' when absent). */
  contentType: string
  /** Size of the raw response body in bytes. */
  bytes: number
  /** HTTP status code of the response. */
  code: number
  /** HTTP status text of the response. */
  codeText: string
  /** Where the raw bytes were saved, when the response was binary and saving succeeded. */
  persistedPath?: string
  /** Size reported for the saved file, when one was written. */
  persistedSize?: number
}
346  
/**
 * Fetches `url` and returns its content, converted to Markdown when the
 * response is HTML.
 *
 * Steps, in order: validate the URL, serve from URL_CACHE when possible,
 * upgrade http to https, run the domain preflight check (unless the
 * `skipWebFetchPreflight` setting is on), fetch with vetted redirects,
 * persist binary bodies to disk, convert/decode the body, and cache the
 * result under the URL exactly as the caller passed it.
 *
 * @param url - URL to fetch.
 * @param abortController - Its signal cancels the HTTP request.
 * @returns The fetched content, or a `RedirectInfo` when the server
 *   redirected somewhere that `isPermittedRedirect` does not allow.
 * @throws Error('Invalid URL') when `validateURL` rejects the URL.
 * @throws DomainBlockedError / DomainCheckFailedError from the preflight.
 * @throws Whatever `getWithPermittedRedirects` throws.
 */
export async function getURLMarkdownContent(
  url: string,
  abortController: AbortController,
): Promise<FetchedContent | RedirectInfo> {
  if (!validateURL(url)) {
    throw new Error('Invalid URL')
  }

  // Cache hit: hand back a copy of the stored fields (LRUCache applies the
  // TTL itself, so anything returned here is still fresh).
  const cached = URL_CACHE.get(url)
  if (cached) {
    const {
      bytes,
      code,
      codeText,
      content,
      contentType,
      persistedPath,
      persistedSize,
    } = cached
    return {
      bytes,
      code,
      codeText,
      content,
      contentType,
      persistedPath,
      persistedSize,
    }
  }

  let fetchUrl = url

  try {
    const parsed = new URL(url)

    // Upgrade http to https if needed
    if (parsed.protocol === 'http:') {
      parsed.protocol = 'https:'
      fetchUrl = parsed.toString()
    }

    const { hostname } = parsed

    // Users can opt out of the blocklist check. This exists for enterprise
    // customers whose security policies prevent outbound connections to
    // claude.ai.
    const settings = getSettings_DEPRECATED()
    if (!settings.skipWebFetchPreflight) {
      const { status } = await checkDomainBlocklist(hostname)
      if (status === 'blocked') {
        throw new DomainBlockedError(hostname)
      }
      if (status === 'check_failed') {
        throw new DomainCheckFailedError(hostname)
      }
      // 'allowed': continue with the fetch
    }

    if (process.env.USER_TYPE === 'ant') {
      logEvent('tengu_web_fetch_host', {
        hostname:
          hostname as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      })
    }
  } catch (e) {
    const isExpectedUserFacingFailure =
      e instanceof DomainBlockedError || e instanceof DomainCheckFailedError
    if (isExpectedUserFacingFailure) {
      // Re-throw as-is; these are not internal errors and are not logged.
      throw e
    }
    // Anything else is logged and the fetch proceeds regardless.
    logError(e)
  }

  const response = await getWithPermittedRedirects(
    fetchUrl,
    abortController.signal,
    isPermittedRedirect,
  )

  // A redirect we are not allowed to follow is returned to the caller.
  if (isRedirectInfo(response)) {
    return response
  }

  const rawBuffer = Buffer.from(response.data)
  // Drop the axios-held ArrayBuffer copy now that rawBuffer owns the bytes,
  // so GC can reclaim up to MAX_HTTP_CONTENT_LENGTH (10MB) before Turndown
  // builds its DOM tree (which can be 3-5x the HTML size).
  ;(response as { data: unknown }).data = null
  const contentType = response.headers['content-type'] ?? ''

  // Binary content is additionally written to disk with a proper extension
  // so Claude can inspect the file later. The utf-8 decode + Haiku path
  // below still runs: for PDFs in particular the decoded string has enough
  // ASCII structure (/Title, text streams) for Haiku to summarize, so the
  // saved file supplements the text rather than replacing it.
  let persistedPath: string | undefined
  let persistedSize: number | undefined
  if (isBinaryContentType(contentType)) {
    const persistId = `webfetch-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
    const saved = await persistBinaryContent(rawBuffer, contentType, persistId)
    if (!('error' in saved)) {
      persistedPath = saved.filepath
      persistedSize = saved.size
    }
  }

  const bytes = rawBuffer.length
  const decodedBody = rawBuffer.toString('utf-8')

  // HTML is converted to Markdown and measured afresh. Anything else is used
  // raw, and its size is taken from rawBuffer: the decoded string's UTF-8
  // length equals rawBuffer.length (modulo U+FFFD replacement on invalid
  // bytes — negligible for cache eviction accounting), which avoids an O(n)
  // Buffer.byteLength scan.
  const isHtml = contentType.includes('text/html')
  const markdownContent = isHtml
    ? (await getTurndownService()).turndown(decodedBody)
    : decodedBody
  const contentBytes = isHtml ? Buffer.byteLength(markdownContent) : bytes

  // The entry is cached under the original URL, not the upgraded or
  // redirected one.
  const entry: CacheEntry = {
    bytes,
    code: response.status,
    codeText: response.statusText,
    content: markdownContent,
    contentType,
    persistedPath,
    persistedSize,
  }
  // lru-cache requires positive integers; clamp to 1 for empty responses.
  URL_CACHE.set(url, entry, { size: Math.max(1, contentBytes) })
  return entry
}
483  
/**
 * Runs the user's prompt against fetched content using the secondary model.
 *
 * @param prompt - What the user wants to know about the content.
 * @param markdownContent - Fetched content; anything beyond
 *   MAX_MARKDOWN_LENGTH characters is cut off and replaced by a truncation
 *   marker before being sent.
 * @param signal - Abort signal passed to the model call.
 * @param isNonInteractiveSession - Forwarded to the model call options.
 * @param isPreapprovedDomain - Forwarded to `makeSecondaryModelPrompt`.
 * @returns The text of the first content block of the model's reply, or
 *   'No response from model' when there is no first block or it has no text.
 * @throws AbortError when `signal` is aborted by the time the call returns.
 */
export async function applyPromptToMarkdown(
  prompt: string,
  markdownContent: string,
  signal: AbortSignal,
  isNonInteractiveSession: boolean,
  isPreapprovedDomain: boolean,
): Promise<string> {
  // Truncate content to avoid "Prompt is too long" errors from the secondary model
  const truncatedContent =
    markdownContent.length > MAX_MARKDOWN_LENGTH
      ? markdownContent.slice(0, MAX_MARKDOWN_LENGTH) +
        '\n\n[Content truncated due to length...]'
      : markdownContent

  const modelPrompt = makeSecondaryModelPrompt(
    truncatedContent,
    prompt,
    isPreapprovedDomain,
  )
  const assistantMessage = await queryHaiku({
    systemPrompt: asSystemPrompt([]),
    userPrompt: modelPrompt,
    signal,
    options: {
      querySource: 'web_fetch_apply',
      agents: [],
      isNonInteractiveSession,
      hasAppendSystemPrompt: false,
      mcpTools: [],
    },
  })

  // We need to bubble this up, so that the tool call throws, causing us to return
  // an is_error tool_use block to the server, and render a red dot in the UI.
  if (signal.aborted) {
    throw new AbortError()
  }

  // Guard the first block explicitly instead of asserting it is non-null, so
  // a missing block falls through to the fallback message.
  const firstBlock = assistantMessage.message.content[0]
  if (firstBlock !== undefined && 'text' in firstBlock) {
    return firstBlock.text
  }
  return 'No response from model'
}