/ src / utils / markdownConfigLoader.ts
markdownConfigLoader.ts
  1  import { feature } from 'bun:bundle'
  2  import { statSync } from 'fs'
  3  import { lstat, readdir, readFile, realpath, stat } from 'fs/promises'
  4  import memoize from 'lodash-es/memoize.js'
  5  import { homedir } from 'os'
  6  import { dirname, join, resolve, sep } from 'path'
  7  import {
  8    type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
  9    logEvent,
 10  } from 'src/services/analytics/index.js'
 11  import { getProjectRoot } from '../bootstrap/state.js'
 12  import { logForDebugging } from './debug.js'
 13  import { getClaudeConfigHomeDir, isEnvTruthy } from './envUtils.js'
 14  import { isFsInaccessible } from './errors.js'
 15  import { normalizePathForComparison } from './file.js'
 16  import type { FrontmatterData } from './frontmatterParser.js'
 17  import { parseFrontmatter } from './frontmatterParser.js'
 18  import { findCanonicalGitRoot, findGitRoot } from './git.js'
 19  import { parseToolListFromCLI } from './permissions/permissionSetup.js'
 20  import { ripGrep } from './ripgrep.js'
 21  import {
 22    isSettingSourceEnabled,
 23    type SettingSource,
 24  } from './settings/constants.js'
 25  import { getManagedFilePath } from './settings/managedPath.js'
 26  import { isRestrictedToPluginOnly } from './settings/pluginOnlyPolicy.js'
 27  
 28  // Claude configuration directory names
 29  export const CLAUDE_CONFIG_DIRECTORIES = [
 30    'commands',
 31    'agents',
 32    'output-styles',
 33    'skills',
 34    'workflows',
 35    ...(feature('TEMPLATES') ? (['templates'] as const) : []),
 36  ] as const
 37  
 38  export type ClaudeConfigDirectory = (typeof CLAUDE_CONFIG_DIRECTORIES)[number]
 39  
 40  export type MarkdownFile = {
 41    filePath: string
 42    baseDir: string
 43    frontmatter: FrontmatterData
 44    content: string
 45    source: SettingSource
 46  }
 47  
 48  /**
 49   * Extracts a description from markdown content
 50   * Uses the first non-empty line as the description, or falls back to a default
 51   */
 52  export function extractDescriptionFromMarkdown(
 53    content: string,
 54    defaultDescription: string = 'Custom item',
 55  ): string {
 56    const lines = content.split('\n')
 57    for (const line of lines) {
 58      const trimmed = line.trim()
 59      if (trimmed) {
 60        // If it's a header, strip the header prefix
 61        const headerMatch = trimmed.match(/^#+\s+(.+)$/)
 62        const text = headerMatch?.[1] ?? trimmed
 63  
 64        // Return the text, limited to reasonable length
 65        return text.length > 100 ? text.substring(0, 97) + '...' : text
 66      }
 67    }
 68    return defaultDescription
 69  }
 70  
 71  /**
 72   * Parses tools from frontmatter, supporting both string and array formats
 73   * Always returns a string array for consistency
 74   * @param toolsValue The value from frontmatter
 75   * @returns Parsed tool list as string[]
 76   */
 77  function parseToolListString(toolsValue: unknown): string[] | null {
 78    // Return null for missing/null - let caller decide the default
 79    if (toolsValue === undefined || toolsValue === null) {
 80      return null
 81    }
 82  
 83    // Empty string or other falsy values mean no tools
 84    if (!toolsValue) {
 85      return []
 86    }
 87  
 88    let toolsArray: string[] = []
 89    if (typeof toolsValue === 'string') {
 90      toolsArray = [toolsValue]
 91    } else if (Array.isArray(toolsValue)) {
 92      toolsArray = toolsValue.filter(
 93        (item): item is string => typeof item === 'string',
 94      )
 95    }
 96  
 97    if (toolsArray.length === 0) {
 98      return []
 99    }
100  
101    const parsedTools = parseToolListFromCLI(toolsArray)
102    if (parsedTools.includes('*')) {
103      return ['*']
104    }
105    return parsedTools
106  }
107  
/**
 * Parses the tools value from agent frontmatter.
 *
 * - Field missing (`undefined`) -> `undefined`, meaning all tools
 * - Field explicitly `null` or otherwise empty -> `[]`, meaning no tools
 * - Parsed list containing `'*'` -> `undefined`, meaning all tools
 *
 * @param toolsValue The raw value from frontmatter
 * @returns The tool list, or `undefined` when all tools are allowed
 */
export function parseAgentToolsFromFrontmatter(
  toolsValue: unknown,
): string[] | undefined {
  if (toolsValue === undefined) {
    return undefined
  }

  const tools = parseToolListString(toolsValue)
  if (tools === null) {
    // Only an explicit null reaches this point: no tools
    return []
  }
  return tools.includes('*') ? undefined : tools
}
127  
/**
 * Parses the allowed-tools value from slash command frontmatter.
 * A missing, null, or empty field means no tools (`[]`).
 *
 * @param toolsValue The raw value from frontmatter
 * @returns The tool list; never null or undefined
 */
export function parseSlashCommandToolsFromFrontmatter(
  toolsValue: unknown,
): string[] {
  return parseToolListString(toolsValue) ?? []
}
141  
/**
 * Builds a "device:inode" identifier for a path so that the same physical file
 * reached through different paths (for example through a symlinked parent
 * directory) can be recognised as a duplicate.
 *
 * The path is examined with lstat, which does not follow a symlink in the
 * final path component, so a symlinked file is identified by the link itself.
 *
 * Fails open: any lstat error yields null, and so does a dev=0/ino=0 result,
 * which some filesystems (NFS, FUSE, network mounts) report for every file and
 * which would otherwise make all files look identical. On Windows, dev and ino
 * may not be reliable for all file systems, so deduplication may not work on
 * some Windows configurations.
 *
 * `bigint: true` is required because large inodes (e.g. on ExFAT) exceed the
 * 53 bits a JavaScript Number can represent exactly; rounded inodes could
 * collide and cause false duplicate detection.
 * See: https://github.com/anthropics/claude-code/issues/13893
 *
 * @param filePath - Path to the file
 * @returns "device:inode", or null when the file cannot be identified
 */
async function getFileIdentity(filePath: string): Promise<string | null> {
  try {
    const { dev, ino } = await lstat(filePath, { bigint: true })
    const identityIsUnreliable = dev === 0n && ino === 0n
    return identityIsUnreliable ? null : `${dev}:${ino}`
  } catch {
    return null
  }
}
173  
/**
 * Decides where getProjectDirsUpToHome's upward walk should stop.
 *
 * The default boundary is the nearest git root above `cwd`. That is wrong when
 * the working directory has moved into a nested repository inside the
 * session's project (a submodule, or a vendored dependency with its own
 * `.git`): stopping there would hide the parent project's `.claude/` (#31905).
 *
 * The boundary is therefore widened to the session's git root, but only when
 * both of these hold:
 *   - the nearest git root belongs to a different canonical repo than the
 *     session (a worktree resolves back to its main repo, so it does not count)
 *   - the nearest git root lies strictly inside the session's git root
 *
 * Worktrees keep the old behavior: the walk stops at the worktree root, and
 * loadMarkdownFilesForSubdir adds the main repo's copy only when the worktree
 * lacks one.
 *
 * @param cwd Directory the walk starts from
 * @returns The directory to stop at, or null when `cwd` is not inside a git repo
 */
function resolveStopBoundary(cwd: string): string | null {
  const nearestRoot = findGitRoot(cwd)
  const sessionRoot = findGitRoot(getProjectRoot())
  if (!nearestRoot || !sessionRoot) {
    return nearestRoot
  }

  // findCanonicalGitRoot maps a worktree's `.git` file to the main repo;
  // submodules and standalone clones come back unchanged.
  const canonicalRoot = findCanonicalGitRoot(cwd)
  const sameCanonicalRepo =
    Boolean(canonicalRoot) &&
    normalizePathForComparison(canonicalRoot as string) ===
      normalizePathForComparison(sessionRoot)
  if (sameCanonicalRepo) {
    // Main checkout or one of its worktrees: stop at the nearest .git
    return nearestRoot
  }

  // A different repo. Widen only if it is nested inside the session's project.
  const normalizedNearest = normalizePathForComparison(nearestRoot)
  const normalizedSession = normalizePathForComparison(sessionRoot)
  const nestedInsideSession =
    normalizedNearest !== normalizedSession &&
    normalizedNearest.startsWith(normalizedSession + sep)

  // Nested repo -> stop at the project's root; sibling or unrelated repo ->
  // stop at the nearest .git (old behavior).
  return nestedInsideSession ? sessionRoot : nearestRoot
}
221  
/**
 * Walks from `cwd` up through its ancestors and collects every existing
 * `<dir>/.claude/<subdir>` path along the way.
 *
 * The walk ends at whichever comes first:
 *   - the home directory, which is NOT checked (it is loaded separately as the
 *     user directory)
 *   - the stop boundary from resolveStopBoundary (normally the git root), which
 *     IS checked before stopping
 *   - the filesystem root
 *
 * Stopping at the git root keeps commands/skills from directories outside the
 * repository from leaking in: `~/projects/.claude/commands/` does not apply to
 * `~/projects/my-repo/` when my-repo is a git repository.
 *
 * @param subdir Subdirectory (eg. "commands", "agents")
 * @param cwd Current working directory to start from
 * @returns Existing `.claude/<subdir>` paths, most specific (cwd) first
 * @throws Any statSync error that isFsInaccessible does not classify as
 *   "inaccessible" is re-thrown rather than swallowed
 */
export function getProjectDirsUpToHome(
  subdir: ClaudeConfigDirectory,
  cwd: string,
): string[] {
  const home = resolve(homedir()).normalize('NFC')
  const gitRoot = resolveStopBoundary(cwd)

  // Both boundaries are loop-invariant, so normalize them once up front.
  // Normalized comparison handles Windows drive letter casing (C:\ vs c:\).
  const normalizedHome = normalizePathForComparison(home)
  const normalizedGitRoot = gitRoot
    ? normalizePathForComparison(gitRoot)
    : null

  const dirs: string[] = []
  let current = resolve(cwd)

  while (true) {
    const normalizedCurrent = normalizePathForComparison(current)

    // Reached home: stop without checking it
    if (normalizedCurrent === normalizedHome) {
      break
    }

    const claudeSubdir = join(current, '.claude', subdir)
    // Keep only paths that exist. This avoids searching non-existent dirs
    // downstream, and the worktree fallback in loadMarkdownFilesForSubdir
    // relies on it. statSync with explicit error handling (instead of
    // existsSync) lets unexpected errors surface. A directory vanishing
    // between this check and the later read is handled by loadMarkdownFiles.
    try {
      statSync(claudeSubdir)
      dirs.push(claudeSubdir)
    } catch (e: unknown) {
      if (!isFsInaccessible(e)) throw e
    }

    // The boundary directory itself has just been processed; go no further
    if (normalizedGitRoot !== null && normalizedCurrent === normalizedGitRoot) {
      break
    }

    const parent = dirname(current)
    // dirname() of the filesystem root is the root itself
    if (parent === current) {
      break
    }
    current = parent
  }

  return dirs
}
290  
/**
 * Removes entries that resolve to an already-seen physical file (same
 * device:inode from getFileIdentity), keeping the first occurrence. Files
 * whose identity cannot be determined are always kept (fail open).
 */
async function dedupeMarkdownFilesByIdentity(
  files: MarkdownFile[],
): Promise<MarkdownFile[]> {
  const identities = await Promise.all(
    files.map(file => getFileIdentity(file.filePath)),
  )

  const sourceByIdentity = new Map<string, SettingSource>()
  const uniqueFiles: MarkdownFile[] = []

  for (const [i, file] of files.entries()) {
    const fileId = identities[i] ?? null
    if (fileId === null) {
      uniqueFiles.push(file)
      continue
    }
    const existingSource = sourceByIdentity.get(fileId)
    if (existingSource !== undefined) {
      logForDebugging(
        `Skipping duplicate file '${file.filePath}' from ${file.source} (same inode already loaded from ${existingSource})`,
      )
      continue
    }
    sourceByIdentity.set(fileId, file.source)
    uniqueFiles.push(file)
  }

  return uniqueFiles
}

/**
 * Loads markdown files from the managed, user, and project directories for one
 * configuration subdirectory.
 *
 * - Managed (policy) files are always loaded.
 * - User files are loaded when the `userSettings` source is enabled.
 * - Project files are loaded from every directory returned by
 *   getProjectDirsUpToHome when the `projectSettings` source is enabled.
 * - For `agents`, user and project files are skipped when
 *   isRestrictedToPluginOnly('agents') is true.
 *
 * The result is ordered managed, then user, then project, and files that are
 * the same physical file (same inode) are reported only once, keeping the
 * first occurrence in that order. A `tengu_dir_search` analytics event is
 * logged for each uncached call.
 *
 * Results are memoized per `subdir:cwd` key.
 * NOTE(review): memoize stores the returned promise, so a rejected promise
 * appears to stay cached until the cache is cleared — confirm callers handle
 * this.
 *
 * @param subdir Subdirectory (eg. "agents" or "commands")
 * @param cwd Current working directory for project directory traversal
 * @returns Array of parsed markdown files with metadata
 */
export const loadMarkdownFilesForSubdir = memoize(
  async function (
    subdir: ClaudeConfigDirectory,
    cwd: string,
  ): Promise<MarkdownFile[]> {
    const searchStartTime = Date.now()
    const userDir = join(getClaudeConfigHomeDir(), subdir)
    const managedDir = join(getManagedFilePath(), '.claude', subdir)
    const projectDirs = getProjectDirsUpToHome(subdir, cwd)

    // Worktree fallback. getProjectDirsUpToHome stops at the worktree root
    // (where the .git file is), so it never sees the main repository. When the
    // worktree does NOT have .claude/<subdir> checked out (e.g.
    // sparse-checkout), add the main repository's copy.
    //
    // The copy is added only when the worktree's own directory is absent: a
    // standard `git worktree add` checks out the full tree, so loading both
    // would duplicate every command/agent/skill
    // (anthropics/claude-code#29599, #28182, #26992).
    //
    // projectDirs already reflects existence, so membership is checked against
    // it instead of stat'ing again. All comparisons go through
    // normalizePathForComparison, like the other path comparisons in this
    // file, so differences such as drive-letter casing cannot make one
    // directory look like two.
    const gitRoot = findGitRoot(cwd)
    const canonicalRoot = findCanonicalGitRoot(cwd)
    if (
      gitRoot &&
      canonicalRoot &&
      normalizePathForComparison(canonicalRoot) !==
        normalizePathForComparison(gitRoot)
    ) {
      const knownProjectDirs = new Set(
        projectDirs.map(dir => normalizePathForComparison(dir)),
      )
      const worktreeSubdir = normalizePathForComparison(
        join(gitRoot, '.claude', subdir),
      )
      if (!knownProjectDirs.has(worktreeSubdir)) {
        const mainClaudeSubdir = join(canonicalRoot, '.claude', subdir)
        if (
          !knownProjectDirs.has(normalizePathForComparison(mainClaudeSubdir))
        ) {
          projectDirs.push(mainClaudeSubdir)
        }
      }
    }

    // Loads one directory and tags every file with where it came from
    const loadFrom = async (
      baseDir: string,
      source: SettingSource,
    ): Promise<MarkdownFile[]> => {
      const files = await loadMarkdownFiles(baseDir)
      return files.map(file => ({ ...file, baseDir, source }))
    }

    const [managedFiles, userFiles, projectFilesNested] = await Promise.all([
      // Always load managed (policy settings)
      loadFrom(managedDir, 'policySettings'),
      // Conditionally load user files
      isSettingSourceEnabled('userSettings') &&
      !(subdir === 'agents' && isRestrictedToPluginOnly('agents'))
        ? loadFrom(userDir, 'userSettings')
        : Promise.resolve<MarkdownFile[]>([]),
      // Conditionally load project files from all collected directories
      isSettingSourceEnabled('projectSettings') &&
      !(subdir === 'agents' && isRestrictedToPluginOnly('agents'))
        ? Promise.all(
            projectDirs.map(projectDir =>
              loadFrom(projectDir, 'projectSettings'),
            ),
          )
        : Promise.resolve<MarkdownFile[][]>([]),
    ])

    const projectFiles = projectFilesNested.flat()

    // Combine all files with priority: managed > user > project
    const allFiles = [...managedFiles, ...userFiles, ...projectFiles]

    // The same physical file can be discovered through different paths, e.g.
    // when ~/.claude is symlinked to a directory inside the project hierarchy.
    const deduplicatedFiles = await dedupeMarkdownFilesByIdentity(allFiles)

    const duplicatesRemoved = allFiles.length - deduplicatedFiles.length
    if (duplicatesRemoved > 0) {
      logForDebugging(
        `Deduplicated ${duplicatesRemoved} files in ${subdir} (same inode via symlinks or hard links)`,
      )
    }

    logEvent(`tengu_dir_search`, {
      durationMs: Date.now() - searchStartTime,
      managedFilesFound: managedFiles.length,
      userFilesFound: userFiles.length,
      projectFilesFound: projectFiles.length,
      projectDirsSearched: projectDirs.length,
      subdir:
        subdir as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
    })

    return deduplicatedFiles
  },
  // Custom resolver creates cache key from both subdir and cwd parameters
  (subdir: ClaudeConfigDirectory, cwd: string) => `${subdir}:${cwd}`,
)
431  
/**
 * Finds `.md` files under a directory using Node.js fs APIs only.
 *
 * This implementation exists alongside ripgrep for the following reasons:
 * 1. Ripgrep has poor startup performance in native builds (noticeable on app startup)
 * 2. Provides a fallback when ripgrep is unavailable
 * 3. Can be explicitly enabled via CLAUDE_CODE_USE_NATIVE_FILE_SEARCH env var
 *
 * Symlink handling:
 * - Follows symlinks (equivalent to ripgrep's --follow flag)
 * - Detects cycles by remembering each visited directory's device+inode
 * - Uses the directory's realpath as the key instead when device+inode is
 *   unavailable or reported as 0/0. Some filesystems (NFS, FUSE, network
 *   mounts) report 0/0 for everything; keying on that would make every
 *   subdirectory look already visited and silently skip it.
 *
 * Does not respect .gitignore (matches ripgrep with --no-ignore flag).
 *
 * Errors never reject the returned promise: a directory or entry that cannot
 * be stat'd or read is logged and skipped. If `signal` is aborted, the walk
 * stops and whatever was collected so far is returned.
 *
 * @param dir Directory to search
 * @param signal AbortSignal for timeout
 * @returns Array of file paths
 */
async function findMarkdownFilesNative(
  dir: string,
  signal: AbortSignal,
): Promise<string[]> {
  const files: string[] = []
  const visitedDirs = new Set<string>()

  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error)

  async function walk(currentDir: string): Promise<void> {
    if (signal.aborted) {
      return
    }

    // Cycle detection. bigint: true keeps large inodes (e.g., ExFAT) exact;
    // as Numbers they can exceed 53 bits of precision and collide.
    // See: https://github.com/anthropics/claude-code/issues/13893
    try {
      const stats = await stat(currentDir, { bigint: true })
      if (stats.isDirectory()) {
        const hasUsableInode =
          stats.dev !== undefined &&
          stats.ino !== undefined &&
          !(stats.dev === 0n && stats.ino === 0n)
        const dirKey = hasUsableInode
          ? `${stats.dev}:${stats.ino}` // device + inode
          : await realpath(currentDir) // no reliable identity: canonical path

        if (visitedDirs.has(dirKey)) {
          logForDebugging(
            `Skipping already visited directory (circular symlink): ${currentDir}`,
          )
          return
        }
        visitedDirs.add(dirKey)
      }
    } catch (error) {
      logForDebugging(
        `Failed to stat directory ${currentDir}: ${describeError(error)}`,
      )
      return
    }

    try {
      const entries = await readdir(currentDir, { withFileTypes: true })

      for (const entry of entries) {
        if (signal.aborted) {
          break
        }

        const fullPath = join(currentDir, entry.name)

        try {
          // Dirent.isFile() and isDirectory() are both false for a symlink,
          // so a symlink has to be resolved explicitly
          if (entry.isSymbolicLink()) {
            try {
              const stats = await stat(fullPath) // stat() follows symlinks
              if (stats.isDirectory()) {
                await walk(fullPath)
              } else if (stats.isFile() && entry.name.endsWith('.md')) {
                files.push(fullPath)
              }
            } catch (error) {
              // e.g. a dangling link
              logForDebugging(
                `Failed to follow symlink ${fullPath}: ${describeError(error)}`,
              )
            }
          } else if (entry.isDirectory()) {
            await walk(fullPath)
          } else if (entry.isFile() && entry.name.endsWith('.md')) {
            files.push(fullPath)
          }
        } catch (error) {
          // Skip files/directories we can't access
          logForDebugging(`Failed to access ${fullPath}: ${describeError(error)}`)
        }
      }
    } catch (error) {
      // If readdir fails (e.g., permission denied), log and continue
      logForDebugging(
        `Failed to read directory ${currentDir}: ${describeError(error)}`,
      )
    }
  }

  await walk(dir)
  return files
}
540  
/**
 * Finds every `*.md` file under a directory, reads it, and splits it into
 * frontmatter and content.
 *
 * File search strategy:
 * - Default: ripgrep
 * - Native Node.js walk when CLAUDE_CODE_USE_NATIVE_FILE_SEARCH is truthy
 *   (ripgrep has poor startup performance in native builds)
 *
 * The search is given a 3000 ms timeout signal. A missing or inaccessible
 * directory yields an empty array; any other search error is re-thrown. A file
 * that cannot be read or parsed is logged and left out of the result.
 *
 * @param dir Directory (eg. "~/.claude/commands")
 * @returns Array of parsed markdown files with metadata
 */
async function loadMarkdownFiles(dir: string): Promise<
  {
    filePath: string
    frontmatter: FrontmatterData
    content: string
  }[]
> {
  const preferNative = isEnvTruthy(
    process.env.CLAUDE_CODE_USE_NATIVE_FILE_SEARCH,
  )
  const searchSignal = AbortSignal.timeout(3000)

  let markdownPaths: string[]
  try {
    if (preferNative) {
      markdownPaths = await findMarkdownFilesNative(dir, searchSignal)
    } else {
      markdownPaths = await ripGrep(
        ['--files', '--hidden', '--follow', '--no-ignore', '--glob', '*.md'],
        dir,
        searchSignal,
      )
    }
  } catch (e: unknown) {
    // A missing/inaccessible dir is handled here rather than by pre-checking
    // existence (TOCTOU). findMarkdownFilesNative already catches internally;
    // ripGrep rejects on inaccessible target paths.
    if (!isFsInaccessible(e)) {
      throw e
    }
    return []
  }

  const parsedFiles = await Promise.all(
    markdownPaths.map(async filePath => {
      try {
        const rawContent = await readFile(filePath, { encoding: 'utf-8' })
        const { frontmatter, content } = parseFrontmatter(rawContent, filePath)
        return { filePath, frontmatter, content }
      } catch (error) {
        const errorMessage =
          error instanceof Error ? error.message : String(error)
        logForDebugging(
          `Failed to read/parse markdown file:  ${filePath}: ${errorMessage}`,
        )
        return null
      }
    }),
  )

  // Drop the files that failed to load
  return parsedFiles.filter(entry => entry !== null)
}