// src/lib/server/knowledge-sources.ts
   1  import { createHash } from 'crypto'
   2  import path from 'path'
   3  
   4  import { genId } from '@/lib/id'
   5  import type {
   6    KnowledgeCitation,
   7    KnowledgeHygieneAction,
   8    KnowledgeHygieneFinding,
   9    KnowledgeHygieneSummary,
  10    KnowledgeSource,
  11    KnowledgeSourceDetail,
  12    KnowledgeSourceKind,
  13    KnowledgeSourceSummary,
  14    KnowledgeRetrievalTrace,
  15    KnowledgeSearchHit,
  16    MemoryEntry,
  17  } from '@/types'
  18  import {
  19    deleteKnowledgeSource as deleteKnowledgeSourceRecord,
  20    loadKnowledgeSource,
  21    loadKnowledgeSources,
  22    patchKnowledgeSource,
  23    upsertKnowledgeSource,
  24  } from '@/lib/server/storage'
  25  import { getMemoryDb } from '@/lib/server/memory/memory-db'
  26  import {
  27    deriveKnowledgeTitle,
  28    extractKnowledgeTextFromFile,
  29    extractKnowledgeTextFromUrl,
  30  } from '@/lib/server/knowledge-import'
  31  import { onNextIdleWindow } from '@/lib/server/runtime/idle-window'
  32  
// Non-manual sources whose last index is older than this are considered stale (14 days).
const KNOWLEDGE_STALE_AFTER_MS = 1000 * 60 * 60 * 24 * 14
// Preferred maximum characters per indexed chunk.
const CHUNK_TARGET_CHARS = 2200
// Approximate characters of trailing context re-read at the start of the next chunk.
const CHUNK_OVERLAP_CHARS = 320
// Upper bound on memory entries scanned during the legacy backfill.
const MAX_KNOWLEDGE_SCAN = 10_000
// NOTE(review): not referenced in this excerpt — presumably caps hygiene findings elsewhere in the file; confirm.
const MAX_HYGIENE_FINDINGS = 120
// Maximum knowledge hits folded into a retrieval/grounding trace.
const MAX_GROUNDING_HITS = 4
  39  
// Input payload shape for creating/updating a knowledge source.
// NOTE(review): not referenced within this excerpt — presumably consumed by
// create/update entry points later in the file; confirm.
interface KnowledgeSourceInput {
  kind?: KnowledgeSourceKind // normalized to 'manual' when not 'file'/'url' (see normalizeKind)
  title?: string
  content?: string | null
  tags?: string[]
  scope?: 'global' | 'agent' // normalized to 'global' unless exactly 'agent' (see normalizeScope)
  agentIds?: string[]
  sourceLabel?: string | null
  sourceUrl?: string | null
  sourcePath?: string | null
  metadata?: Record<string, unknown>
}
  52  
// One chunk of source content prepared for indexing into the memory DB.
interface IndexedChunk {
  title: string // source title, optionally suffixed with " · <section label>"
  content: string // trimmed chunk text
  chunkIndex: number // position within the source; 0 until stamped by chunkKnowledgeContent
  chunkCount: number // total chunks for the source; 0 until stamped by chunkKnowledgeContent
  charStart: number // offset of the chunk start within the normalized source content
  charEnd: number // offset just past the chunk end
  sectionLabel?: string | null // nearest preceding markdown heading, when any
}
  62  
// In-flight legacy backfill, shared so concurrent callers await the same run.
let backfillPromise: Promise<void> | null = null
// Set once legacy memory entries have been migrated into source records.
let backfillComplete = false
// NOTE(review): the two maintenance variables are not referenced in this
// excerpt — presumably used by the idle-window maintenance registration later
// in the file; confirm.
let maintenanceRegistered = false
let maintenanceHistory: KnowledgeHygieneAction[] = []
  67  
  68  function normalizeText(value: unknown): string {
  69    return typeof value === 'string' ? value.trim() : ''
  70  }
  71  
  72  function normalizeOptionalText(value: unknown): string | null {
  73    const trimmed = normalizeText(value)
  74    return trimmed || null
  75  }
  76  
  77  function normalizeTags(tags: unknown): string[] {
  78    if (!Array.isArray(tags)) return []
  79    const seen = new Set<string>()
  80    const out: string[] = []
  81    for (const tag of tags) {
  82      if (typeof tag !== 'string') continue
  83      const trimmed = tag.trim()
  84      const key = trimmed.toLowerCase()
  85      if (!trimmed || seen.has(key)) continue
  86      seen.add(key)
  87      out.push(trimmed)
  88    }
  89    return out
  90  }
  91  
  92  function matchesTagFilter(sourceTags: string[], filterTags: string[]): boolean {
  93    if (filterTags.length === 0) return true
  94    const tagSet = new Set(sourceTags.map((tag) => tag.toLowerCase()))
  95    return filterTags.some((tag) => tagSet.has(tag.toLowerCase()))
  96  }
  97  
  98  function normalizeAgentIds(agentIds: unknown): string[] {
  99    if (!Array.isArray(agentIds)) return []
 100    const seen = new Set<string>()
 101    const out: string[] = []
 102    for (const id of agentIds) {
 103      if (typeof id !== 'string') continue
 104      const trimmed = id.trim()
 105      if (!trimmed || seen.has(trimmed)) continue
 106      seen.add(trimmed)
 107      out.push(trimmed)
 108    }
 109    return out
 110  }
 111  
 112  function normalizeScope(scope: unknown): 'global' | 'agent' {
 113    return scope === 'agent' ? 'agent' : 'global'
 114  }
 115  
 116  function normalizeKind(kind: unknown): KnowledgeSourceKind {
 117    return kind === 'file' || kind === 'url' ? kind : 'manual'
 118  }
 119  
 120  function contentHash(content: string): string {
 121    return createHash('sha256').update(content).digest('hex')
 122  }
 123  
 124  function isStaleSource(source: KnowledgeSource): boolean {
 125    if (source.archivedAt || source.supersededBySourceId) return false
 126    if (source.syncStatus === 'error') return true
 127    if (source.kind === 'manual') return false
 128    const indexedAt = typeof source.lastIndexedAt === 'number' ? source.lastIndexedAt : 0
 129    if (!indexedAt) return true
 130    return (Date.now() - indexedAt) > KNOWLEDGE_STALE_AFTER_MS
 131  }
 132  
/**
 * Normalize a persisted record into a fully-populated KnowledgeSource:
 * trims strings, de-duplicates tags/agent ids, collapses invalid enum values
 * to their defaults, nulls out malformed timestamps, and backfills missing
 * created/updated timestamps with the current time.
 */
function coerceSource(source: KnowledgeSource): KnowledgeSource {
  const now = Date.now()
  return {
    id: source.id,
    kind: normalizeKind(source.kind),
    // Never persist an empty title.
    title: normalizeText(source.title) || 'Knowledge Source',
    content: typeof source.content === 'string' ? source.content : null,
    sourceLabel: normalizeOptionalText(source.sourceLabel),
    sourceUrl: normalizeOptionalText(source.sourceUrl),
    sourcePath: normalizeOptionalText(source.sourcePath),
    sourceHash: normalizeOptionalText(source.sourceHash),
    scope: normalizeScope(source.scope),
    agentIds: normalizeAgentIds(source.agentIds),
    tags: normalizeTags(source.tags),
    // Anything other than the two transient states collapses to 'ready'.
    syncStatus: source.syncStatus === 'syncing' || source.syncStatus === 'error' ? source.syncStatus : 'ready',
    lastIndexedAt: typeof source.lastIndexedAt === 'number' ? source.lastIndexedAt : null,
    lastSyncedAt: typeof source.lastSyncedAt === 'number' ? source.lastSyncedAt : null,
    lastError: normalizeOptionalText(source.lastError),
    archivedAt: typeof source.archivedAt === 'number' ? source.archivedAt : null,
    archivedReason: normalizeOptionalText(source.archivedReason),
    duplicateOfSourceId: normalizeOptionalText(source.duplicateOfSourceId),
    supersededBySourceId: normalizeOptionalText(source.supersededBySourceId),
    maintenanceUpdatedAt: typeof source.maintenanceUpdatedAt === 'number' ? source.maintenanceUpdatedAt : null,
    maintenanceNotes: normalizeOptionalText(source.maintenanceNotes),
    nextSyncAt: typeof source.nextSyncAt === 'number' ? source.nextSyncAt : null,
    lastAutoSyncAt: typeof source.lastAutoSyncAt === 'number' ? source.lastAutoSyncAt : null,
    chunkCount: typeof source.chunkCount === 'number' ? source.chunkCount : 0,
    contentLength: typeof source.contentLength === 'number' ? source.contentLength : 0,
    createdAt: typeof source.createdAt === 'number' ? source.createdAt : now,
    updatedAt: typeof source.updatedAt === 'number' ? source.updatedAt : now,
    metadata: source.metadata && typeof source.metadata === 'object' ? source.metadata : undefined,
  }
}
 166  
 167  function sourceIsArchived(source: KnowledgeSource): boolean {
 168    return typeof source.archivedAt === 'number' && source.archivedAt > 0
 169  }
 170  
 171  function sourceIsSuperseded(source: KnowledgeSource): boolean {
 172    return typeof source.supersededBySourceId === 'string' && source.supersededBySourceId.trim().length > 0
 173  }
 174  
 175  function sourceIsExcludedByDefault(source: KnowledgeSource): boolean {
 176    return sourceIsArchived(source) || sourceIsSuperseded(source)
 177  }
 178  
 179  function sourceVisibleToAgent(source: KnowledgeSource, viewerAgentId?: string | null): boolean {
 180    if (source.scope === 'global') return true
 181    if (!viewerAgentId) return false
 182    return source.agentIds.includes(viewerAgentId)
 183  }
 184  
 185  function cleanKnowledgeTokens(value: string): string[] {
 186    return Array.from(new Set(
 187      String(value || '')
 188        .toLowerCase()
 189        .replace(/[^a-z0-9]+/g, ' ')
 190        .split(/\s+/)
 191        .map((token) => token.trim())
 192        .filter((token) => token.length >= 3),
 193    ))
 194  }
 195  
 196  function tokenOverlapScore(left: string, right: string): number {
 197    const leftTokens = cleanKnowledgeTokens(left)
 198    const rightSet = new Set(cleanKnowledgeTokens(right))
 199    if (leftTokens.length === 0 || rightSet.size === 0) return 0
 200    let matches = 0
 201    for (const token of leftTokens) {
 202      if (rightSet.has(token)) matches += 1
 203    }
 204    return matches / Math.max(leftTokens.length, 1)
 205  }
 206  
 207  function jaccardSimilarity(left: string, right: string): number {
 208    const leftSet = new Set(cleanKnowledgeTokens(left))
 209    const rightSet = new Set(cleanKnowledgeTokens(right))
 210    if (leftSet.size === 0 || rightSet.size === 0) return 0
 211    let intersection = 0
 212    for (const token of leftSet) {
 213      if (rightSet.has(token)) intersection += 1
 214    }
 215    const union = leftSet.size + rightSet.size - intersection
 216    return union > 0 ? intersection / union : 0
 217  }
 218  
 219  function whyMatched(query: string, title: string, content: string, sectionLabel?: string | null): string {
 220    const queryTokens = cleanKnowledgeTokens(query)
 221    const contentText = `${title}\n${sectionLabel || ''}\n${content}`
 222    const contentTokens = new Set(cleanKnowledgeTokens(contentText))
 223    const matched = queryTokens.filter((token) => contentTokens.has(token))
 224    if (matched.length > 0) {
 225      const head = matched.slice(0, 4).join(', ')
 226      return `Matched query terms: ${head}${matched.length > 4 ? ', ...' : ''}`
 227    }
 228    if (sectionLabel?.trim()) return `Matched the ${sectionLabel.trim()} section`
 229    return 'Retrieved as a high-relevance knowledge chunk'
 230  }
 231  
/**
 * Project a search hit down to the citation shape attached to model
 * responses and retrieval traces; pure field mapping, no recomputation.
 */
function toCitation(hit: KnowledgeSearchHit): KnowledgeCitation {
  return {
    sourceId: hit.sourceId,
    sourceTitle: hit.sourceTitle,
    sourceKind: hit.sourceKind,
    sourceUrl: hit.sourceUrl || null,
    sourceLabel: hit.sourceLabel || null,
    // The citation's chunkId is the underlying memory entry id.
    chunkId: hit.id,
    chunkIndex: hit.chunkIndex,
    chunkCount: hit.chunkCount,
    charStart: hit.charStart,
    charEnd: hit.charEnd,
    sectionLabel: hit.sectionLabel || null,
    snippet: hit.snippet,
    whyMatched: hit.whyMatched || null,
    score: hit.score,
  }
}
 250  
 251  function listStoredSources(): KnowledgeSource[] {
 252    return Object.values(loadKnowledgeSources())
 253      .map((source) => coerceSource(source))
 254      .sort((left, right) => right.updatedAt - left.updatedAt)
 255  }
 256  
 257  function sourceTitleFromUrl(sourceUrl: string): string {
 258    try {
 259      const parsed = new URL(sourceUrl)
 260      const leaf = path.basename(parsed.pathname || '')
 261      return leaf ? deriveKnowledgeTitle(leaf) : parsed.hostname
 262    } catch {
 263      return sourceUrl
 264    }
 265  }
 266  
 267  function sourceLabelFromUrl(sourceUrl: string): string | null {
 268    try {
 269      const parsed = new URL(sourceUrl)
 270      return parsed.hostname || null
 271    } catch {
 272      return null
 273    }
 274  }
 275  
 276  function headingLabel(text: string): string | null {
 277    const match = text.match(/^#{1,6}\s+(.+)$/m)
 278    return match?.[1]?.trim() || null
 279  }
 280  
 281  function previewSnippet(content: string, query?: string): string {
 282    const normalized = String(content || '').replace(/\s+/g, ' ').trim()
 283    if (!normalized) return ''
 284    if (!query) return normalized.slice(0, 180)
 285  
 286    const queryTokens = Array.from(new Set(
 287      query
 288        .toLowerCase()
 289        .split(/\s+/)
 290        .map((token) => token.trim())
 291        .filter((token) => token.length >= 3),
 292    ))
 293  
 294    const lower = normalized.toLowerCase()
 295    let matchIndex = -1
 296    for (const token of queryTokens) {
 297      const idx = lower.indexOf(token)
 298      if (idx !== -1 && (matchIndex === -1 || idx < matchIndex)) {
 299        matchIndex = idx
 300      }
 301    }
 302  
 303    if (matchIndex === -1) return normalized.slice(0, 180)
 304    const start = Math.max(0, matchIndex - 80)
 305    const end = Math.min(normalized.length, matchIndex + 220)
 306    const prefix = start > 0 ? '…' : ''
 307    const suffix = end < normalized.length ? '…' : ''
 308    return `${prefix}${normalized.slice(start, end)}${suffix}`
 309  }
 310  
/**
 * Split content into paragraphs on blank-line boundaries, recording each
 * paragraph's trimmed [start, end) offsets into the normalized text along
 * with the most recent markdown heading seen so far as its section label.
 */
function splitParagraphs(content: string): Array<{
  text: string
  start: number
  end: number
  sectionLabel: string | null
}> {
  const normalized = content.replace(/\r\n/g, '\n').trim()
  if (!normalized) return []

  const paragraphs: Array<{ text: string; start: number; end: number; sectionLabel: string | null }> = []
  let cursor = 0
  let lastSection: string | null = null
  const breakRegex = /\n{2,}/g

  const pushParagraph = (rawStart: number, rawEnd: number) => {
    const raw = normalized.slice(rawStart, rawEnd)
    // Measure surrounding whitespace so the stored offsets hug the trimmed text.
    const leadingWhitespace = raw.match(/^\s*/)?.[0].length || 0
    const trailingWhitespace = raw.match(/\s*$/)?.[0].length || 0
    const text = raw.trim()
    if (!text) return
    // A paragraph that contains a heading updates the running section label,
    // which then applies to this and all following paragraphs.
    const sectionLabel = headingLabel(text)
    if (sectionLabel) lastSection = sectionLabel
    paragraphs.push({
      text,
      start: rawStart + leadingWhitespace,
      end: rawEnd - trailingWhitespace,
      sectionLabel: lastSection,
    })
  }

  for (const match of normalized.matchAll(breakRegex)) {
    const boundary = match.index ?? 0
    pushParagraph(cursor, boundary)
    cursor = boundary + match[0].length
  }
  // Flush the trailing paragraph after the last blank-line break.
  pushParagraph(cursor, normalized.length)
  return paragraphs
}
 349  
/**
 * Slice a paragraph longer than CHUNK_TARGET_CHARS into overlapping chunks,
 * preferring to break at a space when one falls reasonably deep into the
 * window. chunkIndex/chunkCount are left 0; the caller stamps them.
 */
function splitOversizedParagraph(
  paragraph: { text: string; start: number; end: number; sectionLabel: string | null },
  sourceTitle: string,
): IndexedChunk[] {
  const chunks: IndexedChunk[] = []
  let cursor = 0

  while (cursor < paragraph.text.length) {
    let end = Math.min(paragraph.text.length, cursor + CHUNK_TARGET_CHARS)
    if (end < paragraph.text.length) {
      // Back up to the last space before the cut, but only if that still
      // leaves a chunk of at least ~400 characters.
      const boundary = paragraph.text.lastIndexOf(' ', end)
      if (boundary > cursor + 400) end = boundary
    }

    const raw = paragraph.text.slice(cursor, end)
    const leadingWhitespace = raw.match(/^\s*/)?.[0].length || 0
    const trailingWhitespace = raw.match(/\s*$/)?.[0].length || 0
    const content = raw.trim()
    if (content) {
      // Offsets are paragraph-relative here, shifted into source coordinates below.
      const relativeStart = cursor + leadingWhitespace
      const relativeEnd = end - trailingWhitespace
      chunks.push({
        title: paragraph.sectionLabel ? `${sourceTitle} · ${paragraph.sectionLabel}` : sourceTitle,
        content,
        chunkIndex: 0,
        chunkCount: 0,
        charStart: paragraph.start + relativeStart,
        charEnd: paragraph.start + relativeEnd,
        sectionLabel: paragraph.sectionLabel,
      })
    }

    if (end >= paragraph.text.length) break
    // Advance while re-reading CHUNK_OVERLAP_CHARS of context; the Math.max
    // guarantees forward progress even for degenerate windows.
    cursor = Math.max(cursor + 1, end - CHUNK_OVERLAP_CHARS)
  }

  return chunks
}
 388  
/**
 * Chunk source content for indexing: greedily pack consecutive paragraphs up
 * to CHUNK_TARGET_CHARS per chunk, hand oversized paragraphs to the dedicated
 * splitter, and start each subsequent chunk a few paragraphs back so that
 * neighbours overlap by roughly CHUNK_OVERLAP_CHARS. Finally stamps each
 * chunk with its index and the total chunk count.
 */
function chunkKnowledgeContent(sourceTitle: string, content: string): IndexedChunk[] {
  const normalized = content.replace(/\r\n/g, '\n').trim()
  if (!normalized) return []

  const paragraphs = splitParagraphs(normalized)
  if (paragraphs.length === 0) return []

  const chunks: IndexedChunk[] = []
  let index = 0

  while (index < paragraphs.length) {
    const firstIndex = index
    const first = paragraphs[index]

    // Paragraphs that exceed the target on their own are split separately
    // and do not participate in the paragraph-overlap scheme below.
    if (first.text.length > CHUNK_TARGET_CHARS) {
      chunks.push(...splitOversizedParagraph(first, sourceTitle))
      index += 1
      continue
    }

    let combined = first.text
    const charStart = first.start
    let charEnd = first.end
    let sectionLabel = first.sectionLabel
    let nextIndex = index + 1

    // Greedily append following paragraphs while the combined text stays
    // within the target size; the first section label seen wins.
    while (nextIndex < paragraphs.length) {
      const nextParagraph = paragraphs[nextIndex]
      if (nextParagraph.text.length > CHUNK_TARGET_CHARS) break
      const candidate = `${combined}\n\n${nextParagraph.text}`
      if (candidate.length > CHUNK_TARGET_CHARS) break
      combined = candidate
      charEnd = nextParagraph.end
      sectionLabel = sectionLabel || nextParagraph.sectionLabel
      nextIndex += 1
    }

    chunks.push({
      title: sectionLabel ? `${sourceTitle} · ${sectionLabel}` : sourceTitle,
      content: combined,
      chunkIndex: 0,
      chunkCount: 0,
      charStart,
      charEnd,
      sectionLabel,
    })

    if (nextIndex >= paragraphs.length) break

    // Walk backwards from the end of this chunk to find where the next one
    // should start, so it re-reads ~CHUNK_OVERLAP_CHARS of trailing context;
    // Math.max guarantees the loop always advances past firstIndex.
    let overlapChars = 0
    let overlapStart = nextIndex
    for (let back = nextIndex - 1; back > firstIndex; back--) {
      overlapChars += paragraphs[back].text.length
      overlapStart = back
      if (overlapChars >= CHUNK_OVERLAP_CHARS) break
    }
    index = Math.max(firstIndex + 1, overlapStart)
  }

  // Stamp final position/count onto every chunk.
  const chunkCount = chunks.length
  return chunks.map((chunk, chunkIndex) => ({
    ...chunk,
    chunkIndex,
    chunkCount,
  }))
}
 455  
 456  function memorySourceMeta(entry: MemoryEntry): Record<string, unknown> {
 457    return entry.metadata && typeof entry.metadata === 'object'
 458      ? entry.metadata as Record<string, unknown>
 459      : {}
 460  }
 461  
 462  function buildSourceSummary(source: KnowledgeSource, chunks?: MemoryEntry[]): KnowledgeSourceSummary {
 463    const firstChunk = chunks?.[0] || null
 464    const preview = typeof source.content === 'string' && source.content.trim()
 465      ? source.content
 466      : firstChunk?.content || ''
 467  
 468    return {
 469      ...source,
 470      stale: isStaleSource(source),
 471      topSnippet: preview ? previewSnippet(preview) : null,
 472    }
 473  }
 474  
/**
 * Assemble a KnowledgeSearchHit from a source record plus one of its indexed
 * memory chunks, falling back to source-level values where chunk metadata is
 * missing (e.g. legacy entries indexed as a single chunk).
 */
function buildSearchHit(source: KnowledgeSource, entry: MemoryEntry, score: number, query: string): KnowledgeSearchHit {
  const metadata = memorySourceMeta(entry)
  return {
    id: entry.id,
    sourceId: source.id,
    sourceTitle: source.title,
    sourceKind: source.kind,
    sourceUrl: source.sourceUrl || null,
    sourceLabel: source.sourceLabel || null,
    scope: source.scope,
    agentIds: source.agentIds,
    tags: source.tags,
    syncStatus: source.syncStatus,
    stale: isStaleSource(source),
    title: entry.title || source.title,
    // Snippet is centred on the first query-term occurrence.
    snippet: previewSnippet(entry.content, query),
    content: entry.content,
    chunkIndex: typeof metadata.chunkIndex === 'number' ? metadata.chunkIndex : 0,
    chunkCount: typeof metadata.chunkCount === 'number' ? metadata.chunkCount : source.chunkCount,
    charStart: typeof metadata.charStart === 'number' ? metadata.charStart : 0,
    charEnd: typeof metadata.charEnd === 'number' ? metadata.charEnd : entry.content.length,
    sectionLabel: typeof metadata.sectionLabel === 'string' ? metadata.sectionLabel : null,
    score,
    whyMatched: whyMatched(query, entry.title || source.title, entry.content, typeof metadata.sectionLabel === 'string' ? metadata.sectionLabel : null),
    createdAt: entry.createdAt,
    updatedAt: entry.updatedAt,
  }
}
 503  
/**
 * Resolve the raw text for a source prior to chunking.
 *
 * Resolution order: an explicit override wins; otherwise extraction is
 * kind-specific — manual sources must carry inline content; file sources
 * re-extract from sourcePath (falling back to stored content); URL sources
 * re-fetch from sourceUrl (falling back to stored content when no URL is set).
 *
 * @throws Error when no usable content can be resolved for the source kind.
 */
async function resolveSourceContent(
  source: KnowledgeSource,
  overrideContent?: string | null,
): Promise<{ content: string; title: string; sourceLabel?: string | null }> {
  // Trim only to decide presence; the untrimmed override is what gets returned.
  const inlineContent = typeof overrideContent === 'string' ? overrideContent.trim() : ''
  if (inlineContent) {
    return {
      content: overrideContent || '',
      title: source.title,
      sourceLabel: source.sourceLabel || null,
    }
  }

  if (source.kind === 'manual') {
    if (!source.content?.trim()) throw new Error('Content is required for manual knowledge.')
    return {
      content: source.content,
      title: source.title,
      sourceLabel: source.sourceLabel || null,
    }
  }

  if (source.kind === 'file') {
    if (source.sourcePath) {
      return {
        content: await extractKnowledgeTextFromFile(source.sourcePath, source.sourceLabel || source.title),
        title: source.title,
        sourceLabel: source.sourceLabel || path.basename(source.sourcePath),
      }
    }
    // No path on record: reuse previously extracted content when available.
    if (source.content?.trim()) {
      return {
        content: source.content,
        title: source.title,
        sourceLabel: source.sourceLabel || null,
      }
    }
    throw new Error('A file path or extracted content is required for file knowledge.')
  }

  // URL kind without a URL: fall back to stored content when available.
  if (!source.sourceUrl) {
    if (source.content?.trim()) {
      return {
        content: source.content,
        title: source.title,
        sourceLabel: source.sourceLabel || null,
      }
    }
    throw new Error('A URL is required for URL knowledge.')
  }

  const extracted = await extractKnowledgeTextFromUrl(source.sourceUrl)
  return {
    content: extracted.content,
    title: source.title || extracted.title || sourceTitleFromUrl(source.sourceUrl),
    sourceLabel: source.sourceLabel || extracted.title || sourceLabelFromUrl(source.sourceUrl),
  }
}
 562  
 563  function sharedWithForSource(source: KnowledgeSource): string[] | undefined {
 564    return source.scope === 'agent' && source.agentIds.length > 0 ? source.agentIds : undefined
 565  }
 566  
/**
 * Metadata stored on each indexed memory chunk. Source fields are
 * denormalized here so search results can render without re-loading the
 * source record; chunk fields locate the chunk within the source text.
 */
function toChunkMetadata(source: KnowledgeSource, chunk: IndexedChunk): Record<string, unknown> {
  return {
    sourceId: source.id,
    sourceTitle: source.title,
    sourceKind: source.kind,
    sourceUrl: source.sourceUrl || null,
    sourceLabel: source.sourceLabel || null,
    tags: source.tags,
    scope: source.scope,
    agentIds: source.agentIds,
    chunkIndex: chunk.chunkIndex,
    chunkCount: chunk.chunkCount,
    charStart: chunk.charStart,
    charEnd: chunk.charEnd,
    sectionLabel: chunk.sectionLabel || null,
    indexedAt: Date.now(),
  }
}
 585  
 586  function replaceSourceChunks(source: KnowledgeSource, chunks: IndexedChunk[]): MemoryEntry[] {
 587    const db = getMemoryDb()
 588    for (const existingChunk of db.listKnowledgeSourceChunks(source.id)) {
 589      db.delete(existingChunk.id)
 590    }
 591  
 592    return chunks.map((chunk) => db.add({
 593      agentId: null,
 594      sessionId: null,
 595      category: 'knowledge',
 596      title: chunk.title,
 597      content: chunk.content,
 598      metadata: toChunkMetadata(source, chunk),
 599      sharedWith: sharedWithForSource(source),
 600    }))
 601  }
 602  
/**
 * One-time migration: promote legacy 'knowledge' memory entries (those with
 * no sourceId in their metadata) into standalone knowledge-source records,
 * then stamp each entry's metadata so it is treated as that source's single
 * chunk. Concurrent callers share one in-flight promise; on completion (or
 * failure) the promise slot is cleared so a later call can retry.
 */
async function ensureLegacyKnowledgeBackfill(): Promise<void> {
  if (backfillComplete) return
  if (backfillPromise) return backfillPromise
  backfillPromise = (async () => {
    const db = getMemoryDb()
    const entries = db.listByCategory('knowledge', undefined, MAX_KNOWLEDGE_SCAN)

    for (const entry of entries) {
      const metadata = memorySourceMeta(entry)
      // Entries already linked to a source are skipped.
      const existingSourceId = typeof metadata.sourceId === 'string' ? metadata.sourceId.trim() : ''
      if (existingSourceId) continue

      const scope = normalizeScope(metadata.scope)
      const agentIds = normalizeAgentIds(metadata.agentIds)
      // Reuse the memory entry id as the new source id so the link is stable.
      const sourceId = entry.id
      const source = coerceSource({
        id: sourceId,
        kind: 'manual',
        title: entry.title || 'Knowledge Source',
        content: entry.content,
        sourceLabel: typeof metadata.source === 'string' ? metadata.source : null,
        sourceUrl: typeof metadata.sourceUrl === 'string' ? metadata.sourceUrl : null,
        sourcePath: typeof metadata.sourcePath === 'string' ? metadata.sourcePath : null,
        sourceHash: contentHash(entry.content || ''),
        scope,
        agentIds,
        tags: normalizeTags(metadata.tags),
        syncStatus: 'ready',
        lastIndexedAt: entry.updatedAt,
        lastSyncedAt: entry.updatedAt,
        chunkCount: 1,
        contentLength: entry.content.length,
        createdAt: entry.createdAt,
        updatedAt: entry.updatedAt,
        metadata: {
          legacyMemoryId: entry.id,
          migratedAt: Date.now(),
        },
      })

      upsertKnowledgeSource(sourceId, source)
      // Rewrite the entry's metadata so it now reads as chunk 0-of-1 of the
      // newly created source, preserving any chunk fields already present.
      db.update(entry.id, {
        sharedWith: sharedWithForSource(source),
        metadata: {
          ...metadata,
          sourceId,
          sourceTitle: source.title,
          sourceKind: source.kind,
          sourceLabel: source.sourceLabel,
          sourceUrl: source.sourceUrl,
          tags: source.tags,
          scope: source.scope,
          agentIds: source.agentIds,
          chunkIndex: typeof metadata.chunkIndex === 'number' ? metadata.chunkIndex : 0,
          chunkCount: typeof metadata.chunkCount === 'number' ? metadata.chunkCount : 1,
          charStart: typeof metadata.charStart === 'number' ? metadata.charStart : 0,
          charEnd: typeof metadata.charEnd === 'number' ? metadata.charEnd : entry.content.length,
          sectionLabel: typeof metadata.sectionLabel === 'string' ? metadata.sectionLabel : null,
          indexedAt: typeof metadata.indexedAt === 'number' ? metadata.indexedAt : entry.updatedAt,
        },
      })
    }
    backfillComplete = true
  })().finally(() => {
    // Clear the slot so a failed run can be retried by the next caller.
    backfillPromise = null
  })

  return backfillPromise
}
 672  
 673  export async function listKnowledgeSourceSummaries(options?: {
 674    tags?: string[]
 675    limit?: number
 676    includeArchived?: boolean
 677  }): Promise<KnowledgeSourceSummary[]> {
 678    await ensureLegacyKnowledgeBackfill()
 679    registerKnowledgeMaintenanceIdleCallback()
 680    const tagFilter = normalizeTags(options?.tags)
 681    const limit = Math.max(1, Math.min(500, Math.trunc(options?.limit || 200)))
 682    const includeArchived = options?.includeArchived === true
 683  
 684    const sources = listStoredSources()
 685      .filter((source) => includeArchived || !sourceIsExcludedByDefault(source))
 686      .filter((source) => matchesTagFilter(source.tags, tagFilter))
 687      .slice(0, limit)
 688  
 689    return sources.map((source) => buildSourceSummary(source))
 690  }
 691  
/**
 * Search indexed knowledge chunks for a query, applying archive, tag and
 * per-agent visibility filters, and return at most `limit` hits.
 *
 * NOTE(review): the score is a rank-based decay over the accepted hits
 * (1 - acceptedIndex / totalMatches), not a relevance measure from the
 * underlying search — confirm before comparing scores across queries.
 */
export async function searchKnowledgeHits(options: {
  query: string
  tags?: string[]
  limit?: number
  includeArchived?: boolean
  viewerAgentId?: string | null
}): Promise<KnowledgeSearchHit[]> {
  await ensureLegacyKnowledgeBackfill()
  registerKnowledgeMaintenanceIdleCallback()
  const query = normalizeText(options.query)
  if (!query) return []

  const tagFilter = normalizeTags(options.tags)
  // Clamp the result size to [1, 500]; default 50.
  const limit = Math.max(1, Math.min(500, Math.trunc(options.limit || 50)))
  const includeArchived = options.includeArchived === true
  const viewerAgentId = typeof options.viewerAgentId === 'string' ? options.viewerAgentId.trim() : ''
  const sourceMap = new Map(listStoredSources().map((source) => [source.id, source] as const))
  const matches = getMemoryDb().search(query)
    .filter((entry) => entry.category === 'knowledge')

  const hits: KnowledgeSearchHit[] = []
  for (const entry of matches) {
    const metadata = memorySourceMeta(entry)
    const sourceId = typeof metadata.sourceId === 'string' ? metadata.sourceId : ''
    const source = sourceMap.get(sourceId)
    // Orphaned chunks (no backing source record) are dropped.
    if (!source) continue
    if (!includeArchived && sourceIsExcludedByDefault(source)) continue
    if (viewerAgentId && !sourceVisibleToAgent(source, viewerAgentId)) continue
    if (!matchesTagFilter(source.tags, tagFilter)) continue
    hits.push(buildSearchHit(source, entry, Math.max(0, 1 - hits.length / Math.max(matches.length, 1)), query))
    if (hits.length >= limit) break
  }

  return hits
}
 727  
 728  export async function getKnowledgeSourceDetail(id: string): Promise<KnowledgeSourceDetail | null> {
 729    await ensureLegacyKnowledgeBackfill()
 730    const source = loadKnowledgeSource(id)
 731    if (!source) return null
 732    const normalized = coerceSource(source)
 733    const chunks = getMemoryDb().listKnowledgeSourceChunks(id)
 734    return {
 735      source: buildSourceSummary(normalized, chunks),
 736      chunks,
 737    }
 738  }
 739  
 740  export async function buildKnowledgeRetrievalTrace(options: {
 741    query: string
 742    viewerAgentId?: string | null
 743    limit?: number
 744  }): Promise<KnowledgeRetrievalTrace | null> {
 745    const hits = await searchKnowledgeHits({
 746      query: options.query,
 747      limit: Math.max(1, Math.min(MAX_GROUNDING_HITS, Math.trunc(options.limit || MAX_GROUNDING_HITS))),
 748      viewerAgentId: options.viewerAgentId || null,
 749    })
 750    if (hits.length === 0) return null
 751    return {
 752      query: normalizeText(options.query),
 753      scope: 'source_knowledge',
 754      hits: hits.map(toCitation),
 755      retrievedAt: Date.now(),
 756      selectorStatus: 'not_run',
 757    }
 758  }
 759  
 760  export function selectKnowledgeCitations(params: {
 761    responseText: string
 762    retrievalTrace?: KnowledgeRetrievalTrace | null
 763    limit?: number
 764  }): { citations: KnowledgeCitation[]; retrievalTrace: KnowledgeRetrievalTrace | null } {
 765    const trace = params.retrievalTrace
 766    if (!trace || !Array.isArray(trace.hits) || trace.hits.length === 0) {
 767      return { citations: [], retrievalTrace: trace || null }
 768    }
 769  
 770    const responseText = normalizeText(params.responseText)
 771    if (!responseText) {
 772      return {
 773        citations: [],
 774        retrievalTrace: { ...trace, selectorStatus: 'no_match' },
 775      }
 776    }
 777  
 778    const ranked = trace.hits
 779      .map((hit) => ({
 780        hit,
 781        overlap: tokenOverlapScore(responseText, `${hit.sourceTitle}\n${hit.sectionLabel || ''}\n${hit.snippet}`),
 782      }))
 783      .sort((left, right) => {
 784        const overlapDelta = right.overlap - left.overlap
 785        if (overlapDelta !== 0) return overlapDelta
 786        return right.hit.score - left.hit.score
 787      })
 788  
 789    const limit = Math.max(1, Math.min(4, Math.trunc(params.limit || 3)))
 790    const selected = ranked
 791      .filter((entry, index) => entry.overlap >= 0.08 || (entry.hit.score >= 0.7 && index === 0))
 792      .slice(0, limit)
 793      .map((entry) => entry.hit)
 794  
 795    return {
 796      citations: selected,
 797      retrievalTrace: {
 798        ...trace,
 799        selectorStatus: selected.length > 0 ? 'selected' : 'no_match',
 800      },
 801    }
 802  }
 803  
/**
 * Re-resolve, re-chunk, and persist one knowledge source.
 *
 * Lifecycle: the record is first written back with `syncStatus: 'syncing'`,
 * then either rewritten + reindexed (content/metadata changed), refreshed in
 * place (content unchanged), or marked `'error'` if resolution fails.
 *
 * @param source - The source to sync (already coerced or raw from storage).
 * @param options.overrideContent - Content supplied by the caller (e.g. a
 *   create/update request) used instead of re-fetching from the origin.
 * @param options.forceRewrite - Force a full chunk rewrite even when the
 *   content hash is unchanged.
 * @returns The synced source summary plus its indexed chunks.
 * @throws Re-throws any resolution/chunking error after persisting the
 *   error state on the record.
 */
async function syncSourceRecord(
  source: KnowledgeSource,
  options?: { overrideContent?: string | null; forceRewrite?: boolean },
): Promise<KnowledgeSourceDetail> {
  // Persist the transitional 'syncing' state first so concurrent readers see
  // that a sync is in flight and any previous error is cleared.
  const loading = coerceSource({
    ...source,
    syncStatus: 'syncing',
    lastError: null,
    updatedAt: Date.now(),
  })
  upsertKnowledgeSource(loading.id, loading)

  try {
    const resolved = await resolveSourceContent(loading, options?.overrideContent)
    const chunks = chunkKnowledgeContent(resolved.title, resolved.content)
    if (chunks.length === 0) {
      throw new Error('No readable content was extracted for this source.')
    }

    const nextHash = contentHash(resolved.content)
    // A title or label change requires rewriting chunks even if the body hash
    // is identical, because chunk records embed that metadata.
    const metadataChanged = options?.forceRewrite === true
      || loading.title !== resolved.title
      || (loading.sourceLabel || null) !== (resolved.sourceLabel || null)

    let indexedChunks = getMemoryDb().listKnowledgeSourceChunks(loading.id)
    if (indexedChunks.length === 0 || metadataChanged || loading.sourceHash !== nextHash) {
      // Full rewrite path: update the record, then replace the chunk index.
      const rewrittenSource = coerceSource({
        ...loading,
        title: resolved.title,
        content: resolved.content,
        sourceLabel: resolved.sourceLabel ?? loading.sourceLabel ?? null,
        sourceHash: nextHash,
        chunkCount: chunks.length,
        contentLength: resolved.content.length,
        syncStatus: 'ready',
        lastError: null,
        lastIndexedAt: Date.now(),
        lastSyncedAt: Date.now(),
        nextSyncAt: Date.now() + KNOWLEDGE_STALE_AFTER_MS,
        updatedAt: Date.now(),
      })
      upsertKnowledgeSource(rewrittenSource.id, rewrittenSource)
      indexedChunks = replaceSourceChunks(rewrittenSource, chunks)
      return {
        source: buildSourceSummary(rewrittenSource, indexedChunks),
        chunks: indexedChunks,
      }
    }

    // Unchanged-content path: refresh sync timestamps only; the existing
    // chunk index is kept as-is (lastIndexedAt is intentionally untouched).
    const refreshedSource = coerceSource({
      ...loading,
      content: resolved.content,
      sourceHash: nextHash,
      syncStatus: 'ready',
      lastError: null,
      lastSyncedAt: Date.now(),
      nextSyncAt: Date.now() + KNOWLEDGE_STALE_AFTER_MS,
      updatedAt: Date.now(),
    })
    upsertKnowledgeSource(refreshedSource.id, refreshedSource)
    return {
      source: buildSourceSummary(refreshedSource, indexedChunks),
      chunks: indexedChunks,
    }
  } catch (error) {
    // Record the failure on the source so the UI/hygiene scan can surface it,
    // then re-throw so the caller sees the original error.
    const message = error instanceof Error ? error.message : 'Knowledge sync failed'
    const failed = coerceSource({
      ...loading,
      syncStatus: 'error',
      lastError: message,
      updatedAt: Date.now(),
    })
    upsertKnowledgeSource(failed.id, failed)
    throw error
  }
}
 880  
 881  export async function createKnowledgeSource(input: KnowledgeSourceInput): Promise<KnowledgeSourceDetail> {
 882    await ensureLegacyKnowledgeBackfill()
 883  
 884    const now = Date.now()
 885    const kind = normalizeKind(input.kind)
 886    const title = normalizeText(input.title)
 887      || (kind === 'file' && input.sourcePath ? deriveKnowledgeTitle(path.basename(input.sourcePath)) : '')
 888      || (kind === 'url' && input.sourceUrl ? sourceTitleFromUrl(input.sourceUrl) : '')
 889      || 'Knowledge Source'
 890  
 891    const source: KnowledgeSource = coerceSource({
 892      id: genId(8),
 893      kind,
 894      title,
 895      content: typeof input.content === 'string' ? input.content : null,
 896      sourceLabel: normalizeOptionalText(input.sourceLabel),
 897      sourceUrl: normalizeOptionalText(input.sourceUrl),
 898      sourcePath: normalizeOptionalText(input.sourcePath),
 899      sourceHash: null,
 900      scope: normalizeScope(input.scope),
 901      agentIds: normalizeAgentIds(input.agentIds),
 902      tags: normalizeTags(input.tags),
 903      syncStatus: 'syncing',
 904      lastIndexedAt: null,
 905      lastSyncedAt: null,
 906      lastError: null,
 907      chunkCount: 0,
 908      contentLength: 0,
 909      createdAt: now,
 910      updatedAt: now,
 911      metadata: input.metadata,
 912    })
 913  
 914    upsertKnowledgeSource(source.id, source)
 915    return syncSourceRecord(source, { overrideContent: input.content, forceRewrite: true })
 916  }
 917  
 918  export async function updateKnowledgeSource(
 919    id: string,
 920    input: KnowledgeSourceInput,
 921  ): Promise<KnowledgeSourceDetail | null> {
 922    await ensureLegacyKnowledgeBackfill()
 923    const existing = loadKnowledgeSource(id)
 924    if (!existing) return null
 925  
 926    const normalizedExisting = coerceSource(existing)
 927    const next: KnowledgeSource = coerceSource({
 928      ...normalizedExisting,
 929      kind: normalizeKind(input.kind ?? normalizedExisting.kind),
 930      title: normalizeText(input.title) || normalizedExisting.title,
 931      content: typeof input.content === 'string' ? input.content : normalizedExisting.content,
 932      sourceLabel: input.sourceLabel !== undefined ? normalizeOptionalText(input.sourceLabel) : normalizedExisting.sourceLabel,
 933      sourceUrl: input.sourceUrl !== undefined ? normalizeOptionalText(input.sourceUrl) : normalizedExisting.sourceUrl,
 934      sourcePath: input.sourcePath !== undefined ? normalizeOptionalText(input.sourcePath) : normalizedExisting.sourcePath,
 935      scope: normalizeScope(input.scope ?? normalizedExisting.scope),
 936      agentIds: normalizeAgentIds(input.agentIds ?? normalizedExisting.agentIds),
 937      tags: normalizeTags(input.tags ?? normalizedExisting.tags),
 938      metadata: input.metadata ? { ...(normalizedExisting.metadata || {}), ...input.metadata } : normalizedExisting.metadata,
 939      updatedAt: Date.now(),
 940    })
 941  
 942    upsertKnowledgeSource(next.id, next)
 943    return syncSourceRecord(next, { overrideContent: input.content, forceRewrite: true })
 944  }
 945  
 946  export async function syncKnowledgeSource(id: string): Promise<KnowledgeSourceDetail | null> {
 947    await ensureLegacyKnowledgeBackfill()
 948    const existing = loadKnowledgeSource(id)
 949    if (!existing) return null
 950    return syncSourceRecord(coerceSource(existing))
 951  }
 952  
 953  export async function deleteKnowledgeSource(id: string): Promise<boolean> {
 954    await ensureLegacyKnowledgeBackfill()
 955    const existing = loadKnowledgeSource(id)
 956    if (!existing) return false
 957  
 958    for (const chunk of getMemoryDb().listKnowledgeSourceChunks(id)) {
 959      getMemoryDb().delete(chunk.id)
 960    }
 961    deleteKnowledgeSourceRecord(id)
 962    return true
 963  }
 964  
 965  function recordMaintenanceAction(action: KnowledgeHygieneAction): void {
 966    maintenanceHistory = [action, ...maintenanceHistory].slice(0, 48)
 967  }
 968  
 969  function upsertSourceLifecycle(id: string, updater: (source: KnowledgeSource) => KnowledgeSource): KnowledgeSource | null {
 970    const updated = patchKnowledgeSource(id, (current) => {
 971      if (!current) return null
 972      return coerceSource(updater(coerceSource(current)))
 973    })
 974    return updated ? coerceSource(updated) : null
 975  }
 976  
 977  export async function archiveKnowledgeSource(
 978    id: string,
 979    input?: { reason?: string | null; duplicateOfSourceId?: string | null; supersededBySourceId?: string | null },
 980  ): Promise<KnowledgeSourceDetail | null> {
 981    await ensureLegacyKnowledgeBackfill()
 982    const updated = upsertSourceLifecycle(id, (source) => ({
 983      ...source,
 984      archivedAt: source.archivedAt || Date.now(),
 985      archivedReason: normalizeOptionalText(input?.reason) || source.archivedReason || 'archived',
 986      duplicateOfSourceId: normalizeOptionalText(input?.duplicateOfSourceId) || source.duplicateOfSourceId || null,
 987      supersededBySourceId: normalizeOptionalText(input?.supersededBySourceId) || source.supersededBySourceId || null,
 988      maintenanceUpdatedAt: Date.now(),
 989      maintenanceNotes: normalizeOptionalText(input?.reason) || source.maintenanceNotes || null,
 990      updatedAt: Date.now(),
 991    }))
 992    if (!updated) return null
 993    recordMaintenanceAction({
 994      kind: 'archive',
 995      sourceId: updated.id,
 996      relatedSourceId: updated.duplicateOfSourceId || updated.supersededBySourceId || null,
 997      summary: `Archived ${updated.title}`,
 998      createdAt: Date.now(),
 999    })
1000    return getKnowledgeSourceDetail(updated.id)
1001  }
1002  
1003  export async function restoreKnowledgeSource(id: string): Promise<KnowledgeSourceDetail | null> {
1004    await ensureLegacyKnowledgeBackfill()
1005    const updated = upsertSourceLifecycle(id, (source) => ({
1006      ...source,
1007      archivedAt: null,
1008      archivedReason: null,
1009      duplicateOfSourceId: null,
1010      supersededBySourceId: null,
1011      maintenanceUpdatedAt: Date.now(),
1012      maintenanceNotes: 'restored',
1013      updatedAt: Date.now(),
1014    }))
1015    if (!updated) return null
1016    recordMaintenanceAction({
1017      kind: 'restore',
1018      sourceId: updated.id,
1019      summary: `Restored ${updated.title}`,
1020      createdAt: Date.now(),
1021    })
1022    return getKnowledgeSourceDetail(updated.id)
1023  }
1024  
1025  export async function supersedeKnowledgeSource(
1026    id: string,
1027    supersededBySourceId: string,
1028  ): Promise<KnowledgeSourceDetail | null> {
1029    await ensureLegacyKnowledgeBackfill()
1030    const target = loadKnowledgeSource(supersededBySourceId)
1031    if (!target) throw new Error('Superseding source not found.')
1032    const updated = upsertSourceLifecycle(id, (source) => ({
1033      ...source,
1034      supersededBySourceId,
1035      archivedAt: source.archivedAt || Date.now(),
1036      archivedReason: source.archivedReason || 'superseded',
1037      maintenanceUpdatedAt: Date.now(),
1038      maintenanceNotes: `Superseded by ${supersededBySourceId}`,
1039      updatedAt: Date.now(),
1040    }))
1041    if (!updated) return null
1042    recordMaintenanceAction({
1043      kind: 'supersede',
1044      sourceId: updated.id,
1045      relatedSourceId: supersededBySourceId,
1046      summary: `Marked ${updated.title} as superseded`,
1047      createdAt: Date.now(),
1048    })
1049    return getKnowledgeSourceDetail(updated.id)
1050  }
1051  
1052  function sameSourceOrigin(left: KnowledgeSource, right: KnowledgeSource): boolean {
1053    if (left.id === right.id) return false
1054    if (left.sourceUrl && right.sourceUrl) return left.sourceUrl === right.sourceUrl
1055    if (left.sourcePath && right.sourcePath) return left.sourcePath === right.sourcePath
1056    return false
1057  }
1058  
1059  function duplicateOriginFingerprint(source: KnowledgeSource): string {
1060    if (source.sourceUrl) return `url:${source.sourceUrl}`
1061    if (source.sourcePath) return `path:${source.sourcePath}`
1062    return `kind:${source.kind}`
1063  }
1064  
1065  function duplicateGroupKey(source: KnowledgeSource): string | null {
1066    if (!source.sourceHash) return null
1067    const sortedAgentIds = [...source.agentIds].sort()
1068    const sortedTags = [...source.tags].map((tag) => tag.toLowerCase()).sort()
1069    return [
1070      source.sourceHash,
1071      source.kind,
1072      source.scope,
1073      sortedAgentIds.join(','),
1074      sortedTags.join(','),
1075      duplicateOriginFingerprint(source),
1076    ].join('|')
1077  }
1078  
1079  function collectDuplicateGroups(sources: KnowledgeSource[]): Map<string, KnowledgeSource[]> {
1080    const duplicateGroups = new Map<string, KnowledgeSource[]>()
1081    for (const source of sources) {
1082      const groupKey = duplicateGroupKey(source)
1083      if (!groupKey) continue
1084      const group = duplicateGroups.get(groupKey) || []
1085      group.push(source)
1086      duplicateGroups.set(groupKey, group)
1087    }
1088    return duplicateGroups
1089  }
1090  
1091  function canonicalSourceForGroup(group: KnowledgeSource[]): KnowledgeSource {
1092    return [...group].sort((left, right) => {
1093      const archiveDelta = Number(sourceIsExcludedByDefault(left)) - Number(sourceIsExcludedByDefault(right))
1094      if (archiveDelta !== 0) return archiveDelta
1095      const indexedDelta = (right.lastIndexedAt || 0) - (left.lastIndexedAt || 0)
1096      if (indexedDelta !== 0) return indexedDelta
1097      return left.createdAt - right.createdAt
1098    })[0]
1099  }
1100  
1101  function buildHygieneSummary(sources: KnowledgeSource[]): KnowledgeHygieneSummary {
1102    const scannedAt = Date.now()
1103    const findings: KnowledgeHygieneFinding[] = []
1104    const pushFinding = (finding: KnowledgeHygieneFinding) => {
1105      if (findings.length < MAX_HYGIENE_FINDINGS) findings.push(finding)
1106    }
1107    const duplicateGroups = collectDuplicateGroups(sources)
1108  
1109    for (const source of sources) {
1110      if (sourceIsArchived(source)) {
1111        pushFinding({
1112          kind: 'archived',
1113          sourceId: source.id,
1114          sourceTitle: source.title,
1115          detail: source.archivedReason || 'Archived source',
1116          createdAt: source.archivedAt || source.updatedAt,
1117        })
1118      }
1119      if (sourceIsSuperseded(source)) {
1120        pushFinding({
1121          kind: 'superseded',
1122          sourceId: source.id,
1123          sourceTitle: source.title,
1124          relatedSourceId: source.supersededBySourceId || null,
1125          detail: `Superseded by ${source.supersededBySourceId}`,
1126          createdAt: source.updatedAt,
1127        })
1128      }
1129      if (source.syncStatus === 'error') {
1130        pushFinding({
1131          kind: 'broken',
1132          sourceId: source.id,
1133          sourceTitle: source.title,
1134          detail: source.lastError || 'Last sync failed',
1135          createdAt: source.updatedAt,
1136        })
1137      } else if (isStaleSource(source)) {
1138        pushFinding({
1139          kind: 'stale',
1140          sourceId: source.id,
1141          sourceTitle: source.title,
1142          detail: 'Source is due for re-sync',
1143          createdAt: source.updatedAt,
1144        })
1145      }
1146    }
1147  
1148    for (const group of duplicateGroups.values()) {
1149      if (group.length < 2) continue
1150      const canonical = canonicalSourceForGroup(group)
1151      for (const source of group) {
1152        if (source.id === canonical.id) continue
1153        pushFinding({
1154          kind: 'duplicate',
1155          sourceId: source.id,
1156          sourceTitle: source.title,
1157          relatedSourceId: canonical.id,
1158          relatedSourceTitle: canonical.title,
1159          detail: 'Exact duplicate content hash',
1160          createdAt: source.updatedAt,
1161        })
1162      }
1163    }
1164  
1165    const activeSources = sources.filter((source) => !sourceIsExcludedByDefault(source))
1166    for (let index = 0; index < activeSources.length; index += 1) {
1167      const left = activeSources[index]
1168      const leftBody = `${left.title}\n${left.content || ''}`
1169      if (!leftBody.trim()) continue
1170      for (let compareIndex = index + 1; compareIndex < activeSources.length; compareIndex += 1) {
1171        const right = activeSources[compareIndex]
1172        const rightBody = `${right.title}\n${right.content || ''}`
1173        if (!rightBody.trim()) continue
1174        if (sameSourceOrigin(left, right)) continue
1175        const overlap = jaccardSimilarity(leftBody, rightBody)
1176        if (overlap < 0.6) continue
1177        pushFinding({
1178          kind: 'overlap',
1179          sourceId: left.id,
1180          sourceTitle: left.title,
1181          relatedSourceId: right.id,
1182          relatedSourceTitle: right.title,
1183          detail: `High content overlap (${Math.round(overlap * 100)}%)`,
1184          createdAt: Math.max(left.updatedAt, right.updatedAt),
1185        })
1186      }
1187    }
1188  
1189    return {
1190      scannedAt,
1191      counts: {
1192        stale: findings.filter((finding) => finding.kind === 'stale').length,
1193        duplicate: findings.filter((finding) => finding.kind === 'duplicate').length,
1194        overlap: findings.filter((finding) => finding.kind === 'overlap').length,
1195        broken: findings.filter((finding) => finding.kind === 'broken').length,
1196        archived: findings.filter((finding) => finding.kind === 'archived').length,
1197        superseded: findings.filter((finding) => finding.kind === 'superseded').length,
1198      },
1199      findings,
1200      recentActions: [...maintenanceHistory],
1201    }
1202  }
1203  
1204  export async function getKnowledgeHygieneSummary(): Promise<KnowledgeHygieneSummary> {
1205    await ensureLegacyKnowledgeBackfill()
1206    registerKnowledgeMaintenanceIdleCallback()
1207    return buildHygieneSummary(listStoredSources())
1208  }
1209  
/**
 * Run one full hygiene-maintenance pass over the knowledge library:
 * 1. auto-resync stale or errored non-manual sources,
 * 2. archive exact duplicates in favor of each group's canonical source,
 * 3. mark older same-origin sources as superseded by the freshest one.
 * Returns a freshly built hygiene summary.
 *
 * Syncs run sequentially on purpose — each mutates shared storage, and later
 * steps depend on the updated records.
 */
export async function runKnowledgeHygieneMaintenance(): Promise<KnowledgeHygieneSummary> {
  await ensureLegacyKnowledgeBackfill()
  const sources = listStoredSources()
  const duplicateGroups = collectDuplicateGroups(sources)

  // Step 1: auto-sync. Manual sources are skipped (nothing to re-fetch).
  for (const source of sources) {
    if (sourceIsExcludedByDefault(source)) continue
    if (source.kind !== 'manual' && (isStaleSource(source) || source.syncStatus === 'error')) {
      try {
        const synced = await syncKnowledgeSource(source.id)
        if (synced?.source) {
          upsertSourceLifecycle(source.id, (current) => ({
            ...current,
            lastAutoSyncAt: Date.now(),
            maintenanceUpdatedAt: Date.now(),
            maintenanceNotes: 'auto-sync completed',
            updatedAt: Date.now(),
          }))
          recordMaintenanceAction({
            // Unchanged hash means a plain sync; a new hash means reindexing happened.
            kind: source.sourceHash === synced.source.sourceHash ? 'sync' : 'reindex',
            sourceId: source.id,
            summary: `Auto-synced ${synced.source.title}`,
            createdAt: Date.now(),
          })
        }
      } catch {
        // Keep the existing error state for manual review.
      }
    }
  }

  // Step 2: archive every non-canonical member of each exact-duplicate group.
  for (const group of duplicateGroups.values()) {
    if (group.length < 2) continue
    const canonical = canonicalSourceForGroup(group)
    for (const source of group) {
      if (source.id === canonical.id || sourceIsExcludedByDefault(source)) continue
      await archiveKnowledgeSource(source.id, {
        reason: 'duplicate',
        duplicateOfSourceId: canonical.id,
      })
    }
  }

  // Step 3: group remaining active sources by origin (URL or path); within
  // each group, mark records indexed earlier than the canonical as superseded.
  // Re-list so step-1/2 mutations are reflected.
  const refreshed = listStoredSources()
  const originGroups = new Map<string, KnowledgeSource[]>()
  for (const source of refreshed) {
    if (sourceIsExcludedByDefault(source)) continue
    const origin = source.sourceUrl || source.sourcePath || ''
    if (!origin) continue
    const group = originGroups.get(origin) || []
    group.push(source)
    originGroups.set(origin, group)
  }
  for (const group of originGroups.values()) {
    if (group.length < 2) continue
    const canonical = canonicalSourceForGroup(group)
    for (const source of group) {
      if (source.id === canonical.id || sourceIsSuperseded(source)) continue
      if ((source.lastIndexedAt || 0) >= (canonical.lastIndexedAt || 0)) continue
      await supersedeKnowledgeSource(source.id, canonical.id)
    }
  }

  return buildHygieneSummary(listStoredSources())
}
1275  
1276  export function registerKnowledgeMaintenanceIdleCallback(): void {
1277    if (maintenanceRegistered) return
1278    maintenanceRegistered = true
1279    onNextIdleWindow(async () => {
1280      maintenanceRegistered = false
1281      await runKnowledgeHygieneMaintenance()
1282      registerKnowledgeMaintenanceIdleCallback()
1283    })
1284  }