// knowledge-sources.ts
// Server-side knowledge-source management: normalizes and persists sources,
// chunks their content into the memory DB for retrieval, migrates legacy
// knowledge memories, serves search/citation selection, and handles the
// archive/restore lifecycle.
import { createHash } from 'crypto'
import path from 'path'

import { genId } from '@/lib/id'
import type {
  KnowledgeCitation,
  KnowledgeHygieneAction,
  KnowledgeHygieneFinding,
  KnowledgeHygieneSummary,
  KnowledgeSource,
  KnowledgeSourceDetail,
  KnowledgeSourceKind,
  KnowledgeSourceSummary,
  KnowledgeRetrievalTrace,
  KnowledgeSearchHit,
  MemoryEntry,
} from '@/types'
import {
  deleteKnowledgeSource as deleteKnowledgeSourceRecord,
  loadKnowledgeSource,
  loadKnowledgeSources,
  patchKnowledgeSource,
  upsertKnowledgeSource,
} from '@/lib/server/storage'
import { getMemoryDb } from '@/lib/server/memory/memory-db'
import {
  deriveKnowledgeTitle,
  extractKnowledgeTextFromFile,
  extractKnowledgeTextFromUrl,
} from '@/lib/server/knowledge-import'
import { onNextIdleWindow } from '@/lib/server/runtime/idle-window'

// Non-manual sources older than this since last index are considered stale (14 days).
const KNOWLEDGE_STALE_AFTER_MS = 1000 * 60 * 60 * 24 * 14
// Target chunk size and inter-chunk overlap, in characters, for indexing.
const CHUNK_TARGET_CHARS = 2200
const CHUNK_OVERLAP_CHARS = 320
// Cap on legacy 'knowledge' memory entries scanned during backfill.
const MAX_KNOWLEDGE_SCAN = 10_000
// NOTE(review): MAX_HYGIENE_FINDINGS, KnowledgeHygieneFinding/Summary, and
// onNextIdleWindow are unused in this portion of the file — presumably used by
// the maintenance code further down (see registerKnowledgeMaintenanceIdleCallback).
const MAX_HYGIENE_FINDINGS = 120
// Max hits folded into a grounding/retrieval trace.
const MAX_GROUNDING_HITS = 4

// Partial input accepted by create/update; all fields optional and normalized on use.
interface KnowledgeSourceInput {
  kind?: KnowledgeSourceKind
  title?: string
  content?: string | null
  tags?: string[]
  scope?: 'global' | 'agent'
  agentIds?: string[]
  sourceLabel?: string | null
  sourceUrl?: string | null
  sourcePath?: string | null
  metadata?: Record<string, unknown>
}

// One indexable chunk of a source's content; char offsets are into the
// normalized (CRLF→LF, trimmed) content.
interface IndexedChunk {
  title: string
  content: string
  chunkIndex: number
  chunkCount: number
  charStart: number
  charEnd: number
  sectionLabel?: string | null
}

// Module-level state: single-flight legacy backfill plus maintenance bookkeeping.
let backfillPromise: Promise<void> | null = null
let backfillComplete = false
// NOTE(review): maintenanceRegistered is not referenced in this portion of the
// file — presumably consumed by registerKnowledgeMaintenanceIdleCallback below.
let maintenanceRegistered = false
// Most-recent-first log of hygiene actions, capped at 48 entries.
let maintenanceHistory: KnowledgeHygieneAction[] = []

// Coerce any value to a trimmed string ('' for non-strings).
function normalizeText(value: unknown): string {
  return typeof value === 'string' ? value.trim() : ''
}

// Like normalizeText, but empty results become null.
function normalizeOptionalText(value: unknown): string | null {
  const trimmed = normalizeText(value)
  return trimmed || null
}

// Trim, drop non-strings/empties, and dedupe case-insensitively while
// preserving the first-seen casing and order.
function normalizeTags(tags: unknown): string[] {
  if (!Array.isArray(tags)) return []
  const seen = new Set<string>()
  const out: string[] = []
  for (const tag of tags) {
    if (typeof tag !== 'string') continue
    const trimmed = tag.trim()
    const key = trimmed.toLowerCase()
    if (!trimmed || seen.has(key)) continue
    seen.add(key)
    out.push(trimmed)
  }
  return out
}

// True when the filter is empty or any filter tag matches (case-insensitive).
function matchesTagFilter(sourceTags: string[], filterTags: string[]): boolean {
  if (filterTags.length === 0) return true
  const tagSet = new Set(sourceTags.map((tag) => tag.toLowerCase()))
  return filterTags.some((tag) => tagSet.has(tag.toLowerCase()))
}

// Trim, drop non-strings/empties, and dedupe (case-sensitive, order-preserving).
function normalizeAgentIds(agentIds: unknown): string[] {
  if (!Array.isArray(agentIds)) return []
  const seen = new Set<string>()
  const out: string[] = []
  for (const id of agentIds) {
    if (typeof id !== 'string') continue
    const trimmed = id.trim()
    if (!trimmed || seen.has(trimmed)) continue
    seen.add(trimmed)
    out.push(trimmed)
  }
  return out
}

// Anything other than the literal 'agent' falls back to 'global'.
function normalizeScope(scope: unknown): 'global' | 'agent' {
  return scope === 'agent' ? 'agent' : 'global'
}

// Anything other than 'file'/'url' falls back to 'manual'.
function normalizeKind(kind: unknown): KnowledgeSourceKind {
  return kind === 'file' || kind === 'url' ? kind : 'manual'
}

// Hex SHA-256 of the content, used for change detection between syncs.
function contentHash(content: string): string {
  return createHash('sha256').update(content).digest('hex')
}

// A source is stale when it errored, or is a non-manual source that was never
// indexed or was indexed longer ago than KNOWLEDGE_STALE_AFTER_MS.
// Archived/superseded sources are never reported stale.
function isStaleSource(source: KnowledgeSource): boolean {
  if (source.archivedAt || source.supersededBySourceId) return false
  if (source.syncStatus === 'error') return true
  if (source.kind === 'manual') return false
  const indexedAt = typeof source.lastIndexedAt === 'number' ? source.lastIndexedAt : 0
  if (!indexedAt) return true
  return (Date.now() - indexedAt) > KNOWLEDGE_STALE_AFTER_MS
}

// Normalize a (possibly partially-shaped or legacy) stored record into a fully
// populated KnowledgeSource, defaulting every field deterministically.
function coerceSource(source: KnowledgeSource): KnowledgeSource {
  const now = Date.now()
  return {
    id: source.id,
    kind: normalizeKind(source.kind),
    title: normalizeText(source.title) || 'Knowledge Source',
    content: typeof source.content === 'string' ? source.content : null,
    sourceLabel: normalizeOptionalText(source.sourceLabel),
    sourceUrl: normalizeOptionalText(source.sourceUrl),
    sourcePath: normalizeOptionalText(source.sourcePath),
    sourceHash: normalizeOptionalText(source.sourceHash),
    scope: normalizeScope(source.scope),
    agentIds: normalizeAgentIds(source.agentIds),
    tags: normalizeTags(source.tags),
    // Only 'syncing'/'error' survive; anything else becomes 'ready'.
    syncStatus: source.syncStatus === 'syncing' || source.syncStatus === 'error' ? source.syncStatus : 'ready',
    lastIndexedAt: typeof source.lastIndexedAt === 'number' ? source.lastIndexedAt : null,
    lastSyncedAt: typeof source.lastSyncedAt === 'number' ? source.lastSyncedAt : null,
    lastError: normalizeOptionalText(source.lastError),
    archivedAt: typeof source.archivedAt === 'number' ? source.archivedAt : null,
    archivedReason: normalizeOptionalText(source.archivedReason),
    duplicateOfSourceId: normalizeOptionalText(source.duplicateOfSourceId),
    supersededBySourceId: normalizeOptionalText(source.supersededBySourceId),
    maintenanceUpdatedAt: typeof source.maintenanceUpdatedAt === 'number' ? source.maintenanceUpdatedAt : null,
    maintenanceNotes: normalizeOptionalText(source.maintenanceNotes),
    nextSyncAt: typeof source.nextSyncAt === 'number' ? source.nextSyncAt : null,
    lastAutoSyncAt: typeof source.lastAutoSyncAt === 'number' ? source.lastAutoSyncAt : null,
    chunkCount: typeof source.chunkCount === 'number' ? source.chunkCount : 0,
    contentLength: typeof source.contentLength === 'number' ? source.contentLength : 0,
    createdAt: typeof source.createdAt === 'number' ? source.createdAt : now,
    updatedAt: typeof source.updatedAt === 'number' ? source.updatedAt : now,
    metadata: source.metadata && typeof source.metadata === 'object' ? source.metadata : undefined,
  }
}

function sourceIsArchived(source: KnowledgeSource): boolean {
  return typeof source.archivedAt === 'number' && source.archivedAt > 0
}

function sourceIsSuperseded(source: KnowledgeSource): boolean {
  return typeof source.supersededBySourceId === 'string' && source.supersededBySourceId.trim().length > 0
}

// Archived or superseded sources are hidden unless includeArchived is requested.
function sourceIsExcludedByDefault(source: KnowledgeSource): boolean {
  return sourceIsArchived(source) || sourceIsSuperseded(source)
}

// Global sources are visible to everyone; agent-scoped sources only to listed agents.
function sourceVisibleToAgent(source: KnowledgeSource, viewerAgentId?: string | null): boolean {
  if (source.scope === 'global') return true
  if (!viewerAgentId) return false
  return source.agentIds.includes(viewerAgentId)
}

// Lowercased, deduped alphanumeric tokens of length >= 3.
function cleanKnowledgeTokens(value: string): string[] {
  return Array.from(new Set(
    String(value || '')
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, ' ')
      .split(/\s+/)
      .map((token) => token.trim())
      .filter((token) => token.length >= 3),
  ))
}

// Fraction of `left`'s unique tokens that also appear in `right` (0..1).
function tokenOverlapScore(left: string, right: string): number {
  const leftTokens = cleanKnowledgeTokens(left)
  const rightSet = new Set(cleanKnowledgeTokens(right))
  if (leftTokens.length === 0 || rightSet.size === 0) return 0
  let matches = 0
  for (const token of leftTokens) {
    if (rightSet.has(token)) matches += 1
  }
  return matches / Math.max(leftTokens.length, 1)
}

// Classic Jaccard similarity (|intersection| / |union|) over cleaned tokens.
function jaccardSimilarity(left: string, right: string): number {
  const leftSet = new Set(cleanKnowledgeTokens(left))
  const rightSet = new Set(cleanKnowledgeTokens(right))
  if (leftSet.size === 0 || rightSet.size === 0) return 0
  let intersection = 0
  for (const token of leftSet) {
    if (rightSet.has(token)) intersection += 1
  }
  const union = leftSet.size + rightSet.size - intersection
  return union > 0 ? intersection / union : 0
}

// Human-readable explanation of why a chunk matched a query (up to 4 shared terms,
// else the section name, else a generic relevance note).
function whyMatched(query: string, title: string, content: string, sectionLabel?: string | null): string {
  const queryTokens = cleanKnowledgeTokens(query)
  const contentText = `${title}\n${sectionLabel || ''}\n${content}`
  const contentTokens = new Set(cleanKnowledgeTokens(contentText))
  const matched = queryTokens.filter((token) => contentTokens.has(token))
  if (matched.length > 0) {
    const head = matched.slice(0, 4).join(', ')
    return `Matched query terms: ${head}${matched.length > 4 ? ', ...' : ''}`
  }
  if (sectionLabel?.trim()) return `Matched the ${sectionLabel.trim()} section`
  return 'Retrieved as a high-relevance knowledge chunk'
}

// Project a search hit down to the citation shape exposed to callers.
function toCitation(hit: KnowledgeSearchHit): KnowledgeCitation {
  return {
    sourceId: hit.sourceId,
    sourceTitle: hit.sourceTitle,
    sourceKind: hit.sourceKind,
    sourceUrl: hit.sourceUrl || null,
    sourceLabel: hit.sourceLabel || null,
    chunkId: hit.id,
    chunkIndex: hit.chunkIndex,
    chunkCount: hit.chunkCount,
    charStart: hit.charStart,
    charEnd: hit.charEnd,
    sectionLabel: hit.sectionLabel || null,
    snippet: hit.snippet,
    whyMatched: hit.whyMatched || null,
    score: hit.score,
  }
}

// All stored sources, normalized and sorted most-recently-updated first.
function listStoredSources(): KnowledgeSource[] {
  return Object.values(loadKnowledgeSources())
    .map((source) => coerceSource(source))
    .sort((left, right) => right.updatedAt - left.updatedAt)
}

// Title for a URL source: derived from the path's leaf file name when present,
// otherwise the hostname; unparseable input falls back to the raw URL string.
function sourceTitleFromUrl(sourceUrl: string): string {
  try {
    const parsed = new URL(sourceUrl)
    const leaf = path.basename(parsed.pathname || '')
    return leaf ? deriveKnowledgeTitle(leaf) : parsed.hostname
  } catch {
    return sourceUrl
  }
}

// Hostname of the URL, or null when unparseable/empty.
function sourceLabelFromUrl(sourceUrl: string): string | null {
  try {
    const parsed = new URL(sourceUrl)
    return parsed.hostname || null
  } catch {
    return null
  }
}

// First Markdown ATX heading (`# ...` .. `###### ...`) found in the text, if any.
function headingLabel(text: string): string | null {
  const match = text.match(/^#{1,6}\s+(.+)$/m)
  return match?.[1]?.trim() || null
}

// Whitespace-collapsed snippet. Without a query: first 180 chars. With a query:
// a window around the earliest query-token match (80 chars before, 220 after),
// with ellipses marking truncation on either side.
function previewSnippet(content: string, query?: string): string {
  const normalized = String(content || '').replace(/\s+/g, ' ').trim()
  if (!normalized) return ''
  if (!query) return normalized.slice(0, 180)

  const queryTokens = Array.from(new Set(
    query
      .toLowerCase()
      .split(/\s+/)
      .map((token) => token.trim())
      .filter((token) => token.length >= 3),
  ))

  const lower = normalized.toLowerCase()
  let matchIndex = -1
  for (const token of queryTokens) {
    const idx = lower.indexOf(token)
    if (idx !== -1 && (matchIndex === -1 || idx < matchIndex)) {
      matchIndex = idx
    }
  }

  if (matchIndex === -1) return normalized.slice(0, 180)
  const start = Math.max(0, matchIndex - 80)
  const end = Math.min(normalized.length, matchIndex + 220)
  const prefix = start > 0 ? '…' : ''
  const suffix = end < normalized.length ? '…' : ''
  return `${prefix}${normalized.slice(start, end)}${suffix}`
}

// Split normalized content on blank-line boundaries into trimmed paragraphs,
// carrying the most recent Markdown heading forward as each paragraph's
// sectionLabel. start/end are offsets into the normalized content, adjusted
// past leading/trailing whitespace.
function splitParagraphs(content: string): Array<{
  text: string
  start: number
  end: number
  sectionLabel: string | null
}> {
  const normalized = content.replace(/\r\n/g, '\n').trim()
  if (!normalized) return []

  const paragraphs: Array<{ text: string; start: number; end: number; sectionLabel: string | null }> = []
  let cursor = 0
  let lastSection: string | null = null
  const breakRegex = /\n{2,}/g

  const pushParagraph = (rawStart: number, rawEnd: number) => {
    const raw = normalized.slice(rawStart, rawEnd)
    const leadingWhitespace = raw.match(/^\s*/)?.[0].length || 0
    const trailingWhitespace = raw.match(/\s*$/)?.[0].length || 0
    const text = raw.trim()
    if (!text) return
    const sectionLabel = headingLabel(text)
    if (sectionLabel) lastSection = sectionLabel
    paragraphs.push({
      text,
      start: rawStart + leadingWhitespace,
      end: rawEnd - trailingWhitespace,
      sectionLabel: lastSection,
    })
  }

  for (const match of normalized.matchAll(breakRegex)) {
    const boundary = match.index ?? 0
    pushParagraph(cursor, boundary)
    cursor = boundary + match[0].length
  }
  // Flush the final paragraph after the last blank-line break.
  pushParagraph(cursor, normalized.length)
  return paragraphs
}

// Slice a paragraph longer than CHUNK_TARGET_CHARS into overlapping windows,
// preferring to break at a space (when that keeps at least ~400 chars in the
// window). chunkIndex/chunkCount are filled in later by chunkKnowledgeContent.
function splitOversizedParagraph(
  paragraph: { text: string; start: number; end: number; sectionLabel: string | null },
  sourceTitle: string,
): IndexedChunk[] {
  const chunks: IndexedChunk[] = []
  let cursor = 0

  while (cursor < paragraph.text.length) {
    let end = Math.min(paragraph.text.length, cursor + CHUNK_TARGET_CHARS)
    if (end < paragraph.text.length) {
      const boundary = paragraph.text.lastIndexOf(' ', end)
      if (boundary > cursor + 400) end = boundary
    }

    const raw = paragraph.text.slice(cursor, end)
    const leadingWhitespace = raw.match(/^\s*/)?.[0].length || 0
    const trailingWhitespace = raw.match(/\s*$/)?.[0].length || 0
    const content = raw.trim()
    if (content) {
      const relativeStart = cursor + leadingWhitespace
      const relativeEnd = end - trailingWhitespace
      chunks.push({
        title: paragraph.sectionLabel ? `${sourceTitle} · ${paragraph.sectionLabel}` : sourceTitle,
        content,
        chunkIndex: 0,
        chunkCount: 0,
        charStart: paragraph.start + relativeStart,
        charEnd: paragraph.start + relativeEnd,
        sectionLabel: paragraph.sectionLabel,
      })
    }

    if (end >= paragraph.text.length) break
    // Step back for overlap; Math.max guarantees forward progress.
    cursor = Math.max(cursor + 1, end - CHUNK_OVERLAP_CHARS)
  }

  return chunks
}

// Chunk source content for indexing: greedily pack consecutive paragraphs up to
// CHUNK_TARGET_CHARS, route oversized paragraphs through splitOversizedParagraph,
// and restart each new chunk a few paragraphs back to create overlap. Finally
// stamps chunkIndex/chunkCount on every chunk.
function chunkKnowledgeContent(sourceTitle: string, content: string): IndexedChunk[] {
  const normalized = content.replace(/\r\n/g, '\n').trim()
  if (!normalized) return []

  const paragraphs = splitParagraphs(normalized)
  if (paragraphs.length === 0) return []

  const chunks: IndexedChunk[] = []
  let index = 0

  while (index < paragraphs.length) {
    const firstIndex = index
    const first = paragraphs[index]

    if (first.text.length > CHUNK_TARGET_CHARS) {
      chunks.push(...splitOversizedParagraph(first, sourceTitle))
      index += 1
      continue
    }

    let combined = first.text
    const charStart = first.start
    let charEnd = first.end
    let sectionLabel = first.sectionLabel
    let nextIndex = index + 1

    // Greedily append following paragraphs while the combined chunk stays small enough.
    while (nextIndex < paragraphs.length) {
      const nextParagraph = paragraphs[nextIndex]
      if (nextParagraph.text.length > CHUNK_TARGET_CHARS) break
      const candidate = `${combined}\n\n${nextParagraph.text}`
      if (candidate.length > CHUNK_TARGET_CHARS) break
      combined = candidate
      charEnd = nextParagraph.end
      sectionLabel = sectionLabel || nextParagraph.sectionLabel
      nextIndex += 1
    }

    chunks.push({
      title: sectionLabel ? `${sourceTitle} · ${sectionLabel}` : sourceTitle,
      content: combined,
      chunkIndex: 0,
      chunkCount: 0,
      charStart,
      charEnd,
      sectionLabel,
    })

    if (nextIndex >= paragraphs.length) break

    // Walk back from the next paragraph until at least CHUNK_OVERLAP_CHARS of
    // already-emitted text will be repeated; never restart before firstIndex + 1.
    let overlapChars = 0
    let overlapStart = nextIndex
    for (let back = nextIndex - 1; back > firstIndex; back--) {
      overlapChars += paragraphs[back].text.length
      overlapStart = back
      if (overlapChars >= CHUNK_OVERLAP_CHARS) break
    }
    index = Math.max(firstIndex + 1, overlapStart)
  }

  const chunkCount = chunks.length
  return chunks.map((chunk, chunkIndex) => ({
    ...chunk,
    chunkIndex,
    chunkCount,
  }))
}

// Safe accessor for a memory entry's metadata object ({} when absent/non-object).
function memorySourceMeta(entry: MemoryEntry): Record<string, unknown> {
  return entry.metadata && typeof entry.metadata === 'object'
    ? entry.metadata as Record<string, unknown>
    : {}
}

// Augment a source with derived summary fields (staleness + preview snippet,
// preferring the stored content over the first indexed chunk).
function buildSourceSummary(source: KnowledgeSource, chunks?: MemoryEntry[]): KnowledgeSourceSummary {
  const firstChunk = chunks?.[0] || null
  const preview = typeof source.content === 'string' && source.content.trim()
    ? source.content
    : firstChunk?.content || ''

  return {
    ...source,
    stale: isStaleSource(source),
    topSnippet: preview ? previewSnippet(preview) : null,
  }
}

// Combine a source record with one of its indexed memory-DB chunks into a
// search hit, recovering chunk geometry from the entry's metadata.
function buildSearchHit(source: KnowledgeSource, entry: MemoryEntry, score: number, query: string): KnowledgeSearchHit {
  const metadata = memorySourceMeta(entry)
  return {
    id: entry.id,
    sourceId: source.id,
    sourceTitle: source.title,
    sourceKind: source.kind,
    sourceUrl: source.sourceUrl || null,
    sourceLabel: source.sourceLabel || null,
    scope: source.scope,
    agentIds: source.agentIds,
    tags: source.tags,
    syncStatus: source.syncStatus,
    stale: isStaleSource(source),
    title: entry.title || source.title,
    snippet: previewSnippet(entry.content, query),
    content: entry.content,
    chunkIndex: typeof metadata.chunkIndex === 'number' ? metadata.chunkIndex : 0,
    chunkCount: typeof metadata.chunkCount === 'number' ? metadata.chunkCount : source.chunkCount,
    charStart: typeof metadata.charStart === 'number' ? metadata.charStart : 0,
    charEnd: typeof metadata.charEnd === 'number' ? metadata.charEnd : entry.content.length,
    sectionLabel: typeof metadata.sectionLabel === 'string' ? metadata.sectionLabel : null,
    score,
    whyMatched: whyMatched(query, entry.title || source.title, entry.content, typeof metadata.sectionLabel === 'string' ? metadata.sectionLabel : null),
    createdAt: entry.createdAt,
    updatedAt: entry.updatedAt,
  }
}

// Resolve the text to index for a source. Priority: explicit override content,
// then kind-specific resolution (manual => stored content; file => extract from
// sourcePath, falling back to stored content; url => fetch/extract, falling
// back to stored content when the URL is missing). Throws when nothing usable
// can be resolved.
async function resolveSourceContent(
  source: KnowledgeSource,
  overrideContent?: string | null,
): Promise<{ content: string; title: string; sourceLabel?: string | null }> {
  const inlineContent = typeof overrideContent === 'string' ? overrideContent.trim() : ''
  if (inlineContent) {
    // NOTE(review): returns the untrimmed overrideContent even though the
    // trimmed copy gated this branch — confirm that is intentional.
    return {
      content: overrideContent || '',
      title: source.title,
      sourceLabel: source.sourceLabel || null,
    }
  }

  if (source.kind === 'manual') {
    if (!source.content?.trim()) throw new Error('Content is required for manual knowledge.')
    return {
      content: source.content,
      title: source.title,
      sourceLabel: source.sourceLabel || null,
    }
  }

  if (source.kind === 'file') {
    if (source.sourcePath) {
      return {
        content: await extractKnowledgeTextFromFile(source.sourcePath, source.sourceLabel || source.title),
        title: source.title,
        sourceLabel: source.sourceLabel || path.basename(source.sourcePath),
      }
    }
    if (source.content?.trim()) {
      return {
        content: source.content,
        title: source.title,
        sourceLabel: source.sourceLabel || null,
      }
    }
    throw new Error('A file path or extracted content is required for file knowledge.')
  }

  // Remaining kind is 'url'.
  if (!source.sourceUrl) {
    if (source.content?.trim()) {
      return {
        content: source.content,
        title: source.title,
        sourceLabel: source.sourceLabel || null,
      }
    }
    throw new Error('A URL is required for URL knowledge.')
  }

  const extracted = await extractKnowledgeTextFromUrl(source.sourceUrl)
  return {
    content: extracted.content,
    title: source.title || extracted.title || sourceTitleFromUrl(source.sourceUrl),
    sourceLabel: source.sourceLabel || extracted.title || sourceLabelFromUrl(source.sourceUrl),
  }
}

// Memory-DB sharing list: only agent-scoped sources with a non-empty agent list
// restrict sharing; otherwise undefined (unrestricted).
function sharedWithForSource(source: KnowledgeSource): string[] | undefined {
  return source.scope === 'agent' && source.agentIds.length > 0 ? source.agentIds : undefined
}

// Metadata stored on each indexed chunk so search can reconstruct its origin.
function toChunkMetadata(source: KnowledgeSource, chunk: IndexedChunk): Record<string, unknown> {
  return {
    sourceId: source.id,
    sourceTitle: source.title,
    sourceKind: source.kind,
    sourceUrl: source.sourceUrl || null,
    sourceLabel: source.sourceLabel || null,
    tags: source.tags,
    scope: source.scope,
    agentIds: source.agentIds,
    chunkIndex: chunk.chunkIndex,
    chunkCount: chunk.chunkCount,
    charStart: chunk.charStart,
    charEnd: chunk.charEnd,
    sectionLabel: chunk.sectionLabel || null,
    indexedAt: Date.now(),
  }
}

// Delete every existing memory-DB chunk for the source, then insert the new
// chunks; returns the inserted entries.
function replaceSourceChunks(source: KnowledgeSource, chunks: IndexedChunk[]): MemoryEntry[] {
  const db = getMemoryDb()
  for (const existingChunk of db.listKnowledgeSourceChunks(source.id)) {
    db.delete(existingChunk.id)
  }

  return chunks.map((chunk) => db.add({
    agentId: null,
    sessionId: null,
    category: 'knowledge',
    title: chunk.title,
    content: chunk.content,
    metadata: toChunkMetadata(source, chunk),
    sharedWith: sharedWithForSource(source),
  }))
}

// One-time, single-flight migration: any legacy 'knowledge' memory entry that
// has no sourceId in its metadata is promoted to a standalone manual source
// (reusing the entry id) and the entry's metadata is rewritten to link back to
// it. Concurrent callers share the in-flight promise; on failure the promise is
// cleared so a later call retries.
async function ensureLegacyKnowledgeBackfill(): Promise<void> {
  if (backfillComplete) return
  if (backfillPromise) return backfillPromise
  backfillPromise = (async () => {
    const db = getMemoryDb()
    const entries = db.listByCategory('knowledge', undefined, MAX_KNOWLEDGE_SCAN)

    for (const entry of entries) {
      const metadata = memorySourceMeta(entry)
      const existingSourceId = typeof metadata.sourceId === 'string' ? metadata.sourceId.trim() : ''
      if (existingSourceId) continue // already migrated

      const scope = normalizeScope(metadata.scope)
      const agentIds = normalizeAgentIds(metadata.agentIds)
      const sourceId = entry.id
      const source = coerceSource({
        id: sourceId,
        kind: 'manual',
        title: entry.title || 'Knowledge Source',
        content: entry.content,
        sourceLabel: typeof metadata.source === 'string' ? metadata.source : null,
        sourceUrl: typeof metadata.sourceUrl === 'string' ? metadata.sourceUrl : null,
        sourcePath: typeof metadata.sourcePath === 'string' ? metadata.sourcePath : null,
        sourceHash: contentHash(entry.content || ''),
        scope,
        agentIds,
        tags: normalizeTags(metadata.tags),
        syncStatus: 'ready',
        lastIndexedAt: entry.updatedAt,
        lastSyncedAt: entry.updatedAt,
        chunkCount: 1,
        contentLength: entry.content.length,
        createdAt: entry.createdAt,
        updatedAt: entry.updatedAt,
        metadata: {
          legacyMemoryId: entry.id,
          migratedAt: Date.now(),
        },
      })

      upsertKnowledgeSource(sourceId, source)
      db.update(entry.id, {
        sharedWith: sharedWithForSource(source),
        metadata: {
          ...metadata,
          sourceId,
          sourceTitle: source.title,
          sourceKind: source.kind,
          sourceLabel: source.sourceLabel,
          sourceUrl: source.sourceUrl,
          tags: source.tags,
          scope: source.scope,
          agentIds: source.agentIds,
          chunkIndex: typeof metadata.chunkIndex === 'number' ? metadata.chunkIndex : 0,
          chunkCount: typeof metadata.chunkCount === 'number' ? metadata.chunkCount : 1,
          charStart: typeof metadata.charStart === 'number' ? metadata.charStart : 0,
          charEnd: typeof metadata.charEnd === 'number' ? metadata.charEnd : entry.content.length,
          sectionLabel: typeof metadata.sectionLabel === 'string' ? metadata.sectionLabel : null,
          indexedAt: typeof metadata.indexedAt === 'number' ? metadata.indexedAt : entry.updatedAt,
        },
      })
    }
    backfillComplete = true
  })().finally(() => {
    backfillPromise = null
  })

  return backfillPromise
}

/**
 * List knowledge-source summaries, newest-updated first.
 *
 * Runs the legacy backfill and schedules idle maintenance first. Archived or
 * superseded sources are excluded unless `includeArchived` is true; `tags`
 * filters case-insensitively; `limit` is clamped to 1..500 (default 200).
 */
export async function listKnowledgeSourceSummaries(options?: {
  tags?: string[]
  limit?: number
  includeArchived?: boolean
}): Promise<KnowledgeSourceSummary[]> {
  await ensureLegacyKnowledgeBackfill()
  // NOTE(review): defined later in this file (not visible in this chunk).
  registerKnowledgeMaintenanceIdleCallback()
  const tagFilter = normalizeTags(options?.tags)
  const limit = Math.max(1, Math.min(500, Math.trunc(options?.limit || 200)))
  const includeArchived = options?.includeArchived === true

  const sources = listStoredSources()
    .filter((source) => includeArchived || !sourceIsExcludedByDefault(source))
    .filter((source) => matchesTagFilter(source.tags, tagFilter))
    .slice(0, limit)

  return sources.map((source) => buildSourceSummary(source))
}

/**
 * Full-text search over indexed knowledge chunks.
 *
 * Memory-DB matches are joined to their owning source and filtered by archive
 * status, viewer visibility, and tags. The score assigned to each accepted hit
 * decays with its position: 1 - acceptedSoFar / totalMatches (floored at 0).
 * `limit` is clamped to 1..500 (default 50).
 */
export async function searchKnowledgeHits(options: {
  query: string
  tags?: string[]
  limit?: number
  includeArchived?: boolean
  viewerAgentId?: string | null
}): Promise<KnowledgeSearchHit[]> {
  await ensureLegacyKnowledgeBackfill()
  registerKnowledgeMaintenanceIdleCallback()
  const query = normalizeText(options.query)
  if (!query) return []

  const tagFilter = normalizeTags(options.tags)
  const limit = Math.max(1, Math.min(500, Math.trunc(options.limit || 50)))
  const includeArchived = options.includeArchived === true
  const viewerAgentId = typeof options.viewerAgentId === 'string' ? options.viewerAgentId.trim() : ''
  const sourceMap = new Map(listStoredSources().map((source) => [source.id, source] as const))
  const matches = getMemoryDb().search(query)
    .filter((entry) => entry.category === 'knowledge')

  const hits: KnowledgeSearchHit[] = []
  for (const entry of matches) {
    const metadata = memorySourceMeta(entry)
    const sourceId = typeof metadata.sourceId === 'string' ? metadata.sourceId : ''
    const source = sourceMap.get(sourceId)
    if (!source) continue // orphaned chunk; skip
    if (!includeArchived && sourceIsExcludedByDefault(source)) continue
    if (viewerAgentId && !sourceVisibleToAgent(source, viewerAgentId)) continue
    if (!matchesTagFilter(source.tags, tagFilter)) continue
    hits.push(buildSearchHit(source, entry, Math.max(0, 1 - hits.length / Math.max(matches.length, 1)), query))
    if (hits.length >= limit) break
  }

  return hits
}

/**
 * Load one source plus all of its indexed chunks; null when it does not exist.
 */
export async function getKnowledgeSourceDetail(id: string): Promise<KnowledgeSourceDetail | null> {
  await ensureLegacyKnowledgeBackfill()
  const source = loadKnowledgeSource(id)
  if (!source) return null
  const normalized = coerceSource(source)
  const chunks = getMemoryDb().listKnowledgeSourceChunks(id)
  return {
    source: buildSourceSummary(normalized, chunks),
    chunks,
  }
}

/**
 * Build a retrieval trace (query + citation-shaped hits) for grounding a
 * response. Hits are capped at MAX_GROUNDING_HITS; returns null when the
 * search produced nothing. selectorStatus starts as 'not_run' until
 * selectKnowledgeCitations processes the trace.
 */
export async function buildKnowledgeRetrievalTrace(options: {
  query: string
  viewerAgentId?: string | null
  limit?: number
}): Promise<KnowledgeRetrievalTrace | null> {
  const hits = await searchKnowledgeHits({
    query: options.query,
    limit: Math.max(1, Math.min(MAX_GROUNDING_HITS, Math.trunc(options.limit || MAX_GROUNDING_HITS))),
    viewerAgentId: options.viewerAgentId || null,
  })
  if (hits.length === 0) return null
  return {
    query: normalizeText(options.query),
    scope: 'source_knowledge',
    hits: hits.map(toCitation),
    retrievedAt: Date.now(),
    selectorStatus: 'not_run',
  }
}

/**
 * Pick which retrieved hits actually support a generated response.
 *
 * Hits are ranked by token overlap between the response text and the hit's
 * title/section/snippet (ties broken by retrieval score), then kept when their
 * overlap is >= 0.08, or — for the top-ranked hit only — when its retrieval
 * score is >= 0.7. `limit` is clamped to 1..4 (default 3). The returned trace
 * carries selectorStatus 'selected'/'no_match' accordingly.
 */
export function selectKnowledgeCitations(params: {
  responseText: string
  retrievalTrace?: KnowledgeRetrievalTrace | null
  limit?: number
}): { citations: KnowledgeCitation[]; retrievalTrace: KnowledgeRetrievalTrace | null } {
  const trace = params.retrievalTrace
  if (!trace || !Array.isArray(trace.hits) || trace.hits.length === 0) {
    return { citations: [], retrievalTrace: trace || null }
  }

  const responseText = normalizeText(params.responseText)
  if (!responseText) {
    return {
      citations: [],
      retrievalTrace: { ...trace, selectorStatus: 'no_match' },
    }
  }

  const ranked = trace.hits
    .map((hit) => ({
      hit,
      overlap: tokenOverlapScore(responseText, `${hit.sourceTitle}\n${hit.sectionLabel || ''}\n${hit.snippet}`),
    }))
    .sort((left, right) => {
      const overlapDelta = right.overlap - left.overlap
      if (overlapDelta !== 0) return overlapDelta
      return right.hit.score - left.hit.score
    })

  const limit = Math.max(1, Math.min(4, Math.trunc(params.limit || 3)))
  const selected = ranked
    .filter((entry, index) => entry.overlap >= 0.08 || (entry.hit.score >= 0.7 && index === 0))
    .slice(0, limit)
    .map((entry) => entry.hit)

  return {
    citations: selected,
    retrievalTrace: {
      ...trace,
      selectorStatus: selected.length > 0 ? 'selected' : 'no_match',
    },
  }
}

// Core sync pipeline: mark the source 'syncing', resolve + chunk its content,
// then either rewrite the record and reindex every chunk (when forced, the
// title/label changed, the hash changed, or no chunks exist) or just refresh
// the record's sync bookkeeping. On failure, persist syncStatus 'error' with
// the message and rethrow.
async function syncSourceRecord(
  source: KnowledgeSource,
  options?: { overrideContent?: string | null; forceRewrite?: boolean },
): Promise<KnowledgeSourceDetail> {
  const loading = coerceSource({
    ...source,
    syncStatus: 'syncing',
    lastError: null,
    updatedAt: Date.now(),
  })
  upsertKnowledgeSource(loading.id, loading)

  try {
    const resolved = await resolveSourceContent(loading, options?.overrideContent)
    const chunks = chunkKnowledgeContent(resolved.title, resolved.content)
    if (chunks.length === 0) {
      throw new Error('No readable content was extracted for this source.')
    }

    const nextHash = contentHash(resolved.content)
    const metadataChanged = options?.forceRewrite === true
      || loading.title !== resolved.title
      || (loading.sourceLabel || null) !== (resolved.sourceLabel || null)

    let indexedChunks = getMemoryDb().listKnowledgeSourceChunks(loading.id)
    if (indexedChunks.length === 0 || metadataChanged || loading.sourceHash !== nextHash) {
      // Full rewrite: persist refreshed fields and replace all indexed chunks.
      const rewrittenSource = coerceSource({
        ...loading,
        title: resolved.title,
        content: resolved.content,
        sourceLabel: resolved.sourceLabel ?? loading.sourceLabel ?? null,
        sourceHash: nextHash,
        chunkCount: chunks.length,
        contentLength: resolved.content.length,
        syncStatus: 'ready',
        lastError: null,
        lastIndexedAt: Date.now(),
        lastSyncedAt: Date.now(),
        nextSyncAt: Date.now() + KNOWLEDGE_STALE_AFTER_MS,
        updatedAt: Date.now(),
      })
      upsertKnowledgeSource(rewrittenSource.id, rewrittenSource)
      indexedChunks = replaceSourceChunks(rewrittenSource, chunks)
      return {
        source: buildSourceSummary(rewrittenSource, indexedChunks),
        chunks: indexedChunks,
      }
    }

    // Content unchanged: refresh sync timestamps without touching the index.
    const refreshedSource = coerceSource({
      ...loading,
      content: resolved.content,
      sourceHash: nextHash,
      syncStatus: 'ready',
      lastError: null,
      lastSyncedAt: Date.now(),
      nextSyncAt: Date.now() + KNOWLEDGE_STALE_AFTER_MS,
      updatedAt: Date.now(),
    })
    upsertKnowledgeSource(refreshedSource.id, refreshedSource)
    return {
      source: buildSourceSummary(refreshedSource, indexedChunks),
      chunks: indexedChunks,
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Knowledge sync failed'
    const failed = coerceSource({
      ...loading,
      syncStatus: 'error',
      lastError: message,
      updatedAt: Date.now(),
    })
    upsertKnowledgeSource(failed.id, failed)
    throw error
  }
}

/**
 * Create and immediately sync a new knowledge source.
 *
 * The title falls back, in order, to: explicit title, file-name-derived title,
 * URL-derived title, then 'Knowledge Source'. Throws (via syncSourceRecord)
 * when no content can be resolved; the record persists with syncStatus 'error'.
 */
export async function createKnowledgeSource(input: KnowledgeSourceInput): Promise<KnowledgeSourceDetail> {
  await ensureLegacyKnowledgeBackfill()

  const now = Date.now()
  const kind = normalizeKind(input.kind)
  const title = normalizeText(input.title)
    || (kind === 'file' && input.sourcePath ? deriveKnowledgeTitle(path.basename(input.sourcePath)) : '')
    || (kind === 'url' && input.sourceUrl ? sourceTitleFromUrl(input.sourceUrl) : '')
    || 'Knowledge Source'

  const source: KnowledgeSource = coerceSource({
    id: genId(8),
    kind,
    title,
    content: typeof input.content === 'string' ? input.content : null,
    sourceLabel: normalizeOptionalText(input.sourceLabel),
    sourceUrl: normalizeOptionalText(input.sourceUrl),
    sourcePath: normalizeOptionalText(input.sourcePath),
    sourceHash: null,
    scope: normalizeScope(input.scope),
    agentIds: normalizeAgentIds(input.agentIds),
    tags: normalizeTags(input.tags),
    syncStatus: 'syncing',
    lastIndexedAt: null,
    lastSyncedAt: null,
    lastError: null,
    chunkCount: 0,
    contentLength: 0,
    createdAt: now,
    updatedAt: now,
    metadata: input.metadata,
  })

  upsertKnowledgeSource(source.id, source)
  return syncSourceRecord(source, { overrideContent: input.content, forceRewrite: true })
}

/**
 * Update an existing source and force a re-sync/re-index.
 *
 * Fields explicitly present in `input` (including explicit nulls for the
 * optional text fields) replace the stored values; absent fields are kept.
 * `metadata` is shallow-merged over the existing metadata. Returns null when
 * the source does not exist.
 */
export async function updateKnowledgeSource(
  id: string,
  input: KnowledgeSourceInput,
): Promise<KnowledgeSourceDetail | null> {
  await ensureLegacyKnowledgeBackfill()
  const existing = loadKnowledgeSource(id)
  if (!existing) return null

  const normalizedExisting = coerceSource(existing)
  const next: KnowledgeSource = coerceSource({
    ...normalizedExisting,
    kind: normalizeKind(input.kind ?? normalizedExisting.kind),
    title: normalizeText(input.title) || normalizedExisting.title,
    content: typeof input.content === 'string' ? input.content : normalizedExisting.content,
    sourceLabel: input.sourceLabel !== undefined ? normalizeOptionalText(input.sourceLabel) : normalizedExisting.sourceLabel,
    sourceUrl: input.sourceUrl !== undefined ? normalizeOptionalText(input.sourceUrl) : normalizedExisting.sourceUrl,
    sourcePath: input.sourcePath !== undefined ? normalizeOptionalText(input.sourcePath) : normalizedExisting.sourcePath,
    scope: normalizeScope(input.scope ?? normalizedExisting.scope),
    agentIds: normalizeAgentIds(input.agentIds ?? normalizedExisting.agentIds),
    tags: normalizeTags(input.tags ?? normalizedExisting.tags),
    metadata: input.metadata ? { ...(normalizedExisting.metadata || {}), ...input.metadata } : normalizedExisting.metadata,
    updatedAt: Date.now(),
  })

  upsertKnowledgeSource(next.id, next)
  return syncSourceRecord(next, { overrideContent: input.content, forceRewrite: true })
}

/**
 * Re-sync one source by id; null when it does not exist.
 */
export async function syncKnowledgeSource(id: string): Promise<KnowledgeSourceDetail | null> {
  await ensureLegacyKnowledgeBackfill()
  const existing = loadKnowledgeSource(id)
  if (!existing) return null
  return syncSourceRecord(coerceSource(existing))
}

/**
 * Delete a source and all of its indexed chunks. Returns false when the id is
 * unknown, true after deletion.
 */
export async function deleteKnowledgeSource(id: string): Promise<boolean> {
  await ensureLegacyKnowledgeBackfill()
  const existing = loadKnowledgeSource(id)
  if (!existing) return false

  for (const chunk of getMemoryDb().listKnowledgeSourceChunks(id)) {
    getMemoryDb().delete(chunk.id)
  }
  deleteKnowledgeSourceRecord(id)
  return true
}

// Prepend a hygiene action to the in-memory history, keeping at most 48.
function recordMaintenanceAction(action: KnowledgeHygieneAction): void {
  maintenanceHistory = [action, ...maintenanceHistory].slice(0, 48)
}

// Patch a stored source through `updater`, normalizing on the way in and out;
// null when the source does not exist.
function upsertSourceLifecycle(id: string, updater: (source: KnowledgeSource) => KnowledgeSource): KnowledgeSource | null {
  const updated = patchKnowledgeSource(id, (current) => {
    if (!current) return null
    return coerceSource(updater(coerceSource(current)))
  })
  return updated ? coerceSource(updated) : null
}

/**
 * Archive a source (idempotent: an existing archivedAt is kept), optionally
 * recording why and which source duplicates/supersedes it, then log the action
 * and return the refreshed detail. Null when the id is unknown.
 */
export async function archiveKnowledgeSource(
  id: string,
  input?: { reason?: string | null; duplicateOfSourceId?: string | null; supersededBySourceId?: string | null },
): Promise<KnowledgeSourceDetail | null> {
  await ensureLegacyKnowledgeBackfill()
  const updated = upsertSourceLifecycle(id, (source) => ({
    ...source,
    archivedAt: source.archivedAt || Date.now(),
    archivedReason: normalizeOptionalText(input?.reason) || source.archivedReason || 'archived',
    duplicateOfSourceId: normalizeOptionalText(input?.duplicateOfSourceId) || source.duplicateOfSourceId || null,
    supersededBySourceId: normalizeOptionalText(input?.supersededBySourceId) || source.supersededBySourceId || null,
    maintenanceUpdatedAt: Date.now(),
    maintenanceNotes: normalizeOptionalText(input?.reason) || source.maintenanceNotes || null,
    updatedAt: Date.now(),
  }))
  if (!updated) return null
  recordMaintenanceAction({
    kind: 'archive',
    sourceId: updated.id,
    relatedSourceId: updated.duplicateOfSourceId || updated.supersededBySourceId || null,
    summary: `Archived ${updated.title}`,
    createdAt: Date.now(),
  })
  return getKnowledgeSourceDetail(updated.id)
}

/**
 * Un-archive a source, clearing archive/duplicate/superseded markers, log the
 * action, and return the refreshed detail. Null when the id is unknown.
 */
export async function restoreKnowledgeSource(id: string): Promise<KnowledgeSourceDetail | null> {
  await ensureLegacyKnowledgeBackfill()
  const updated = upsertSourceLifecycle(id, (source) => ({
    ...source,
    archivedAt: null,
    archivedReason: null,
    duplicateOfSourceId: null,
    supersededBySourceId: null,
    maintenanceUpdatedAt: Date.now(),
    maintenanceNotes: 'restored',
    updatedAt: Date.now(),
  }))
  if (!updated) return null
  recordMaintenanceAction({
    kind: 'restore',
    sourceId: updated.id,
    summary: `Restored ${updated.title}`,
    createdAt: Date.now(),
  })
  return getKnowledgeSourceDetail(updated.id)
}

export async function
supersedeKnowledgeSource( 1026 id: string, 1027 supersededBySourceId: string, 1028 ): Promise<KnowledgeSourceDetail | null> { 1029 await ensureLegacyKnowledgeBackfill() 1030 const target = loadKnowledgeSource(supersededBySourceId) 1031 if (!target) throw new Error('Superseding source not found.') 1032 const updated = upsertSourceLifecycle(id, (source) => ({ 1033 ...source, 1034 supersededBySourceId, 1035 archivedAt: source.archivedAt || Date.now(), 1036 archivedReason: source.archivedReason || 'superseded', 1037 maintenanceUpdatedAt: Date.now(), 1038 maintenanceNotes: `Superseded by ${supersededBySourceId}`, 1039 updatedAt: Date.now(), 1040 })) 1041 if (!updated) return null 1042 recordMaintenanceAction({ 1043 kind: 'supersede', 1044 sourceId: updated.id, 1045 relatedSourceId: supersededBySourceId, 1046 summary: `Marked ${updated.title} as superseded`, 1047 createdAt: Date.now(), 1048 }) 1049 return getKnowledgeSourceDetail(updated.id) 1050 } 1051 1052 function sameSourceOrigin(left: KnowledgeSource, right: KnowledgeSource): boolean { 1053 if (left.id === right.id) return false 1054 if (left.sourceUrl && right.sourceUrl) return left.sourceUrl === right.sourceUrl 1055 if (left.sourcePath && right.sourcePath) return left.sourcePath === right.sourcePath 1056 return false 1057 } 1058 1059 function duplicateOriginFingerprint(source: KnowledgeSource): string { 1060 if (source.sourceUrl) return `url:${source.sourceUrl}` 1061 if (source.sourcePath) return `path:${source.sourcePath}` 1062 return `kind:${source.kind}` 1063 } 1064 1065 function duplicateGroupKey(source: KnowledgeSource): string | null { 1066 if (!source.sourceHash) return null 1067 const sortedAgentIds = [...source.agentIds].sort() 1068 const sortedTags = [...source.tags].map((tag) => tag.toLowerCase()).sort() 1069 return [ 1070 source.sourceHash, 1071 source.kind, 1072 source.scope, 1073 sortedAgentIds.join(','), 1074 sortedTags.join(','), 1075 duplicateOriginFingerprint(source), 1076 ].join('|') 1077 } 1078 
1079 function collectDuplicateGroups(sources: KnowledgeSource[]): Map<string, KnowledgeSource[]> { 1080 const duplicateGroups = new Map<string, KnowledgeSource[]>() 1081 for (const source of sources) { 1082 const groupKey = duplicateGroupKey(source) 1083 if (!groupKey) continue 1084 const group = duplicateGroups.get(groupKey) || [] 1085 group.push(source) 1086 duplicateGroups.set(groupKey, group) 1087 } 1088 return duplicateGroups 1089 } 1090 1091 function canonicalSourceForGroup(group: KnowledgeSource[]): KnowledgeSource { 1092 return [...group].sort((left, right) => { 1093 const archiveDelta = Number(sourceIsExcludedByDefault(left)) - Number(sourceIsExcludedByDefault(right)) 1094 if (archiveDelta !== 0) return archiveDelta 1095 const indexedDelta = (right.lastIndexedAt || 0) - (left.lastIndexedAt || 0) 1096 if (indexedDelta !== 0) return indexedDelta 1097 return left.createdAt - right.createdAt 1098 })[0] 1099 } 1100 1101 function buildHygieneSummary(sources: KnowledgeSource[]): KnowledgeHygieneSummary { 1102 const scannedAt = Date.now() 1103 const findings: KnowledgeHygieneFinding[] = [] 1104 const pushFinding = (finding: KnowledgeHygieneFinding) => { 1105 if (findings.length < MAX_HYGIENE_FINDINGS) findings.push(finding) 1106 } 1107 const duplicateGroups = collectDuplicateGroups(sources) 1108 1109 for (const source of sources) { 1110 if (sourceIsArchived(source)) { 1111 pushFinding({ 1112 kind: 'archived', 1113 sourceId: source.id, 1114 sourceTitle: source.title, 1115 detail: source.archivedReason || 'Archived source', 1116 createdAt: source.archivedAt || source.updatedAt, 1117 }) 1118 } 1119 if (sourceIsSuperseded(source)) { 1120 pushFinding({ 1121 kind: 'superseded', 1122 sourceId: source.id, 1123 sourceTitle: source.title, 1124 relatedSourceId: source.supersededBySourceId || null, 1125 detail: `Superseded by ${source.supersededBySourceId}`, 1126 createdAt: source.updatedAt, 1127 }) 1128 } 1129 if (source.syncStatus === 'error') { 1130 pushFinding({ 1131 
kind: 'broken', 1132 sourceId: source.id, 1133 sourceTitle: source.title, 1134 detail: source.lastError || 'Last sync failed', 1135 createdAt: source.updatedAt, 1136 }) 1137 } else if (isStaleSource(source)) { 1138 pushFinding({ 1139 kind: 'stale', 1140 sourceId: source.id, 1141 sourceTitle: source.title, 1142 detail: 'Source is due for re-sync', 1143 createdAt: source.updatedAt, 1144 }) 1145 } 1146 } 1147 1148 for (const group of duplicateGroups.values()) { 1149 if (group.length < 2) continue 1150 const canonical = canonicalSourceForGroup(group) 1151 for (const source of group) { 1152 if (source.id === canonical.id) continue 1153 pushFinding({ 1154 kind: 'duplicate', 1155 sourceId: source.id, 1156 sourceTitle: source.title, 1157 relatedSourceId: canonical.id, 1158 relatedSourceTitle: canonical.title, 1159 detail: 'Exact duplicate content hash', 1160 createdAt: source.updatedAt, 1161 }) 1162 } 1163 } 1164 1165 const activeSources = sources.filter((source) => !sourceIsExcludedByDefault(source)) 1166 for (let index = 0; index < activeSources.length; index += 1) { 1167 const left = activeSources[index] 1168 const leftBody = `${left.title}\n${left.content || ''}` 1169 if (!leftBody.trim()) continue 1170 for (let compareIndex = index + 1; compareIndex < activeSources.length; compareIndex += 1) { 1171 const right = activeSources[compareIndex] 1172 const rightBody = `${right.title}\n${right.content || ''}` 1173 if (!rightBody.trim()) continue 1174 if (sameSourceOrigin(left, right)) continue 1175 const overlap = jaccardSimilarity(leftBody, rightBody) 1176 if (overlap < 0.6) continue 1177 pushFinding({ 1178 kind: 'overlap', 1179 sourceId: left.id, 1180 sourceTitle: left.title, 1181 relatedSourceId: right.id, 1182 relatedSourceTitle: right.title, 1183 detail: `High content overlap (${Math.round(overlap * 100)}%)`, 1184 createdAt: Math.max(left.updatedAt, right.updatedAt), 1185 }) 1186 } 1187 } 1188 1189 return { 1190 scannedAt, 1191 counts: { 1192 stale: 
findings.filter((finding) => finding.kind === 'stale').length, 1193 duplicate: findings.filter((finding) => finding.kind === 'duplicate').length, 1194 overlap: findings.filter((finding) => finding.kind === 'overlap').length, 1195 broken: findings.filter((finding) => finding.kind === 'broken').length, 1196 archived: findings.filter((finding) => finding.kind === 'archived').length, 1197 superseded: findings.filter((finding) => finding.kind === 'superseded').length, 1198 }, 1199 findings, 1200 recentActions: [...maintenanceHistory], 1201 } 1202 } 1203 1204 export async function getKnowledgeHygieneSummary(): Promise<KnowledgeHygieneSummary> { 1205 await ensureLegacyKnowledgeBackfill() 1206 registerKnowledgeMaintenanceIdleCallback() 1207 return buildHygieneSummary(listStoredSources()) 1208 } 1209 1210 export async function runKnowledgeHygieneMaintenance(): Promise<KnowledgeHygieneSummary> { 1211 await ensureLegacyKnowledgeBackfill() 1212 const sources = listStoredSources() 1213 const duplicateGroups = collectDuplicateGroups(sources) 1214 1215 for (const source of sources) { 1216 if (sourceIsExcludedByDefault(source)) continue 1217 if (source.kind !== 'manual' && (isStaleSource(source) || source.syncStatus === 'error')) { 1218 try { 1219 const synced = await syncKnowledgeSource(source.id) 1220 if (synced?.source) { 1221 upsertSourceLifecycle(source.id, (current) => ({ 1222 ...current, 1223 lastAutoSyncAt: Date.now(), 1224 maintenanceUpdatedAt: Date.now(), 1225 maintenanceNotes: 'auto-sync completed', 1226 updatedAt: Date.now(), 1227 })) 1228 recordMaintenanceAction({ 1229 kind: source.sourceHash === synced.source.sourceHash ? 'sync' : 'reindex', 1230 sourceId: source.id, 1231 summary: `Auto-synced ${synced.source.title}`, 1232 createdAt: Date.now(), 1233 }) 1234 } 1235 } catch { 1236 // Keep the existing error state for manual review. 
1237 } 1238 } 1239 } 1240 1241 for (const group of duplicateGroups.values()) { 1242 if (group.length < 2) continue 1243 const canonical = canonicalSourceForGroup(group) 1244 for (const source of group) { 1245 if (source.id === canonical.id || sourceIsExcludedByDefault(source)) continue 1246 await archiveKnowledgeSource(source.id, { 1247 reason: 'duplicate', 1248 duplicateOfSourceId: canonical.id, 1249 }) 1250 } 1251 } 1252 1253 const refreshed = listStoredSources() 1254 const originGroups = new Map<string, KnowledgeSource[]>() 1255 for (const source of refreshed) { 1256 if (sourceIsExcludedByDefault(source)) continue 1257 const origin = source.sourceUrl || source.sourcePath || '' 1258 if (!origin) continue 1259 const group = originGroups.get(origin) || [] 1260 group.push(source) 1261 originGroups.set(origin, group) 1262 } 1263 for (const group of originGroups.values()) { 1264 if (group.length < 2) continue 1265 const canonical = canonicalSourceForGroup(group) 1266 for (const source of group) { 1267 if (source.id === canonical.id || sourceIsSuperseded(source)) continue 1268 if ((source.lastIndexedAt || 0) >= (canonical.lastIndexedAt || 0)) continue 1269 await supersedeKnowledgeSource(source.id, canonical.id) 1270 } 1271 } 1272 1273 return buildHygieneSummary(listStoredSources()) 1274 } 1275 1276 export function registerKnowledgeMaintenanceIdleCallback(): void { 1277 if (maintenanceRegistered) return 1278 maintenanceRegistered = true 1279 onNextIdleWindow(async () => { 1280 maintenanceRegistered = false 1281 await runKnowledgeHygieneMaintenance() 1282 registerKnowledgeMaintenanceIdleCallback() 1283 }) 1284 }