structured-extract.ts
import type { Session } from '@/types'
import { getProvider, streamChatWithFailover } from '@/lib/providers'
import { requireCredentialSecret, resolveCredentialSecret } from './credentials/credential-service'
import { extractDocumentArtifact, type DocumentArtifact } from './document-utils'

/** Loosely-typed JSON Schema document; only the keywords read by `validateJsonLikeSchema` are interpreted. */
type JsonSchemaLike = Record<string, unknown>

/** Default cap on the number of source characters placed in the prompt. */
const DEFAULT_MAX_CHARS = 120_000

/** Only the first N items of any array are validated against `items`. */
const MAX_VALIDATED_ITEMS = 100

/** Maximum number of validation errors returned to the caller. */
const MAX_REPORTED_ERRORS = 50

/** The subset of session fields this module reads, plus optional descriptive fields. */
interface ExtractionSession extends Pick<Session, 'id' | 'provider' | 'model' | 'credentialId' | 'fallbackCredentialIds' | 'apiEndpoint' | 'thinkingLevel'> {
  name?: string
  cwd?: string
}

/** Describes the text that was sent to the model and where it came from. */
export interface StructuredExtractionSource {
  /** `'file'` / `'text'` when only one input contributed, `'mixed'` when both a file path and non-blank text were given. */
  kind: 'text' | 'file' | 'mixed'
  /** Normalized (and possibly truncated) source text. */
  text: string
  filePath?: string | null
  artifact?: DocumentArtifact | null
}

/** Outcome of `runStructuredExtraction`. */
export interface StructuredExtractionResult {
  /** The parsed JSON value returned by the model. */
  object: unknown
  /** The model output that was ultimately parsed (the repaired output if a repair pass ran). */
  raw: string
  /** Human-readable schema violations, capped at `MAX_REPORTED_ERRORS`; empty when none were found. */
  validationErrors: string[]
  provider: string
  model: string
  source: StructuredExtractionSource
}

/** Narrows `value` to a non-null, non-array object. */
function isPlainRecord(value: unknown): value is Record<string, unknown> {
  return !!value && typeof value === 'object' && !Array.isArray(value)
}

/** Own-property test that ignores the prototype chain (unlike the `in` operator). */
function hasOwn(record: object, key: string): boolean {
  return Object.prototype.hasOwnProperty.call(record, key)
}

/**
 * Resolves the API key to use for the session's provider.
 *
 * @returns The secret when the provider requires a key, or when the key is optional
 *   and the session has a credential; otherwise `null`.
 * @throws Error if the provider is unknown or a required credential id is missing.
 *   NOTE(review): `requireCredentialSecret` presumably throws with the supplied message
 *   when the secret cannot be found — confirm in credential-service.
 */
function resolveApiKey(session: ExtractionSession): string | null {
  const provider = getProvider(session.provider)
  if (!provider) throw new Error(`Unknown provider: ${session.provider}`)
  if (provider.requiresApiKey) {
    if (!session.credentialId) throw new Error('No API key configured for this session')
    return requireCredentialSecret(session.credentialId, 'API key not found. Please add one in Settings.')
  }
  if (provider.optionalApiKey && session.credentialId) {
    return resolveCredentialSecret(session.credentialId)
  }
  return null
}

/**
 * Accepts a schema either as an object or as a JSON string encoding an object.
 *
 * @throws Error if the input is neither, including when the string is not valid JSON
 *   (the parse failure is reported with the same message rather than a raw SyntaxError).
 */
function normalizeSchemaInput(schema: unknown): JsonSchemaLike {
  if (isPlainRecord(schema)) return schema
  if (typeof schema === 'string' && schema.trim()) {
    let parsed: unknown
    try {
      parsed = JSON.parse(schema)
    } catch {
      // Fall through to the uniform error below.
      parsed = undefined
    }
    if (isPlainRecord(parsed)) return parsed
  }
  throw new Error('schema must be a JSON object or a JSON string representing an object.')
}

/** Schema used when the caller supplies none: a summary, key points, and optional entities. */
function defaultSummarySchema(): JsonSchemaLike {
  return {
    type: 'object',
    properties: {
      summary: { type: 'string' },
      keyPoints: { type: 'array', items: { type: 'string' } },
      entities: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            name: { type: 'string' },
            type: { type: 'string' },
            value: {},
          },
          required: ['name'],
        },
      },
    },
    required: ['summary', 'keyPoints'],
  }
}

/**
 * Converts CRLF to LF, strips NUL characters, trims, and truncates to `maxChars`
 * (appending a truncation marker when text was cut).
 */
function normalizeText(value: string, maxChars = DEFAULT_MAX_CHARS): string {
  const cleaned = value.replace(/\r\n/g, '\n').replace(/\u0000/g, '').trim()
  if (cleaned.length <= maxChars) return cleaned
  return `${cleaned.slice(0, maxChars)}\n\n[... truncated ...]`
}

/**
 * Locates the JSON payload inside a model response.
 *
 * Tries, in order: the first markdown code fence, the whole text when it is already
 * wrapped in `{}` / `[]`, then a scan for the first balanced `{...}` / `[...]` span.
 *
 * @returns The candidate JSON text (not yet parsed), or `null` if none was found.
 */
function extractJsonBlock(text: string): string | null {
  const raw = (text || '').trim()
  if (!raw) return null

  const fenced = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1]?.trim()
  if (fenced) return fenced

  if ((raw.startsWith('{') && raw.endsWith('}')) || (raw.startsWith('[') && raw.endsWith(']'))) {
    return raw
  }

  let inString = false
  let escaped = false
  let start = -1
  const stack: string[] = []
  for (let index = 0; index < raw.length; index += 1) {
    const char = raw[index]
    if (inString) {
      if (escaped) {
        escaped = false
        continue
      }
      if (char === '\\') {
        escaped = true
        continue
      }
      if (char === '"') inString = false
      continue
    }
    if (char === '"') {
      // A quote only opens a JSON string once we are inside a structure; a stray
      // quote in the prose before the JSON must not flip the scanner's string state.
      if (stack.length > 0) inString = true
      continue
    }
    if (char === '{' || char === '[') {
      if (stack.length === 0) start = index
      stack.push(char)
      continue
    }
    if (char === '}' || char === ']') {
      const last = stack.at(-1)
      // Closers that do not match the innermost opener are ignored.
      if ((char === '}' && last === '{') || (char === ']' && last === '[')) {
        stack.pop()
        if (stack.length === 0 && start >= 0) {
          return raw.slice(start, index + 1)
        }
      }
    }
  }

  return null
}

/**
 * Extracts and parses the JSON payload of a model response.
 *
 * @throws Error if no JSON candidate is found; SyntaxError if the candidate does not parse.
 */
function parseModelJson(text: string): unknown {
  const candidate = extractJsonBlock(text)
  if (!candidate) throw new Error('Model did not return JSON.')
  return JSON.parse(candidate)
}

/** Checks `value` against a JSON Schema primitive type name; unrecognized names always match. */
function typeMatches(value: unknown, expected: string): boolean {
  switch (expected) {
    case 'array':
      return Array.isArray(value)
    case 'object':
      return isPlainRecord(value)
    case 'string':
      return typeof value === 'string'
    case 'number':
      return typeof value === 'number' && Number.isFinite(value)
    case 'integer':
      return typeof value === 'number' && Number.isInteger(value)
    case 'boolean':
      return typeof value === 'boolean'
    case 'null':
      return value === null
    default:
      return true
  }
}

/**
 * Validates `value` against a subset of JSON Schema: `type` (string form only), `enum`,
 * `properties`, `required`, `additionalProperties: false`, `items`, `minItems`, `maxItems`.
 *
 * @param path JSONPath-like location used in messages; `$` is the root.
 * @param errors Accumulator shared across recursive calls.
 * @returns The `errors` array, with any new violations appended.
 */
function validateJsonLikeSchema(
  value: unknown,
  schema: JsonSchemaLike,
  path = '$',
  errors: string[] = [],
): string[] {
  const expected = schema.type
  if (typeof expected === 'string' && !typeMatches(value, expected)) {
    errors.push(`${path} should be ${expected}`)
    // A type mismatch makes the remaining keyword checks meaningless for this node.
    return errors
  }

  if (Array.isArray(schema.enum)) {
    const serialized = JSON.stringify(value)
    if (!schema.enum.some((entry) => JSON.stringify(entry) === serialized)) {
      errors.push(`${path} must be one of the allowed enum values`)
    }
  }

  if (expected === 'object' && isPlainRecord(value)) {
    const properties = isPlainRecord(schema.properties) ? schema.properties : {}
    const required = Array.isArray(schema.required)
      ? schema.required.filter((entry): entry is string => typeof entry === 'string')
      : []
    for (const key of required) {
      // Own-property check: inherited names such as `toString` must not satisfy `required`.
      if (!hasOwn(value, key)) errors.push(`${path}.${key} is required`)
    }
    for (const [key, childSchema] of Object.entries(properties)) {
      // Skip absent keys and sub-schemas that are not objects (e.g. `null`).
      if (!hasOwn(value, key) || !isPlainRecord(childSchema)) continue
      validateJsonLikeSchema(value[key], childSchema, `${path}.${key}`, errors)
    }
    if (schema.additionalProperties === false) {
      for (const key of Object.keys(value)) {
        // Own-property check: a key such as `constructor` is not a declared property.
        if (!hasOwn(properties, key)) errors.push(`${path}.${key} is not allowed`)
      }
    }
  }

  if (expected === 'array' && Array.isArray(value)) {
    const itemSchema = isPlainRecord(schema.items) ? schema.items : null
    if (typeof schema.minItems === 'number' && value.length < schema.minItems) {
      errors.push(`${path} must contain at least ${schema.minItems} items`)
    }
    if (typeof schema.maxItems === 'number' && value.length > schema.maxItems) {
      errors.push(`${path} must contain at most ${schema.maxItems} items`)
    }
    if (itemSchema) {
      // Items beyond MAX_VALIDATED_ITEMS are not checked.
      value
        .slice(0, MAX_VALIDATED_ITEMS)
        .forEach((entry, index) => validateJsonLikeSchema(entry, itemSchema, `${path}[${index}]`, errors))
    }
  }

  return errors
}

/**
 * Sends a single prompt to the session's provider and returns the response text.
 *
 * The call uses a derived session id (`<id>:extract:<timestamp>`), a fresh `active` map
 * and an empty history. Text is taken from the value returned by `streamChatWithFailover`,
 * falling back to the `t: 'd'` events observed on the `write` callback.
 *
 * @throws Error if the provider is unknown or no text was produced; in the latter case
 *   the first streamed `t: 'err'` message is used when available.
 */
async function callExtractionModel(params: {
  session: ExtractionSession
  prompt: string
}): Promise<string> {
  const provider = getProvider(params.session.provider)
  if (!provider) throw new Error(`Unknown provider: ${params.session.provider}`)

  const apiKey = resolveApiKey(params.session)
  const streamedText: string[] = []
  const streamedErrors: string[] = []

  const raw = await streamChatWithFailover({
    session: {
      id: `${params.session.id}:extract:${Date.now()}`,
      provider: params.session.provider,
      model: params.session.model,
      credentialId: params.session.credentialId ?? null,
      fallbackCredentialIds: params.session.fallbackCredentialIds || [],
      apiEndpoint: params.session.apiEndpoint || undefined,
      thinkingLevel: params.session.thinkingLevel,
    },
    message: params.prompt,
    apiKey,
    active: new Map(),
    loadHistory: () => [],
    write: (chunk) => {
      // Each `data: ` line is parsed as a JSON event; text deltas and errors are collected.
      for (const line of chunk.split('\n')) {
        if (!line.startsWith('data: ')) continue
        try {
          const event = JSON.parse(line.slice(6).trim()) as Record<string, unknown>
          if (event.t === 'd' && typeof event.text === 'string') streamedText.push(event.text)
          if (event.t === 'err' && typeof event.text === 'string') streamedErrors.push(event.text)
        } catch {
          // ignore malformed SSE fragments
        }
      }
    },
  })

  // Trim before falling back so a whitespace-only return value does not mask streamed text.
  const text = (raw || '').trim() || streamedText.join('').trim()
  if (!text) {
    throw new Error(streamedErrors[0] || `Provider "${provider.name}" returned no content.`)
  }
  return text
}

/** Assembles the extraction prompt: fixed rules, optional task, schema, artifact metadata, source text. */
function buildExtractionPrompt(params: {
  instruction?: string | null
  schema: JsonSchemaLike
  source: StructuredExtractionSource
}): string {
  const parts = [
    'Extract structured data from the provided source.',
    'Return only valid JSON. Do not include markdown fences, commentary, or explanatory text.',
    'If a field cannot be determined, use null, an empty string, or an empty array based on the schema.',
  ]
  if (params.instruction?.trim()) {
    parts.push(`Task:\n${params.instruction.trim()}`)
  }
  parts.push(`JSON Schema:\n${JSON.stringify(params.schema, null, 2)}`)
  if (params.source.artifact) {
    const artifact = params.source.artifact
    parts.push(`Source metadata:\n${JSON.stringify({
      filePath: artifact.filePath,
      fileName: artifact.fileName,
      ext: artifact.ext,
      method: artifact.method,
      metadata: artifact.metadata,
      tableCount: artifact.tables.length,
    }, null, 2)}`)
  }
  parts.push(`Source text:\n${params.source.text}`)
  return parts.join('\n\n')
}

/**
 * Builds the extraction source from a file, inline text, or both (file text first).
 *
 * @throws Error if neither input yields any non-blank text.
 */
async function prepareSource(params: {
  text?: string | null
  filePath?: string | null
  preferOcr?: boolean
  maxChars?: number
}): Promise<StructuredExtractionSource> {
  const chunks: string[] = []
  let artifact: DocumentArtifact | null = null

  if (params.filePath) {
    artifact = await extractDocumentArtifact(params.filePath, {
      preferOcr: params.preferOcr,
      maxChars: params.maxChars,
    })
    if (artifact.text.trim()) chunks.push(artifact.text)
  }

  const inlineText = params.text?.trim() ?? ''
  if (inlineText) chunks.push(inlineText)
  if (chunks.length === 0) throw new Error('text or filePath is required.')

  return {
    // Whitespace-only `text` contributes nothing, so it must not turn a file source into 'mixed'.
    kind: params.filePath && inlineText ? 'mixed' : params.filePath ? 'file' : 'text',
    filePath: params.filePath || null,
    artifact,
    // `||` (not `??`) on purpose: a maxChars of 0 falls back to the default cap.
    text: normalizeText(chunks.join('\n\n'), params.maxChars || DEFAULT_MAX_CHARS),
  }
}

/**
 * Extracts structured JSON from text and/or a document using the session's model.
 *
 * Flow: validate the schema, prepare the source, prompt the model, parse its JSON
 * (with one model-driven repair pass if parsing fails), then validate against the schema.
 * Schema violations are reported in `validationErrors`; they do not cause a throw.
 *
 * @param params.schema JSON Schema object or JSON string; a built-in summary schema is used when `undefined`.
 * @throws Error if the session lacks provider/model, the schema is invalid, no source text
 *   is available, the provider call fails, or the output is still unparseable after repair.
 */
export async function runStructuredExtraction(params: {
  session: ExtractionSession
  text?: string | null
  filePath?: string | null
  instruction?: string | null
  schema?: unknown
  preferOcr?: boolean
  maxChars?: number
}): Promise<StructuredExtractionResult> {
  if (!params.session.provider || !params.session.model) {
    throw new Error('Current session is missing provider/model configuration.')
  }

  // Validate the schema before document extraction so a bad schema fails fast.
  const schema = params.schema === undefined ? defaultSummarySchema() : normalizeSchemaInput(params.schema)
  const source = await prepareSource({
    text: params.text,
    filePath: params.filePath,
    preferOcr: params.preferOcr,
    maxChars: params.maxChars,
  })
  const prompt = buildExtractionPrompt({
    instruction: params.instruction,
    schema,
    source,
  })

  let raw = await callExtractionModel({
    session: params.session,
    prompt,
  })

  let parsed: unknown
  try {
    parsed = parseModelJson(raw)
  } catch {
    // One repair attempt: ask the model to fix its own output; a second failure propagates.
    raw = await callExtractionModel({
      session: params.session,
      prompt: [
        'Repair the invalid JSON below so it becomes valid JSON that matches the provided schema.',
        'Return only JSON.',
        `JSON Schema:\n${JSON.stringify(schema, null, 2)}`,
        `Invalid output:\n${raw}`,
      ].join('\n\n'),
    })
    parsed = parseModelJson(raw)
  }

  const validationErrors = validateJsonLikeSchema(parsed, schema).slice(0, MAX_REPORTED_ERRORS)
  return {
    object: parsed,
    raw,
    validationErrors,
    provider: params.session.provider,
    model: params.session.model,
    source,
  }
}