/ src / lib / server / structured-extract.ts
structured-extract.ts
  1  import type { Session } from '@/types'
  2  import { getProvider, streamChatWithFailover } from '@/lib/providers'
  3  import { requireCredentialSecret, resolveCredentialSecret } from './credentials/credential-service'
  4  import { extractDocumentArtifact, type DocumentArtifact } from './document-utils'
  5  
/** Loosely typed JSON Schema document: any object with string keys and unvalidated values. */
type JsonSchemaLike = Record<string, unknown>
  7  
/**
 * The subset of `Session` needed to run an extraction call.
 *
 * The picked fields are the ones forwarded to `streamChatWithFailover` by
 * `callExtractionModel`.
 */
interface ExtractionSession extends Pick<Session, 'id' | 'provider' | 'model' | 'credentialId' | 'fallbackCredentialIds' | 'apiEndpoint' | 'thinkingLevel'> {
  // NOTE(review): `name` and `cwd` are accepted but not read by the code visible in this module.
  name?: string
  cwd?: string
}
 12  
/** Describes the input that was handed to the model for extraction. */
export interface StructuredExtractionSource {
  /** Whether the input came from inline text, a file, or both. */
  kind: 'text' | 'file' | 'mixed'
  /** The combined source text after normalization (and truncation, if it was too long). */
  text: string
  /** Path of the source file, or `null` when none was given. */
  filePath?: string | null
  /** Document artifact produced from `filePath`, or `null` when no file was processed. */
  artifact?: DocumentArtifact | null
}
 19  
/** Outcome of a structured extraction run. */
export interface StructuredExtractionResult {
  /** The JSON value parsed from the model output. */
  object: unknown
  /** The model output text that `object` was parsed from. */
  raw: string
  /** Schema validation problems found in `object`; empty when none were found. */
  validationErrors: string[]
  /** Provider identifier copied from the session. */
  provider: string
  /** Model identifier copied from the session. */
  model: string
  /** The prepared source the extraction ran against. */
  source: StructuredExtractionSource
}
 28  
 29  function resolveApiKey(session: ExtractionSession): string | null {
 30    const provider = getProvider(session.provider)
 31    if (!provider) throw new Error(`Unknown provider: ${session.provider}`)
 32    if (provider.requiresApiKey) {
 33      if (!session.credentialId) throw new Error('No API key configured for this session')
 34      return requireCredentialSecret(session.credentialId, 'API key not found. Please add one in Settings.')
 35    }
 36    if (provider.optionalApiKey && session.credentialId) {
 37      return resolveCredentialSecret(session.credentialId)
 38    }
 39    return null
 40  }
 41  
 42  function normalizeSchemaInput(schema: unknown): JsonSchemaLike {
 43    if (schema && typeof schema === 'object' && !Array.isArray(schema)) return schema as JsonSchemaLike
 44    if (typeof schema === 'string' && schema.trim()) {
 45      const parsed = JSON.parse(schema)
 46      if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) return parsed as JsonSchemaLike
 47    }
 48    throw new Error('schema must be a JSON object or a JSON string representing an object.')
 49  }
 50  
 51  function defaultSummarySchema(): JsonSchemaLike {
 52    return {
 53      type: 'object',
 54      properties: {
 55        summary: { type: 'string' },
 56        keyPoints: { type: 'array', items: { type: 'string' } },
 57        entities: {
 58          type: 'array',
 59          items: {
 60            type: 'object',
 61            properties: {
 62              name: { type: 'string' },
 63              type: { type: 'string' },
 64              value: {},
 65            },
 66            required: ['name'],
 67          },
 68        },
 69      },
 70      required: ['summary', 'keyPoints'],
 71    }
 72  }
 73  
 74  function normalizeText(value: string, maxChars = 120_000): string {
 75    const cleaned = value.replace(/\r\n/g, '\n').replace(/\u0000/g, '').trim()
 76    if (cleaned.length <= maxChars) return cleaned
 77    return `${cleaned.slice(0, maxChars)}\n\n[... truncated ...]`
 78  }
 79  
 80  function extractJsonBlock(text: string): string | null {
 81    const raw = (text || '').trim()
 82    if (!raw) return null
 83  
 84    const fenced = raw.match(/```(?:json)?\s*([\s\S]*?)```/i)?.[1]?.trim()
 85    if (fenced) return fenced
 86  
 87    if ((raw.startsWith('{') && raw.endsWith('}')) || (raw.startsWith('[') && raw.endsWith(']'))) {
 88      return raw
 89    }
 90  
 91    let inString = false
 92    let escaped = false
 93    let start = -1
 94    const stack: string[] = []
 95    for (let index = 0; index < raw.length; index += 1) {
 96      const char = raw[index]
 97      if (inString) {
 98        if (escaped) {
 99          escaped = false
100          continue
101        }
102        if (char === '\\') {
103          escaped = true
104          continue
105        }
106        if (char === '"') inString = false
107        continue
108      }
109      if (char === '"') {
110        inString = true
111        continue
112      }
113      if (char === '{' || char === '[') {
114        if (stack.length === 0) start = index
115        stack.push(char)
116        continue
117      }
118      if (char === '}' || char === ']') {
119        const last = stack.at(-1)
120        if ((char === '}' && last === '{') || (char === ']' && last === '[')) {
121          stack.pop()
122          if (stack.length === 0 && start >= 0) {
123            return raw.slice(start, index + 1)
124          }
125        }
126      }
127    }
128  
129    return null
130  }
131  
/**
 * Extracts and parses the JSON payload from model output.
 *
 * @param text - Raw model output.
 * @returns The parsed JSON value.
 * @throws Error when no JSON candidate can be located; whatever `JSON.parse`
 *   throws when the candidate is not valid JSON.
 */
function parseModelJson(text: string): unknown {
  const jsonText = extractJsonBlock(text)
  if (jsonText) {
    return JSON.parse(jsonText)
  }
  throw new Error('Model did not return JSON.')
}
137  
/**
 * Checks a value against a JSON Schema primitive type name.
 *
 * @param value - Value to test.
 * @param expected - Type name such as `'string'` or `'array'`.
 * @returns Whether the value has that type. `'number'` and `'integer'` reject
 *   non-finite numbers, and unrecognized type names always match.
 */
function typeMatches(value: unknown, expected: string): boolean {
  switch (expected) {
    case 'array':
      return Array.isArray(value)
    case 'object':
      return !!value && typeof value === 'object' && !Array.isArray(value)
    case 'string':
      return typeof value === 'string'
    case 'number':
      return typeof value === 'number' && Number.isFinite(value)
    case 'integer':
      return typeof value === 'number' && Number.isInteger(value)
    case 'boolean':
      return typeof value === 'boolean'
    case 'null':
      return value === null
    default:
      return true
  }
}
148  
/**
 * Validates a value against a small subset of JSON Schema and collects
 * human-readable problems.
 *
 * Supported keywords: `type` (single string), `enum`, `properties`,
 * `required`, `additionalProperties: false`, `items` (single schema),
 * `minItems`, and `maxItems`. Only the first 100 array items are validated.
 *
 * Property presence is decided by own properties only, so names inherited from
 * `Object.prototype` (`toString`, `constructor`, ...) neither satisfy
 * `required` nor slip past `additionalProperties: false`.
 *
 * @param value - Value to validate.
 * @param schema - Schema describing the expected shape.
 * @param path - JSONPath-like location used as a prefix in error messages.
 * @param errors - Accumulator that messages are appended to.
 * @returns The `errors` array that was passed in (or a new one), with any
 *   messages appended.
 */
function validateJsonLikeSchema(
  value: unknown,
  schema: JsonSchemaLike,
  path = '$',
  errors: string[] = [],
): string[] {
  const hasOwn = (target: object, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(target, key)

  const expected = schema.type
  if (typeof expected === 'string' && !typeMatches(value, expected)) {
    // A type mismatch makes the remaining keyword checks meaningless.
    errors.push(`${path} should be ${expected}`)
    return errors
  }

  if (Array.isArray(schema.enum) && !schema.enum.some((entry) => JSON.stringify(entry) === JSON.stringify(value))) {
    errors.push(`${path} must be one of the allowed enum values`)
  }

  if (expected === 'object' && value && typeof value === 'object' && !Array.isArray(value)) {
    const asRecord = value as Record<string, unknown>
    const properties = (schema.properties && typeof schema.properties === 'object' && !Array.isArray(schema.properties))
      ? schema.properties as Record<string, unknown>
      : {}
    const required = Array.isArray(schema.required)
      ? schema.required.filter((entry): entry is string => typeof entry === 'string')
      : []

    for (const key of required) {
      if (!hasOwn(asRecord, key)) errors.push(`${path}.${key} is required`)
    }
    for (const [key, childSchema] of Object.entries(properties)) {
      if (!hasOwn(asRecord, key)) continue
      // Skip child schemas that are not objects (e.g. `null` or boolean schemas)
      // instead of crashing while reading their keywords.
      if (!childSchema || typeof childSchema !== 'object') continue
      validateJsonLikeSchema(asRecord[key], childSchema as JsonSchemaLike, `${path}.${key}`, errors)
    }
    if (schema.additionalProperties === false) {
      for (const key of Object.keys(asRecord)) {
        if (!hasOwn(properties, key)) errors.push(`${path}.${key} is not allowed`)
      }
    }
  }

  if (expected === 'array' && Array.isArray(value)) {
    const itemSchema = (schema.items && typeof schema.items === 'object' && !Array.isArray(schema.items))
      ? schema.items as JsonSchemaLike
      : null
    if (typeof schema.minItems === 'number' && value.length < schema.minItems) {
      errors.push(`${path} must contain at least ${schema.minItems} items`)
    }
    if (typeof schema.maxItems === 'number' && value.length > schema.maxItems) {
      errors.push(`${path} must contain at most ${schema.maxItems} items`)
    }
    if (itemSchema) {
      // Cap per-item validation so very large arrays stay cheap to check.
      value.slice(0, 100).forEach((entry, index) => validateJsonLikeSchema(entry, itemSchema, `${path}[${index}]`, errors))
    }
  }

  return errors
}
202  
/**
 * Sends a single prompt to the session's provider and returns the reply text.
 *
 * The request runs under a derived session id with no chat history. Text and
 * error events written to the stream are collected so they can serve as a
 * fallback when the call itself returns nothing.
 *
 * @param params.session - Provider/model/credential settings for the call.
 * @param params.prompt - The full prompt to send.
 * @returns The trimmed reply text.
 * @throws Error when the provider is unknown, when the API key cannot be
 *   resolved, or when no content comes back (using the first streamed error
 *   message if there is one).
 */
async function callExtractionModel(params: {
  session: ExtractionSession
  prompt: string
}): Promise<string> {
  const { session, prompt } = params

  const provider = getProvider(session.provider)
  if (!provider) throw new Error(`Unknown provider: ${session.provider}`)

  const apiKey = resolveApiKey(session)
  const textDeltas: string[] = []
  const errorMessages: string[] = []

  // Handles one line of a stream chunk; only `data: ` lines carry events.
  const recordLine = (line: string): void => {
    if (!line.startsWith('data: ')) return
    try {
      const event = JSON.parse(line.slice(6).trim()) as Record<string, unknown>
      if (typeof event.text !== 'string') return
      if (event.t === 'd') {
        textDeltas.push(event.text)
      } else if (event.t === 'err') {
        errorMessages.push(event.text)
      }
    } catch {
      // ignore malformed SSE fragments
    }
  }

  const raw = await streamChatWithFailover({
    session: {
      id: `${session.id}:extract:${Date.now()}`,
      provider: session.provider,
      model: session.model,
      credentialId: session.credentialId ?? null,
      fallbackCredentialIds: session.fallbackCredentialIds || [],
      apiEndpoint: session.apiEndpoint || undefined,
      thinkingLevel: session.thinkingLevel,
    },
    message: prompt,
    apiKey,
    active: new Map(),
    loadHistory: () => [],
    write: (chunk) => {
      chunk.split('\n').forEach(recordLine)
    },
  })

  const reply = (raw || textDeltas.join('')).trim()
  if (reply) return reply
  throw new Error(errorMessages[0] || `Provider "${provider.name}" returned no content.`)
}
248  
/**
 * Assembles the extraction prompt sent to the model.
 *
 * Sections, separated by blank lines, in order: fixed instructions, the
 * optional task (omitted when blank), the pretty-printed schema, source
 * metadata (only when an artifact is present), and the source text.
 *
 * @param params.instruction - Optional caller-provided task description.
 * @param params.schema - Schema the output must follow.
 * @param params.source - Prepared source text and optional document artifact.
 * @returns The complete prompt string.
 */
function buildExtractionPrompt(params: {
  instruction?: string | null
  schema: JsonSchemaLike
  source: StructuredExtractionSource
}): string {
  const { instruction, schema, source } = params
  const task = instruction?.trim()

  const sections: string[] = [
    'Extract structured data from the provided source.',
    'Return only valid JSON. Do not include markdown fences, commentary, or explanatory text.',
    'If a field cannot be determined, use null, an empty string, or an empty array based on the schema.',
    ...(task ? [`Task:\n${task}`] : []),
    `JSON Schema:\n${JSON.stringify(schema, null, 2)}`,
  ]

  const doc = source.artifact
  if (doc) {
    // Only a summary of the artifact is included; tables are reduced to a count.
    const summary = {
      filePath: doc.filePath,
      fileName: doc.fileName,
      ext: doc.ext,
      method: doc.method,
      metadata: doc.metadata,
      tableCount: doc.tables.length,
    }
    sections.push(`Source metadata:\n${JSON.stringify(summary, null, 2)}`)
  }

  sections.push(`Source text:\n${source.text}`)
  return sections.join('\n\n')
}
277  
/**
 * Gathers the text to extract from, combining file content and inline text.
 *
 * When `filePath` is given, the document is converted with
 * `extractDocumentArtifact` and its text (if non-blank) comes first; trimmed
 * inline `text` follows. The combined text is normalized and capped at
 * `maxChars` (120,000 when unset or zero).
 *
 * `kind` is `'mixed'` only when a file was given and the inline text is
 * non-blank; whitespace-only text adds nothing, so it is reported as `'file'`.
 *
 * @param params.text - Optional inline text.
 * @param params.filePath - Optional path of a document to read.
 * @param params.preferOcr - Forwarded to `extractDocumentArtifact`.
 * @param params.maxChars - Length cap, also forwarded to `extractDocumentArtifact`.
 * @returns The prepared source description.
 * @throws Error when neither the file nor the inline text yields any content.
 */
async function prepareSource(params: {
  text?: string | null
  filePath?: string | null
  preferOcr?: boolean
  maxChars?: number
}): Promise<StructuredExtractionSource> {
  const chunks: string[] = []
  let artifact: DocumentArtifact | null = null

  if (params.filePath) {
    artifact = await extractDocumentArtifact(params.filePath, {
      preferOcr: params.preferOcr,
      maxChars: params.maxChars,
    })
    if (artifact.text.trim()) chunks.push(artifact.text)
  }

  const inlineText = params.text?.trim() ?? ''
  if (inlineText) chunks.push(inlineText)
  if (chunks.length === 0) throw new Error('text or filePath is required.')

  // Classify by what actually contributed: blank inline text does not make the source "mixed".
  let kind: StructuredExtractionSource['kind'] = 'text'
  if (params.filePath) kind = inlineText ? 'mixed' : 'file'

  return {
    kind,
    filePath: params.filePath || null,
    artifact,
    text: normalizeText(chunks.join('\n\n'), params.maxChars || 120_000),
  }
}
305  
/**
 * Runs a structured extraction: prepares the source, prompts the model for
 * JSON that follows the schema, and validates the reply.
 *
 * If the first reply cannot be parsed as JSON, the model is asked once to
 * repair it; a second parse failure propagates to the caller. Schema
 * violations do not throw — they are returned in `validationErrors`, capped
 * at 50 entries.
 *
 * @param params.session - Session providing provider, model, and credentials.
 * @param params.text - Optional inline source text.
 * @param params.filePath - Optional source document path.
 * @param params.instruction - Optional task description added to the prompt.
 * @param params.schema - Schema object or JSON string; a default summary
 *   schema is used when `undefined`.
 * @param params.preferOcr - Forwarded to source preparation.
 * @param params.maxChars - Cap on source text length.
 * @returns The parsed object, the raw reply it came from, validation errors,
 *   and the provider, model, and source used.
 * @throws Error when the session lacks a provider or model, plus any error
 *   raised while preparing the source, normalizing the schema, calling the
 *   model, or parsing the repaired reply.
 */
export async function runStructuredExtraction(params: {
  session: ExtractionSession
  text?: string | null
  filePath?: string | null
  instruction?: string | null
  schema?: unknown
  preferOcr?: boolean
  maxChars?: number
}): Promise<StructuredExtractionResult> {
  const { session } = params
  if (!session.provider || !session.model) {
    throw new Error('Current session is missing provider/model configuration.')
  }

  const { text, filePath, preferOcr, maxChars } = params
  const source = await prepareSource({ text, filePath, preferOcr, maxChars })

  let schema: JsonSchemaLike
  if (params.schema === undefined) {
    schema = defaultSummarySchema()
  } else {
    schema = normalizeSchemaInput(params.schema)
  }

  const prompt = buildExtractionPrompt({ instruction: params.instruction, schema, source })
  let raw = await callExtractionModel({ session, prompt })

  let parsed: unknown
  try {
    parsed = parseModelJson(raw)
  } catch {
    // One repair round-trip: hand the unparseable reply back with the schema.
    const repairPrompt = [
      'Repair the invalid JSON below so it becomes valid JSON that matches the provided schema.',
      'Return only JSON.',
      `JSON Schema:\n${JSON.stringify(schema, null, 2)}`,
      `Invalid output:\n${raw}`,
    ].join('\n\n')
    raw = await callExtractionModel({ session, prompt: repairPrompt })
    parsed = parseModelJson(raw)
  }

  const allErrors = validateJsonLikeSchema(parsed, schema)
  return {
    object: parsed,
    raw,
    validationErrors: allErrors.slice(0, 50),
    provider: session.provider,
    model: session.model,
    source,
  }
}