index.ts
/**
 * Evaluation Framework - Accuracy, Performance, and Reliability evaluation
 */

/** Outcome shared by every evaluation in this module. */
export interface EvalResult {
  /** Whether the evaluation met its pass condition. */
  passed: boolean;
  /** Numeric score; accuracy and reliability evaluations produce a value in [0, 1]. */
  score: number;
  /** Optional human-readable explanation of the outcome. */
  message?: string;
  /** Optional evaluation-specific data (inputs, thresholds, matched items, ...). */
  details?: Record<string, any>;
  /** Elapsed wall-clock time in milliseconds, measured with `Date.now()`. */
  duration: number;
}

/** Input for {@link accuracyEval}. */
export interface AccuracyEvalConfig {
  /** Prompt/input that produced the output; not read by `accuracyEval` itself. */
  input: string;
  /** Reference output. */
  expectedOutput: string;
  /** Output under test. */
  actualOutput: string;
  /** Minimum similarity required to pass; defaults to 0.8. */
  threshold?: number;
}

/** Input for {@link performanceEval}. */
export interface PerformanceEvalConfig {
  /** Async function to time; its resolved value is discarded. */
  func: () => Promise<any>;
  /** Number of timed runs; defaults to 10. */
  iterations?: number;
  /** Number of untimed runs executed before measuring; defaults to 2. */
  warmupRuns?: number;
}

/** Timing statistics produced by {@link performanceEval}; all times are in milliseconds. */
export interface PerformanceResult extends EvalResult {
  avgTime: number;
  minTime: number;
  maxTime: number;
  p95Time: number;
  /** Per-run timings in execution order. */
  times: number[];
}

/** Input for {@link reliabilityEval}. */
export interface ReliabilityEvalConfig {
  /** Tool names that should have been called. */
  expectedToolCalls: string[];
  /** Tool names that were actually called. */
  actualToolCalls: string[];
}

/**
 * Accuracy Evaluation - Compare actual output to expected.
 *
 * @param config - Expected/actual outputs and an optional pass threshold (default 0.8).
 * @returns A result whose `score` is the word-level Jaccard similarity of the two
 *   outputs and whose `passed` flag is `score >= threshold`.
 */
export async function accuracyEval(config: AccuracyEvalConfig): Promise<EvalResult> {
  const start = Date.now();
  const threshold = config.threshold ?? 0.8;

  const similarity = calculateSimilarity(config.expectedOutput, config.actualOutput);
  const passed = similarity >= threshold;

  return {
    passed,
    score: similarity,
    message: passed ? 'Output matches expected' : 'Output does not match expected',
    details: {
      expected: config.expectedOutput,
      actual: config.actualOutput,
      threshold
    },
    duration: Date.now() - start
  };
}

/**
 * Performance Evaluation - Measure execution time.
 *
 * Runs `config.func` sequentially: first `warmupRuns` untimed calls, then
 * `iterations` timed calls. Rejections from `config.func` propagate to the caller.
 *
 * @param config - Function to time plus optional iteration/warmup counts.
 * @returns Timing statistics. `passed` is always `true` and `score` always 1; the
 *   caller decides what counts as fast enough. When no timed run was executed
 *   (`iterations <= 0`), all statistics are 0 and `times` is empty.
 */
export async function performanceEval(config: PerformanceEvalConfig): Promise<PerformanceResult> {
  const iterations = config.iterations ?? 10;
  const warmupRuns = config.warmupRuns ?? 2;
  const times: number[] = [];

  // Warmup runs
  for (let i = 0; i < warmupRuns; i++) {
    await config.func();
  }

  // Actual runs
  const start = Date.now();
  for (let i = 0; i < iterations; i++) {
    const runStart = Date.now();
    await config.func();
    times.push(Date.now() - runStart);
  }

  const sortedTimes = [...times].sort((a, b) => a - b);
  const sampleCount = sortedTimes.length;

  // Guard the empty case so the result never contains NaN or undefined.
  const avgTime = sampleCount > 0
    ? times.reduce((sum, t) => sum + t, 0) / sampleCount
    : 0;
  const minTime = sortedTimes[0] ?? 0;
  const maxTime = sortedTimes[sampleCount - 1] ?? 0;
  const p95Index = Math.floor(sampleCount * 0.95);
  // `??` rather than `||`: a measured 0 ms sample is a valid value, not a missing one.
  const p95Time = sortedTimes[p95Index] ?? maxTime;

  return {
    passed: true,
    score: 1,
    avgTime,
    minTime,
    maxTime,
    p95Time,
    times,
    duration: Date.now() - start,
    details: {
      iterations,
      warmupRuns
    }
  };
}

/**
 * Reliability Evaluation - Check tool call accuracy.
 *
 * Tool calls are compared as sets of names: order and repetition are ignored, so
 * `matched`, `missing` and `extra` each list a tool name at most once.
 *
 * @param config - Expected and actual tool call names.
 * @returns A result that passes when no expected tool is missing. `score` is the
 *   fraction of distinct expected tools that were called (1 when nothing was expected).
 */
export async function reliabilityEval(config: ReliabilityEvalConfig): Promise<EvalResult> {
  const start = Date.now();

  const expected = new Set(config.expectedToolCalls);
  const actual = new Set(config.actualToolCalls);

  // Iterate the sets, not the raw arrays, so duplicated names cannot push the
  // score above 1 or repeat entries in the message.
  const matched = [...expected].filter(t => actual.has(t));
  const missing = [...expected].filter(t => !actual.has(t));
  const extra = [...actual].filter(t => !expected.has(t));

  const score = expected.size > 0 ? matched.length / expected.size : 1;
  const passed = missing.length === 0;

  return {
    passed,
    score,
    message: passed ? 'All expected tool calls made' : `Missing tool calls: ${missing.join(', ')}`,
    details: {
      matched,
      missing,
      extra,
      expected: config.expectedToolCalls,
      actual: config.actualToolCalls
    },
    duration: Date.now() - start
  };
}

/**
 * Split text into a set of lower-cased, whitespace-separated words.
 * Empty tokens (produced by leading/trailing whitespace or an empty string) are dropped.
 */
function tokenize(text: string): Set<string> {
  return new Set(text.toLowerCase().split(/\s+/).filter(word => word.length > 0));
}

/**
 * Calculate text similarity (simple Jaccard similarity over lower-cased words).
 *
 * @returns `|A ∩ B| / |A ∪ B|` in [0, 1]. Two inputs that both contain no words
 *   (empty or whitespace-only) are treated as identical and score 1.
 */
function calculateSimilarity(a: string, b: string): number {
  const wordsA = tokenize(a);
  const wordsB = tokenize(b);

  const union = new Set([...wordsA, ...wordsB]);
  if (union.size === 0) {
    return 1;
  }

  const intersection = [...wordsA].filter(word => wordsB.has(word));
  return intersection.length / union.size;
}

/**
 * Eval Suite - Run multiple evaluations and collect their results by name.
 * Running an evaluation under a name that already exists replaces the earlier result.
 */
export class EvalSuite {
  private readonly results: Map<string, EvalResult> = new Map();

  /** Run {@link accuracyEval} and store the result under `name`. */
  async runAccuracy(name: string, config: AccuracyEvalConfig): Promise<EvalResult> {
    const result = await accuracyEval(config);
    this.results.set(name, result);
    return result;
  }

  /** Run {@link performanceEval} and store the result under `name`. */
  async runPerformance(name: string, config: PerformanceEvalConfig): Promise<PerformanceResult> {
    const result = await performanceEval(config);
    this.results.set(name, result);
    return result;
  }

  /** Run {@link reliabilityEval} and store the result under `name`. */
  async runReliability(name: string, config: ReliabilityEvalConfig): Promise<EvalResult> {
    const result = await reliabilityEval(config);
    this.results.set(name, result);
    return result;
  }

  /** Return a shallow copy of the stored results, so callers cannot mutate the suite's map. */
  getResults(): Map<string, EvalResult> {
    return new Map(this.results);
  }

  /**
   * Aggregate the stored results.
   *
   * @returns Counts of total/passed/failed evaluations and the mean score
   *   (0 when no evaluation has been run).
   */
  getSummary(): { total: number; passed: number; failed: number; avgScore: number } {
    const results = Array.from(this.results.values());
    const passed = results.filter(r => r.passed).length;
    const avgScore = results.length > 0
      ? results.reduce((a, b) => a + b.score, 0) / results.length
      : 0;

    return {
      total: results.length,
      passed,
      failed: results.length - passed,
      avgScore
    };
  }

  /** Print the summary and one status line per stored result to the console. */
  printSummary(): void {
    const summary = this.getSummary();
    console.log('\n=== Evaluation Summary ===');
    console.log(`Total: ${summary.total}`);
    console.log(`Passed: ${summary.passed}`);
    console.log(`Failed: ${summary.failed}`);
    console.log(`Avg Score: ${(summary.avgScore * 100).toFixed(1)}%`);

    console.log('\nResults:');
    for (const [name, result] of this.results) {
      const status = result.passed ? '✅' : '❌';
      console.log(`  ${status} ${name}: ${(result.score * 100).toFixed(1)}%`);
    }
  }
}

// Re-export base Evaluator with criteria
export {
  Evaluator,
  createEvaluator,
  createDefaultEvaluator,
  relevanceCriterion,
  lengthCriterion,
  containsKeywordsCriterion,
  noHarmfulContentCriterion,
  type EvalCriteria,
  type EvalResult as BaseEvalResult,
  type EvalSummary,
  type EvaluatorConfig,
} from './base';

// Re-export EvalResults
export {
  EvalResults,
  createEvalResults,
  type TestResult,
  type AggregatedResults,
  type TrendPoint,
} from './results';

// Re-export Judge (LLM-as-Judge)
export {
  Judge,
  AccuracyJudge,
  CriteriaJudge,
  RecipeJudge,
  addJudge,
  getJudge,
  listJudges,
  removeJudge,
  addOptimizationRule,
  getOptimizationRule,
  listOptimizationRules,
  removeOptimizationRule,
  parseJudgeResponse,
  type JudgeConfig,
  type JudgeCriteriaConfig,
  type JudgeResult,
  type JudgeRunOptions,
  type JudgeOptions,
  type JudgeProtocol,
} from './judge';