Cradicle Explorer

/ src / praisonai-ts / src / eval / index.ts
index.ts
  1  /**
  2   * Evaluation Framework - Accuracy, Performance, and Reliability evaluation
  3   */
  4  
/**
 * Outcome of a single evaluation run, shared by every evaluator in this module.
 */
export interface EvalResult {
  /** Whether the evaluation met its pass condition. */
  passed: boolean;
  /** Numeric score; `EvalSuite.printSummary` renders it as a percentage (`score * 100`). */
  score: number;
  /** Optional human-readable explanation of the outcome. */
  message?: string;
  /** Optional evaluator-specific data (inputs compared, thresholds, counts, ...). */
  details?: Record<string, any>;
  /** Elapsed time; the evaluators in this file compute it from `Date.now()` differences (milliseconds). */
  duration: number;
}
 12  
/**
 * Input for `accuracyEval`.
 */
export interface AccuracyEvalConfig {
  /**
   * The input associated with the outputs being compared.
   * NOTE(review): `accuracyEval` in this file never reads this field —
   * presumably kept for reporting by callers; confirm.
   */
  input: string;
  /** Reference text the actual output is compared against. */
  expectedOutput: string;
  /** Text produced by the system under evaluation. */
  actualOutput: string;
  /** Minimum similarity score required to pass; `accuracyEval` uses 0.8 when omitted. */
  threshold?: number;
}
 19  
/**
 * Input for `performanceEval`.
 */
export interface PerformanceEvalConfig {
  /** Async function to time; it is awaited once per warmup run and once per timed run. */
  func: () => Promise<any>;
  /** Number of timed runs; `performanceEval` uses 10 when omitted. */
  iterations?: number;
  /** Number of untimed runs executed before timing starts; `performanceEval` uses 2 when omitted. */
  warmupRuns?: number;
}
 25  
/**
 * Result of `performanceEval`: the common `EvalResult` fields plus timing
 * statistics. Times are `Date.now()` differences, i.e. milliseconds.
 */
export interface PerformanceResult extends EvalResult {
  /** Mean of the timed runs. */
  avgTime: number;
  /** Fastest timed run. */
  minTime: number;
  /** Slowest timed run. */
  maxTime: number;
  /** 95th-percentile run time. */
  p95Time: number;
  /** Individual run times, in execution order. */
  times: number[];
}
 33  
/**
 * Input for `reliabilityEval`. Tool calls are identified by name and compared
 * by exact string equality.
 */
export interface ReliabilityEvalConfig {
  /** Names of the tool calls that should have been made. */
  expectedToolCalls: string[];
  /** Names of the tool calls that were actually made. */
  actualToolCalls: string[];
}
 38  
 39  /**
 40   * Accuracy Evaluation - Compare actual output to expected
 41   */
 42  export async function accuracyEval(config: AccuracyEvalConfig): Promise<EvalResult> {
 43    const start = Date.now();
 44    const threshold = config.threshold ?? 0.8;
 45  
 46    const similarity = calculateSimilarity(config.expectedOutput, config.actualOutput);
 47    const passed = similarity >= threshold;
 48  
 49    return {
 50      passed,
 51      score: similarity,
 52      message: passed ? 'Output matches expected' : 'Output does not match expected',
 53      details: {
 54        expected: config.expectedOutput,
 55        actual: config.actualOutput,
 56        threshold
 57      },
 58      duration: Date.now() - start
 59    };
 60  }
 61  
 62  /**
 63   * Performance Evaluation - Measure execution time
 64   */
 65  export async function performanceEval(config: PerformanceEvalConfig): Promise<PerformanceResult> {
 66    const iterations = config.iterations ?? 10;
 67    const warmupRuns = config.warmupRuns ?? 2;
 68    const times: number[] = [];
 69  
 70    // Warmup runs
 71    for (let i = 0; i < warmupRuns; i++) {
 72      await config.func();
 73    }
 74  
 75    // Actual runs
 76    const start = Date.now();
 77    for (let i = 0; i < iterations; i++) {
 78      const runStart = Date.now();
 79      await config.func();
 80      times.push(Date.now() - runStart);
 81    }
 82  
 83    const sortedTimes = [...times].sort((a, b) => a - b);
 84    const avgTime = times.reduce((a, b) => a + b, 0) / times.length;
 85    const minTime = sortedTimes[0];
 86    const maxTime = sortedTimes[sortedTimes.length - 1];
 87    const p95Index = Math.floor(sortedTimes.length * 0.95);
 88    const p95Time = sortedTimes[p95Index] || maxTime;
 89  
 90    return {
 91      passed: true,
 92      score: 1,
 93      avgTime,
 94      minTime,
 95      maxTime,
 96      p95Time,
 97      times,
 98      duration: Date.now() - start,
 99      details: {
100        iterations,
101        warmupRuns
102      }
103    };
104  }
105  
/**
 * Reliability evaluation: checks that every expected tool call was made.
 *
 * Tool calls are compared by name using exact string equality; how many
 * times a name occurs is ignored, so all derived lists contain each name at
 * most once.
 *
 * @param config - Expected and actual tool-call names.
 * @returns A result that passes when no expected name is missing. `score` is
 *   the fraction of distinct expected names that were called (1 when nothing
 *   is expected). `details` lists the distinct `matched`, `missing` and
 *   `extra` names and echoes the raw `expected` / `actual` input arrays.
 */
export async function reliabilityEval(config: ReliabilityEvalConfig): Promise<EvalResult> {
  const start = Date.now();

  const expected = new Set(config.expectedToolCalls);
  const actual = new Set(config.actualToolCalls);

  // Derive the lists from the de-duplicated sets: the score divides by
  // `expected.size`, so counting repeated names in `matched` would push the
  // score above 1 and repeat names in the failure message.
  const matched = [...expected].filter(t => actual.has(t));
  const missing = [...expected].filter(t => !actual.has(t));
  const extra = [...actual].filter(t => !expected.has(t));

  const score = expected.size > 0 ? matched.length / expected.size : 1;
  const passed = missing.length === 0;

  return {
    passed,
    score,
    message: passed ? 'All expected tool calls made' : `Missing tool calls: ${missing.join(', ')}`,
    details: {
      matched,
      missing,
      extra,
      expected: config.expectedToolCalls,
      actual: config.actualToolCalls
    },
    duration: Date.now() - start
  };
}
136  
/**
 * Calculates text similarity as the Jaccard index of the two texts' word sets.
 *
 * Texts are lower-cased and split on runs of whitespace. Empty tokens (which
 * `split` produces for leading/trailing whitespace or an empty string) are
 * discarded so that surrounding whitespace does not affect the score.
 *
 * @param a - First text.
 * @param b - Second text.
 * @returns `|A ∩ B| / |A ∪ B|` in the range 0–1; 1 when neither text
 *   contains any word.
 */
function calculateSimilarity(a: string, b: string): number {
  const tokenize = (text: string): Set<string> =>
    new Set(text.toLowerCase().split(/\s+/).filter(word => word.length > 0));

  const wordsA = tokenize(a);
  const wordsB = tokenize(b);

  // Two texts without any words are identical; this also avoids 0 / 0 below.
  if (wordsA.size === 0 && wordsB.size === 0) {
    return 1;
  }

  let shared = 0;
  for (const word of wordsA) {
    if (wordsB.has(word)) {
      shared++;
    }
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionSize = wordsA.size + wordsB.size - shared;
  return shared / unionSize;
}
149  
/**
 * Eval suite: runs named evaluations and remembers the result of each one so
 * they can be summarised together. Running another evaluation under an
 * existing name replaces the stored result for that name.
 */
export class EvalSuite {
  private results: Map<string, EvalResult> = new Map();

  /** Runs `accuracyEval` with `config` and stores the result under `name`. */
  async runAccuracy(name: string, config: AccuracyEvalConfig): Promise<EvalResult> {
    const outcome = await accuracyEval(config);
    this.results.set(name, outcome);
    return outcome;
  }

  /** Runs `performanceEval` with `config` and stores the result under `name`. */
  async runPerformance(name: string, config: PerformanceEvalConfig): Promise<PerformanceResult> {
    const outcome = await performanceEval(config);
    this.results.set(name, outcome);
    return outcome;
  }

  /** Runs `reliabilityEval` with `config` and stores the result under `name`. */
  async runReliability(name: string, config: ReliabilityEvalConfig): Promise<EvalResult> {
    const outcome = await reliabilityEval(config);
    this.results.set(name, outcome);
    return outcome;
  }

  /** Returns a shallow copy of the stored results, keyed by evaluation name. */
  getResults(): Map<string, EvalResult> {
    const snapshot = new Map<string, EvalResult>();
    this.results.forEach((result, name) => {
      snapshot.set(name, result);
    });
    return snapshot;
  }

  /**
   * Aggregates the stored results.
   *
   * @returns Counts of total, passed and failed evaluations and the mean
   *   score (0 when nothing has been run).
   */
  getSummary(): { total: number; passed: number; failed: number; avgScore: number } {
    let passed = 0;
    let scoreTotal = 0;
    for (const result of this.results.values()) {
      if (result.passed) {
        passed += 1;
      }
      scoreTotal += result.score;
    }

    const total = this.results.size;
    const avgScore = total > 0 ? scoreTotal / total : 0;

    return { total, passed, failed: total - passed, avgScore };
  }

  /** Writes the aggregate summary and one line per stored result to the console. */
  printSummary(): void {
    const { total, passed, failed, avgScore } = this.getSummary();

    const header = [
      '\n=== Evaluation Summary ===',
      `Total: ${total}`,
      `Passed: ${passed}`,
      `Failed: ${failed}`,
      `Avg Score: ${(avgScore * 100).toFixed(1)}%`,
      '\nResults:'
    ];
    header.forEach(line => {
      console.log(line);
    });

    this.results.forEach((result, name) => {
      const status = result.passed ? '✅' : '❌';
      console.log(`  ${status} ${name}: ${(result.score * 100).toFixed(1)}%`);
    });
  }
}
208  
209  // Re-export base Evaluator with criteria
210  export {
211    Evaluator,
212    createEvaluator,
213    createDefaultEvaluator,
214    relevanceCriterion,
215    lengthCriterion,
216    containsKeywordsCriterion,
217    noHarmfulContentCriterion,
218    type EvalCriteria,
219    type EvalResult as BaseEvalResult,
220    type EvalSummary,
221    type EvaluatorConfig,
222  } from './base';
223  
224  // Re-export EvalResults
225  export {
226    EvalResults,
227    createEvalResults,
228    type TestResult,
229    type AggregatedResults,
230    type TrendPoint,
231  } from './results';
232  
233  // Re-export Judge (LLM-as-Judge)
234  export {
235    Judge,
236    AccuracyJudge,
237    CriteriaJudge,
238    RecipeJudge,
239    addJudge,
240    getJudge,
241    listJudges,
242    removeJudge,
243    addOptimizationRule,
244    getOptimizationRule,
245    listOptimizationRules,
246    removeOptimizationRule,
247    parseJudgeResponse,
248    type JudgeConfig,
249    type JudgeCriteriaConfig,
250    type JudgeResult,
251    type JudgeRunOptions,
252    type JudgeOptions,
253    type JudgeProtocol,
254  } from './judge';