Cradicle Explorer

eval-save.ts
  1  #!/usr/bin/env npx tsx
  2  /**
  3   * Layer 4: Save as CLI Testing — "Save as CLI" Pipeline
  4   *
  5   * Tests the full browser init → write adapter → browser verify flow.
  6   * Validates that browser exploration can be crystallized into reusable CLI adapters.
  7   *
  8   * Usage:
  9   *   npx tsx autoresearch/eval-save.ts              # Run all tasks
 10   *   npx tsx autoresearch/eval-save.ts --task hn-top # Run single task
 11   */
 12  
 13  import { execSync } from 'node:child_process';
 14  import { readFileSync, writeFileSync, mkdirSync, readdirSync, existsSync, rmSync } from 'node:fs';
 15  import { join, dirname } from 'node:path';
 16  import { fileURLToPath } from 'node:url';
 17  import { homedir } from 'node:os';
 18  
 19  const __dirname = dirname(fileURLToPath(import.meta.url));
 20  const TASKS_FILE = join(__dirname, 'save-tasks.json');
 21  const RESULTS_DIR = join(__dirname, 'results');
 22  const USER_CLIS_DIR = join(homedir(), '.opencli', 'clis');
 23  
 24  interface SaveTask {
 25    name: string;
 26    site: string;
 27    command: string;
 28    /** Inline adapter code (simple tasks) */
 29    adapter?: string;
 30    /** Path to adapter file relative to autoresearch/ dir (complex tasks — avoids JSON escape issues) */
 31    adapterFile?: string;
 32    judge: JudgeCriteria;
 33    set?: 'test';
 34    note?: string;
 35  }
 36  
 37  type JudgeCriteria =
 38    | { type: 'contains'; value: string }
 39    | { type: 'arrayMinLength'; minLength: number }
 40    | { type: 'nonEmpty' }
 41    | { type: 'matchesPattern'; pattern: string };
 42  
 43  interface TaskResult {
 44    name: string;
 45    phase: 'init' | 'write' | 'verify' | 'judge';
 46    passed: boolean;
 47    duration: number;
 48    error?: string;
 49    set: 'train' | 'test';
 50  }
 51  
 52  function judge(criteria: JudgeCriteria, output: string): boolean {
 53    try {
 54      switch (criteria.type) {
 55        case 'contains':
 56          return output.toLowerCase().includes(criteria.value.toLowerCase());
 57        case 'arrayMinLength': {
 58          // browser verify outputs table text; try JSON parse first, then count non-empty lines
 59          try {
 60            const arr = JSON.parse(output);
 61            if (Array.isArray(arr)) return arr.length >= criteria.minLength;
 62          } catch { /* not JSON — try line counting */ }
 63          // Table output: count data rows (skip header, separator, empty lines)
 64          const lines = output.split('\n').filter(l => l.trim() && !l.startsWith('─') && !l.startsWith('┌') && !l.startsWith('└') && !l.startsWith('├'));
 65          // Subtract header row
 66          const dataLines = lines.length > 1 ? lines.length - 1 : 0;
 67          return dataLines >= criteria.minLength;
 68        }
 69        case 'nonEmpty':
 70          return output.trim().length > 0 && output.trim() !== 'null' && output.trim() !== 'undefined';
 71        case 'matchesPattern':
 72          return new RegExp(criteria.pattern).test(output);
 73        default:
 74          return false;
 75      }
 76    } catch {
 77      return false;
 78    }
 79  }
 80  
 81  const PROJECT_ROOT = join(__dirname, '..');
 82  
 83  /** Run a command, using the local built entrypoint instead of global opencli for consistency */
 84  function runCommand(cmd: string, timeout = 30000): string {
 85    // Use local build so tests always run against the current source
 86    const localCmd = cmd.replace(/^opencli /, `node dist/src/main.js `);
 87    try {
 88      return execSync(localCmd, {
 89        cwd: PROJECT_ROOT,
 90        timeout,
 91        encoding: 'utf-8',
 92        env: process.env,
 93        stdio: ['pipe', 'pipe', 'pipe'],
 94      }).trim();
 95    } catch (err: any) {
 96      return err.stdout?.trim() || err.stderr?.trim() || '';
 97    }
 98  }
 99  
/**
 * Delete the adapter file for `site`/`command` from the user adapter directory,
 * then remove the site directory as well if nothing is left in it.
 *
 * Best effort: filesystem errors are swallowed so cleanup can never fail a run.
 */
function cleanupAdapter(site: string, command: string): void {
  const siteFolder = join(USER_CLIS_DIR, site);
  const adapterFile = join(siteFolder, `${command}.ts`);

  try {
    if (existsSync(adapterFile)) {
      rmSync(adapterFile);
    }

    if (!existsSync(siteFolder)) return;

    // Only drop the site directory when no other adapters remain in it.
    const leftovers = readdirSync(siteFolder);
    if (leftovers.length === 0) {
      rmSync(siteFolder, { recursive: true });
    }
  } catch {
    /* best effort */
  }
}
112  
/**
 * Run one save task through the full pipeline and report where it ended.
 *
 * Phases, in order:
 *  1. `init`   — scaffold the adapter via `opencli browser init`.
 *  2. `write`  — overwrite the scaffold with the task's adapter code.
 *  3. `verify` — run the adapter via `opencli browser verify`.
 *  4. `judge`  — check the verify output against the task's criteria.
 *
 * The adapter file is removed before and after the run, whatever the outcome.
 *
 * @param task - Task definition to execute.
 * @returns The result; on failure `phase` names the phase that was active when
 *   the failure occurred and `error` carries a truncated description.
 */
function runTask(task: SaveTask): TaskResult {
  const start = Date.now();
  const { site, command } = task;
  const adapterPath = join(USER_CLIS_DIR, site, `${command}.ts`);
  const set: TaskResult['set'] = task.set === 'test' ? 'test' : 'train';

  // Tracks the phase in progress so an unexpected exception is attributed to
  // the phase that actually threw rather than to a fixed one.
  let phase: TaskResult['phase'] = 'init';

  /** Build a failed result for the phase currently in progress. */
  const fail = (error: string): TaskResult => ({
    name: task.name,
    phase,
    passed: false,
    duration: Date.now() - start,
    error,
    set,
  });

  // Cleanup any leftover from previous runs
  cleanupAdapter(site, command);

  try {
    // Phase 1: init — create scaffold
    const initOutput = runCommand(`opencli browser init ${site}/${command}`);
    if (!existsSync(adapterPath)) {
      return fail(`init failed: file not created. Output: ${initOutput.slice(0, 100)}`);
    }

    // Phase 2: write — overwrite scaffold with real adapter code.
    // When the task provides neither source, the scaffold is verified as-is.
    phase = 'write';
    if (task.adapterFile) {
      // Read from file (complex adapters — avoids JSON string escape issues)
      const code = readFileSync(join(__dirname, task.adapterFile), 'utf-8');
      writeFileSync(adapterPath, code, 'utf-8');
    } else if (task.adapter) {
      writeFileSync(adapterPath, task.adapter, 'utf-8');
    }

    // Phase 3: verify — run the adapter via browser verify
    phase = 'verify';
    const verifyOutput = runCommand(
      `opencli browser verify ${site}/${command}`,
      45000, // longer timeout for network calls
    );
    if (verifyOutput.includes('✗ Adapter failed')) {
      return fail(`verify failed: ${verifyOutput.slice(0, 200)}`);
    }

    // Phase 4: judge — check output quality
    phase = 'judge';
    const passed = judge(task.judge, verifyOutput);

    return {
      name: task.name,
      phase,
      passed,
      duration: Date.now() - start,
      error: passed ? undefined : `Judge failed on output: ${verifyOutput.slice(0, 150)}`,
      set,
    };
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    return fail(message.slice(0, 150));
  } finally {
    // Always cleanup test adapters
    cleanupAdapter(site, command);
  }
}
182  
/**
 * Entry point: load the task list, run every selected task in order, print a
 * per-task line and a summary, and save the results as the next numbered
 * `save-NNN.json` file in the results directory.
 *
 * CLI: `--task <name>` restricts the run to the task with that name.
 *
 * Exits with status 1 when `--task` is given without a name, when the named
 * task does not exist, or when the tasks file contains no tasks. Task failures
 * do not change the exit status; they are reported in the output and in the
 * final `SCORE=` line.
 */
function main(): void {
  const args = process.argv.slice(2);

  // Resolve the optional task filter. A dangling `--task` is rejected rather
  // than silently falling back to running every task.
  const taskFlagIndex = args.indexOf('--task');
  const singleTask = taskFlagIndex === -1 ? null : args[taskFlagIndex + 1];
  if (taskFlagIndex !== -1 && !singleTask) {
    console.error('Missing task name after --task.');
    process.exit(1);
  }

  const allTasks: SaveTask[] = JSON.parse(readFileSync(TASKS_FILE, 'utf-8'));
  const tasks = singleTask ? allTasks.filter(t => t.name === singleTask) : allTasks;

  if (tasks.length === 0) {
    console.error(singleTask ? `Task "${singleTask}" not found.` : `No tasks found in ${TASKS_FILE}.`);
    process.exit(1);
  }

  console.log(`\n🧪 Layer 4: Save as CLI — ${tasks.length} tasks\n`);

  const results: TaskResult[] = [];

  for (const [i, task] of tasks.entries()) {
    process.stdout.write(`  [${i + 1}/${tasks.length}] ${task.name}...`);

    const result = runTask(task);
    results.push(result);

    const icon = result.passed ? '✓' : '✗';
    const phase = result.passed ? '' : ` (${result.phase})`;
    console.log(` ${icon}${phase} (${(result.duration / 1000).toFixed(1)}s)`);
  }

  // Summary
  const trainResults = results.filter(r => r.set === 'train');
  const testResults = results.filter(r => r.set === 'test');
  const totalPassed = results.filter(r => r.passed).length;
  const trainPassed = trainResults.filter(r => r.passed).length;
  const testPassed = testResults.filter(r => r.passed).length;
  const totalDuration = results.reduce((s, r) => s + r.duration, 0);

  console.log(`\n${'─'.repeat(50)}`);
  console.log(`  Score:  ${totalPassed}/${results.length} (train: ${trainPassed}/${trainResults.length}, test: ${testPassed}/${testResults.length})`);
  console.log(`  Time:   ${Math.round(totalDuration / 1000)}s`);

  const failures = results.filter(r => !r.passed);
  if (failures.length > 0) {
    console.log(`\n  Failures:`);
    for (const f of failures) {
      console.log(`    ✗ ${f.name} [${f.phase}]: ${f.error ?? 'unknown'}`);
    }
  }
  console.log('');

  // Save result. The round number is one past the highest existing index, not
  // the file count: counting would reuse a number — and overwrite that result
  // file — whenever an earlier result has been deleted.
  mkdirSync(RESULTS_DIR, { recursive: true });
  const lastRound = readdirSync(RESULTS_DIR).reduce((highest, file) => {
    const match = /^save-(\d+)\.json$/.exec(file);
    return match ? Math.max(highest, Number.parseInt(match[1] ?? '0', 10)) : highest;
  }, 0);
  const roundNum = String(lastRound + 1).padStart(3, '0');
  const resultPath = join(RESULTS_DIR, `save-${roundNum}.json`);
  writeFileSync(resultPath, JSON.stringify({
    timestamp: new Date().toISOString(),
    score: `${totalPassed}/${results.length}`,
    trainScore: `${trainPassed}/${trainResults.length}`,
    testScore: `${testPassed}/${testResults.length}`,
    duration: `${Math.round(totalDuration / 1000)}s`,
    tasks: results,
  }, null, 2), 'utf-8');
  console.log(`  Results saved to: ${resultPath}`);
  console.log(`\nSCORE=${totalPassed}/${results.length}`);
}

main();