#!/usr/bin/env npx tsx
/**
 * autoresearch/eval-save.ts
 *
 * Layer 4: Save as CLI Testing — "Save as CLI" Pipeline
 *
 * Tests the full browser init → write adapter → browser verify flow.
 * Validates that browser exploration can be crystallized into reusable CLI adapters.
 *
 * Usage:
 *   npx tsx autoresearch/eval-save.ts                 # Run all tasks
 *   npx tsx autoresearch/eval-save.ts --task hn-top   # Run single task
 */

import { execSync } from 'node:child_process';
import { readFileSync, writeFileSync, mkdirSync, readdirSync, existsSync, rmSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
import { homedir } from 'node:os';

const __dirname = dirname(fileURLToPath(import.meta.url));
const TASKS_FILE = join(__dirname, 'save-tasks.json');
const RESULTS_DIR = join(__dirname, 'results');
const USER_CLIS_DIR = join(homedir(), '.opencli', 'clis');

/** One entry of save-tasks.json. */
interface SaveTask {
  name: string;
  site: string;
  command: string;
  /** Inline adapter code (simple tasks) */
  adapter?: string;
  /** Path to adapter file relative to autoresearch/ dir (complex tasks — avoids JSON escape issues) */
  adapterFile?: string;
  judge: JudgeCriteria;
  /** Tasks without `set: 'test'` are counted as 'train'. */
  set?: 'test';
  note?: string;
}

/** How the verify output of a task is scored; see `judge`. */
type JudgeCriteria =
  | { type: 'contains'; value: string }
  | { type: 'arrayMinLength'; minLength: number }
  | { type: 'nonEmpty' }
  | { type: 'matchesPattern'; pattern: string };

/** Outcome of a single task; `phase` is the last phase that was reached. */
interface TaskResult {
  name: string;
  phase: 'init' | 'write' | 'verify' | 'judge';
  passed: boolean;
  /** Wall-clock duration in milliseconds. */
  duration: number;
  error?: string;
  set: 'train' | 'test';
}

/**
 * Decide whether `output` satisfies `criteria`.
 *
 * - `contains`: case-insensitive substring match.
 * - `arrayMinLength`: if the output parses as a JSON array, its length is used;
 *   otherwise the output is treated as a text table and its data rows are counted.
 * - `nonEmpty`: trimmed output is not empty and not the literal `null` / `undefined`.
 * - `matchesPattern`: the pattern, compiled as a RegExp, matches the output.
 *
 * @returns `true` when the criterion holds; `false` otherwise, including when the
 *   criterion type is unknown or evaluating it throws (e.g. an invalid pattern).
 */
function judge(criteria: JudgeCriteria, output: string): boolean {
  try {
    switch (criteria.type) {
      case 'contains':
        return output.toLowerCase().includes(criteria.value.toLowerCase());
      case 'arrayMinLength': {
        // browser verify outputs table text; try JSON parse first, then count non-empty lines
        try {
          const arr: unknown = JSON.parse(output);
          if (Array.isArray(arr)) return arr.length >= criteria.minLength;
        } catch { /* not JSON — try line counting */ }
        // Table output: count data rows (skip header, separator, empty lines)
        const lines = output
          .split('\n')
          .filter(l => l.trim() && !l.startsWith('─') && !l.startsWith('┌') && !l.startsWith('└') && !l.startsWith('├'));
        // Subtract header row
        const dataLines = lines.length > 1 ? lines.length - 1 : 0;
        return dataLines >= criteria.minLength;
      }
      case 'nonEmpty': {
        const trimmed = output.trim();
        return trimmed.length > 0 && trimmed !== 'null' && trimmed !== 'undefined';
      }
      case 'matchesPattern':
        return new RegExp(criteria.pattern).test(output);
      default:
        // Criteria come from a JSON file, so an unrecognised type is possible at runtime.
        return false;
    }
  } catch {
    return false;
  }
}

const PROJECT_ROOT = join(__dirname, '..');

/**
 * Run a command, using the local built entrypoint instead of global opencli for consistency.
 *
 * A leading `opencli ` is rewritten to `node dist/src/main.js ` and the command is
 * executed synchronously with PROJECT_ROOT as working directory.
 *
 * @param cmd     Shell command line.
 * @param timeout Timeout in milliseconds passed to `execSync`.
 * @returns Trimmed stdout on success. If `execSync` throws, the trimmed stdout captured
 *   on the error, else its trimmed stderr, else an empty string — this function never throws
 *   for a failing command.
 */
function runCommand(cmd: string, timeout = 30000): string {
  // Use local build so tests always run against the current source
  const localCmd = cmd.replace(/^opencli /, `node dist/src/main.js `);
  try {
    return execSync(localCmd, {
      cwd: PROJECT_ROOT,
      timeout,
      encoding: 'utf-8',
      env: process.env,
      stdio: ['pipe', 'pipe', 'pipe'],
    }).trim();
  } catch (err: unknown) {
    const { stdout, stderr } = (err ?? {}) as { stdout?: unknown; stderr?: unknown };
    const out = typeof stdout === 'string' ? stdout.trim() : '';
    const errOut = typeof stderr === 'string' ? stderr.trim() : '';
    return out || errOut;
  }
}

/**
 * Delete `<USER_CLIS_DIR>/<site>/<command>.ts` if present, and the site directory
 * if that leaves it empty. Failures are ignored on purpose (best effort).
 */
function cleanupAdapter(site: string, command: string): void {
  const siteDir = join(USER_CLIS_DIR, site);
  const filePath = join(siteDir, `${command}.ts`);
  try {
    if (existsSync(filePath)) rmSync(filePath);
    // Remove site dir if empty
    if (existsSync(siteDir) && readdirSync(siteDir).length === 0) {
      rmSync(siteDir, { recursive: true });
    }
  } catch { /* best effort */ }
}

/**
 * Run one task through init → write → verify → judge and report the outcome.
 *
 * The adapter file for the task is removed before the run and again afterwards,
 * whatever the result.
 *
 * @returns A result whose `phase` names the phase that failed, or 'judge' when the
 *   pipeline ran to the end (in which case `passed` carries the judge verdict).
 */
function runTask(task: SaveTask): TaskResult {
  const start = Date.now();
  const { site, command } = task;
  const adapterPath = join(USER_CLIS_DIR, site, `${command}.ts`);
  const set: TaskResult['set'] = task.set === 'test' ? 'test' : 'train';

  // Tracks the phase currently executing so that an unexpected exception is
  // attributed to the phase it actually happened in.
  let phase: TaskResult['phase'] = 'init';
  const fail = (error: string | undefined): TaskResult => ({
    name: task.name, phase, passed: false,
    duration: Date.now() - start,
    error,
    set,
  });

  // Cleanup any leftover from previous runs
  cleanupAdapter(site, command);

  try {
    // Phase 1: init — create scaffold
    const initOutput = runCommand(`opencli browser init ${site}/${command}`);
    if (!existsSync(adapterPath)) {
      return fail(`init failed: file not created. Output: ${initOutput.slice(0, 100)}`);
    }

    // Phase 2: write — overwrite scaffold with real adapter code
    phase = 'write';
    if (task.adapterFile) {
      // Read from file (complex adapters — avoids JSON string escape issues)
      const srcPath = join(__dirname, task.adapterFile);
      const code = readFileSync(srcPath, 'utf-8');
      writeFileSync(adapterPath, code, 'utf-8');
    } else if (task.adapter) {
      writeFileSync(adapterPath, task.adapter, 'utf-8');
    }
    // With neither field set, the scaffold produced by init is verified unchanged.

    // Phase 3: verify — run the adapter via browser verify
    phase = 'verify';
    const verifyOutput = runCommand(
      `opencli browser verify ${site}/${command}`,
      45000, // longer timeout for network calls
    );

    if (verifyOutput.includes('✗ Adapter failed')) {
      return fail(`verify failed: ${verifyOutput.slice(0, 200)}`);
    }

    // Phase 4: judge — check output quality
    phase = 'judge';
    const passed = judge(task.judge, verifyOutput);

    return {
      name: task.name,
      phase: 'judge',
      passed,
      duration: Date.now() - start,
      error: passed ? undefined : `Judge failed on output: ${verifyOutput.slice(0, 150)}`,
      set,
    };
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    return fail(message.slice(0, 150));
  } finally {
    // Always cleanup test adapters
    cleanupAdapter(site, command);
  }
}

/**
 * Next free round number in `dir`: one more than the highest N among files named
 * `save-<N>.json`. Using the highest number rather than the file count keeps an
 * existing result from being overwritten when earlier result files were removed.
 */
function nextRoundNumber(dir: string): number {
  let highest = 0;
  for (const file of readdirSync(dir)) {
    const match = /^save-(\d+)\.json$/.exec(file);
    if (match?.[1]) highest = Math.max(highest, Number(match[1]));
  }
  return highest + 1;
}

/**
 * Entry point: load tasks, optionally filter by `--task <name>`, run them in order,
 * print a summary and write it to `results/save-<NNN>.json`.
 *
 * Exits with status 1 when `--task` has no value, the tasks file is not a JSON array,
 * or no task is selected.
 */
function main(): void {
  const args = process.argv.slice(2);

  let singleTask: string | null = null;
  const flagIndex = args.indexOf('--task');
  if (flagIndex !== -1) {
    const value = args[flagIndex + 1];
    if (!value) {
      // Without this guard a bare `--task` would silently run the whole suite.
      console.error('Missing task name after --task.');
      process.exit(1);
    }
    singleTask = value;
  }

  const parsed: unknown = JSON.parse(readFileSync(TASKS_FILE, 'utf-8'));
  if (!Array.isArray(parsed)) {
    console.error(`Expected a JSON array of tasks in ${TASKS_FILE}.`);
    process.exit(1);
  }
  const allTasks = parsed as SaveTask[];
  const tasks = singleTask ? allTasks.filter(t => t.name === singleTask) : allTasks;

  if (tasks.length === 0) {
    console.error(singleTask ? `Task "${singleTask}" not found.` : `No tasks defined in ${TASKS_FILE}.`);
    process.exit(1);
  }

  console.log(`\n🧪 Layer 4: Save as CLI — ${tasks.length} tasks\n`);

  const results: TaskResult[] = [];

  for (const [i, task] of tasks.entries()) {
    process.stdout.write(` [${i + 1}/${tasks.length}] ${task.name}...`);

    const result = runTask(task);
    results.push(result);

    const icon = result.passed ? '✓' : '✗';
    const phase = result.passed ? '' : ` (${result.phase})`;
    console.log(` ${icon}${phase} (${(result.duration / 1000).toFixed(1)}s)`);
  }

  // Summary
  const trainResults = results.filter(r => r.set === 'train');
  const testResults = results.filter(r => r.set === 'test');
  const totalPassed = results.filter(r => r.passed).length;
  const trainPassed = trainResults.filter(r => r.passed).length;
  const testPassed = testResults.filter(r => r.passed).length;
  const totalDuration = results.reduce((s, r) => s + r.duration, 0);

  console.log(`\n${'─'.repeat(50)}`);
  console.log(` Score: ${totalPassed}/${results.length} (train: ${trainPassed}/${trainResults.length}, test: ${testPassed}/${testResults.length})`);
  console.log(` Time: ${Math.round(totalDuration / 1000)}s`);

  const failures = results.filter(r => !r.passed);
  if (failures.length > 0) {
    console.log(`\n Failures:`);
    for (const f of failures) {
      console.log(` ✗ ${f.name} [${f.phase}]: ${f.error ?? 'unknown'}`);
    }
  }
  console.log('');

  // Save result
  mkdirSync(RESULTS_DIR, { recursive: true });
  const roundNum = String(nextRoundNumber(RESULTS_DIR)).padStart(3, '0');
  const resultPath = join(RESULTS_DIR, `save-${roundNum}.json`);
  writeFileSync(resultPath, JSON.stringify({
    timestamp: new Date().toISOString(),
    score: `${totalPassed}/${results.length}`,
    trainScore: `${trainPassed}/${trainResults.length}`,
    testScore: `${testPassed}/${testResults.length}`,
    duration: `${Math.round(totalDuration / 1000)}s`,
    tasks: results,
  }, null, 2), 'utf-8');
  console.log(` Results saved to: ${resultPath}`);
  console.log(`\nSCORE=${totalPassed}/${results.length}`);
}

main();