#!/usr/bin/env npx tsx
/**
 * autoresearch/eval-zhihu.ts
 *
 * Zhihu Test Suite: deterministic command testing driven by the task
 * definitions in `zhihu-tasks.json` (next to this file).
 *
 * Task names are classified into 8 layers (see LAYER_TASKS):
 *   L1 atomic (10) → L2 feed (8) → L3 question (8) → L4 navigation (8)
 *   → L5 write (6) → L6 chain (8) → L7 search (6) → L8 complex (6)
 *
 * Usage:
 *   npx tsx autoresearch/eval-zhihu.ts                        # Run all tasks
 *   npx tsx autoresearch/eval-zhihu.ts --task zhihu-hot-list  # Run single task
 *   npx tsx autoresearch/eval-zhihu.ts --layer 1              # Run only Layer 1 (atomic)
 */

import { execSync } from 'node:child_process';
import { readFileSync, writeFileSync, mkdirSync, readdirSync } from 'node:fs';
import { join, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';

const __dirname = dirname(fileURLToPath(import.meta.url));
const TASKS_FILE = join(__dirname, 'zhihu-tasks.json');
const RESULTS_DIR = join(__dirname, 'results');

/** One entry of the tasks file: shell steps to run plus how to judge the last output. */
interface BrowseTask {
  name: string;
  steps: string[];
  judge: JudgeCriteria;
  set?: 'test';
  note?: string;
  _comment?: string;
}

/** How the output of a task's final step is evaluated. */
type JudgeCriteria =
  | { type: 'contains'; value: string }
  | { type: 'arrayMinLength'; minLength: number }
  | { type: 'nonEmpty' }
  | { type: 'matchesPattern'; pattern: string };

/** Outcome of a single task run; serialized as-is into the results file. */
interface TaskResult {
  name: string;
  passed: boolean;
  /** Wall-clock time for all steps plus judging, in milliseconds. */
  duration: number;
  error?: string;
  layer: string;
}

/** Layer label → task names belonging to that layer. */
const LAYER_TASKS: Record<string, readonly string[]> = {
  'L1-atomic': [
    'zhihu-open-home', 'zhihu-get-title', 'zhihu-state', 'zhihu-get-url', 'zhihu-scroll-down',
    'zhihu-click-tab-hot', 'zhihu-back-navigation', 'zhihu-wait-page-load', 'zhihu-keys-escape', 'zhihu-screenshot',
  ],
  'L2-feed': [
    'zhihu-feed-titles', 'zhihu-hot-list', 'zhihu-hot-metrics', 'zhihu-nav-tabs',
    'zhihu-feed-with-authors', 'zhihu-feed-types', 'zhihu-user-avatar', 'zhihu-search-input-exists',
  ],
  'L3-question': [
    'zhihu-question-title', 'zhihu-question-meta', 'zhihu-first-answer', 'zhihu-answer-votes',
    'zhihu-question-buttons', 'zhihu-multiple-answers', 'zhihu-question-description', 'zhihu-answer-count-number',
  ],
  'L4-navigation': [
    'zhihu-hot-to-question', 'zhihu-feed-to-question', 'zhihu-question-to-author',
    'zhihu-search-navigate', 'zhihu-topic-page', 'zhihu-user-profile', 'zhihu-question-and-back', 'zhihu-scroll-load-more',
  ],
  'L5-write': [
    'zhihu-upvote-button-find', 'zhihu-follow-question-find', 'zhihu-comment-button-find',
    'zhihu-bookmark-find', 'zhihu-write-answer-btn', 'zhihu-share-find',
  ],
  'L6-chain': [
    'zhihu-hot-read-answer-author', 'zhihu-hot-to-author-profile', 'zhihu-multi-hot-topics',
    'zhihu-search-then-read', 'zhihu-question-scroll-answers', 'zhihu-compare-tabs', 'zhihu-user-answers', 'zhihu-topic-questions',
  ],
  'L7-search': [
    'zhihu-search-basic', 'zhihu-search-people', 'zhihu-search-topic',
    'zhihu-search-click-result', 'zhihu-search-filter-answers', 'zhihu-search-and-back',
  ],
  'L8-complex': [
    'zhihu-full-browse-workflow', 'zhihu-deep-author-chain', 'zhihu-cross-question-compare',
    'zhihu-search-read-chain', 'zhihu-3-page-chain', 'zhihu-hot-scroll-deep-read',
  ],
};

/** Reverse index of LAYER_TASKS, built once at module load. */
const LAYER_BY_TASK = new Map<string, string>();
for (const [layer, names] of Object.entries(LAYER_TASKS)) {
  for (const taskName of names) {
    // First listing wins, matching a top-to-bottom scan of the layer lists.
    if (!LAYER_BY_TASK.has(taskName)) LAYER_BY_TASK.set(taskName, layer);
  }
}

/**
 * Layer classification by task name.
 *
 * @param name Task name as it appears in the tasks file.
 * @returns The layer label (e.g. `'L1-atomic'`), or `'unknown'` if the name is not listed.
 */
function getLayer(name: string): string {
  return LAYER_BY_TASK.get(name) ?? 'unknown';
}

/**
 * Decide whether `output` satisfies `criteria`.
 *
 * @returns `true` on a match; `false` on a mismatch, on an unrecognized
 *   criteria type, or if evaluation throws (e.g. an invalid regex pattern).
 */
function judge(criteria: JudgeCriteria, output: string): boolean {
  try {
    switch (criteria.type) {
      case 'contains':
        // Case-insensitive substring match.
        return output.toLowerCase().includes(criteria.value.toLowerCase());
      case 'arrayMinLength': {
        try {
          const parsed: unknown = JSON.parse(output);
          if (Array.isArray(parsed)) return parsed.length >= criteria.minLength;
        } catch {
          /* not JSON array */
        }
        return false;
      }
      case 'nonEmpty': {
        const trimmed = output.trim();
        // The literal strings 'null' / 'undefined' count as empty output.
        return trimmed.length > 0 && trimmed !== 'null' && trimmed !== 'undefined';
      }
      case 'matchesPattern':
        return new RegExp(criteria.pattern).test(output);
      default:
        // Criteria come from JSON, so an unlisted type can still show up at runtime.
        return false;
    }
  } catch {
    return false;
  }
}

/**
 * Run a shell command synchronously from the directory above this file.
 *
 * @param cmd Shell command line.
 * @param timeout Maximum run time in milliseconds.
 * @returns Trimmed stdout. If the command fails (non-zero exit, timeout, ...),
 *   whatever stdout was captured is returned instead, or `''` if none. Never throws.
 */
function runCommand(cmd: string, timeout = 30000): string {
  try {
    return execSync(cmd, {
      cwd: join(__dirname, '..'),
      timeout,
      encoding: 'utf-8',
      env: process.env,
      stdio: ['pipe', 'pipe', 'pipe'],
    }).trim();
  } catch (err: unknown) {
    if (typeof err === 'object' && err !== null && 'stdout' in err && typeof err.stdout === 'string') {
      return err.stdout.trim();
    }
    return '';
  }
}

/**
 * Execute every step of a task in order and judge the output of the last step.
 * Outputs of earlier steps are discarded.
 */
function runTask(task: BrowseTask): TaskResult {
  const start = Date.now();
  let lastOutput = '';

  try {
    for (const step of task.steps) {
      lastOutput = runCommand(step);
    }

    const passed = judge(task.judge, lastOutput);

    return {
      name: task.name,
      passed,
      duration: Date.now() - start,
      error: passed ? undefined : `Output: ${lastOutput.slice(0, 150)}`,
      layer: getLayer(task.name),
    };
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : String(err);
    return {
      name: task.name,
      passed: false,
      duration: Date.now() - start,
      error: message.slice(0, 100),
      layer: getLayer(task.name),
    };
  }
}

/**
 * Value following `flag` in `args`, or `null` when the flag is absent.
 * Exits with status 1 when the flag is present but has no value, rather than
 * silently falling back to running every task.
 */
function readFlag(args: readonly string[], flag: string): string | null {
  const idx = args.indexOf(flag);
  if (idx === -1) return null;

  const value = args[idx + 1];
  if (value === undefined || value.startsWith('--')) {
    console.error(`Missing value for ${flag}.`);
    process.exit(1);
  }
  return value;
}

/** Read the tasks file, keeping only entries that carry both `name` and `steps`. */
function loadTasks(): BrowseTask[] {
  const raw: unknown = JSON.parse(readFileSync(TASKS_FILE, 'utf-8'));
  if (!Array.isArray(raw)) {
    throw new Error(`${TASKS_FILE} must contain a JSON array.`);
  }
  // Entries without name/steps (e.g. `{ "_comment": ... }`) are skipped.
  return raw.filter(
    (t): t is BrowseTask => typeof t === 'object' && t !== null && 'name' in t && 'steps' in t,
  );
}

/**
 * Next result-file number: one past the highest `zhihu-<n>` already present in
 * RESULTS_DIR, so deleting an old result can never cause a newer one to be overwritten.
 */
function nextRoundNumber(): number {
  let highest = 0;
  for (const file of readdirSync(RESULTS_DIR)) {
    const digits = /^zhihu-(\d+)/.exec(file)?.[1];
    if (digits !== undefined) highest = Math.max(highest, Number(digits));
  }
  return highest + 1;
}

/**
 * CLI entry point: select tasks, run them one by one, print a per-layer
 * summary, and write a JSON report into RESULTS_DIR.
 */
function main(): void {
  const args = process.argv.slice(2);
  const singleTask = readFlag(args, '--task');
  const layerFilter = readFlag(args, '--layer');

  const allTasks = loadTasks();

  let tasks = allTasks;
  if (singleTask) {
    tasks = allTasks.filter(t => t.name === singleTask);
  } else if (layerFilter) {
    const prefix = `L${layerFilter}`;
    // Accept `--layer 1` as well as a full label; the '-' keeps L1 from matching a future L10.
    tasks = allTasks.filter(t => {
      const layer = getLayer(t.name);
      return layer === prefix || layer.startsWith(`${prefix}-`);
    });
  }

  if (tasks.length === 0) {
    console.error(singleTask ? `Task "${singleTask}" not found.` : `No tasks for layer ${layerFilter}.`);
    process.exit(1);
  }

  console.log(`\n🔬 Zhihu Test Suite — ${tasks.length} tasks\n`);

  const results: TaskResult[] = [];

  for (const [i, task] of tasks.entries()) {
    process.stdout.write(` [${i + 1}/${tasks.length}] ${task.name}...`);

    const result = runTask(task);
    results.push(result);

    const icon = result.passed ? '✓' : '✗';
    console.log(` ${icon} (${(result.duration / 1000).toFixed(1)}s)`);

    // Close the browser after every task (including the last) for clean state.
    // runCommand never throws, so a failed close is ignored.
    runCommand('opencli browser close');
  }

  // Summary by layer
  const layers = [...new Set(results.map(r => r.layer))].sort();
  const layerScores = layers.map((layer): [string, string] => {
    const layerResults = results.filter(r => r.layer === layer);
    return [layer, `${layerResults.filter(r => r.passed).length}/${layerResults.length}`];
  });
  const totalPassed = results.filter(r => r.passed).length;
  const totalDuration = results.reduce((sum, r) => sum + r.duration, 0);
  const score = `${totalPassed}/${results.length}`;
  const durationLabel = `${Math.round(totalDuration / 60000)}min`;

  console.log(`\n${'─'.repeat(50)}`);
  console.log(` Total: ${score}`);
  for (const [layer, layerScore] of layerScores) {
    console.log(` ${layer}: ${layerScore}`);
  }
  console.log(` Time: ${durationLabel}`);

  const failures = results.filter(r => !r.passed);
  if (failures.length > 0) {
    console.log(`\n Failures:`);
    for (const f of failures) {
      console.log(` ✗ [${f.layer}] ${f.name}: ${f.error ?? 'unknown'}`);
    }
  }
  console.log('');

  // Save result
  mkdirSync(RESULTS_DIR, { recursive: true });
  const roundNum = String(nextRoundNumber()).padStart(3, '0');
  const resultPath = join(RESULTS_DIR, `zhihu-${roundNum}.json`);
  writeFileSync(
    resultPath,
    JSON.stringify(
      {
        timestamp: new Date().toISOString(),
        score,
        layers: Object.fromEntries(layerScores),
        duration: durationLabel,
        tasks: results,
      },
      null,
      2,
    ),
    'utf-8',
  );
  console.log(` Results saved to: ${resultPath}`);
  console.log(`\nSCORE=${score}`);
}

main();