/ autoresearch / eval-zhihu.ts
eval-zhihu.ts
  1  #!/usr/bin/env npx tsx
  2  /**
  3   * Zhihu Test Suite: Deterministic command testing against zhihu.com.
  4   *
  5   * 60 task names across 8 layers (as classified by getLayer):
  6   *   L1 Atomic (10) → L2 Feed (8) → L3 Question (8) → L4 Navigation (8)
  7   *   → L5 Write (6) → L6 Chain (8) → L7 Search (6) → L8 Complex (6)
  8   *
  9   * Usage:
 10   *   npx tsx autoresearch/eval-zhihu.ts                        # Run all tasks
 11   *   npx tsx autoresearch/eval-zhihu.ts --task zhihu-hot-list  # Run single task
 12   *   npx tsx autoresearch/eval-zhihu.ts --layer 1              # Run only Layer 1 (atomic)
 13   */
 14  
 15  import { execSync } from 'node:child_process';
 16  import { readFileSync, writeFileSync, mkdirSync, readdirSync } from 'node:fs';
 17  import { join, dirname } from 'node:path';
 18  import { fileURLToPath } from 'node:url';
 19  
 20  const __dirname = dirname(fileURLToPath(import.meta.url));
 21  const TASKS_FILE = join(__dirname, 'zhihu-tasks.json');
 22  const RESULTS_DIR = join(__dirname, 'results');
 23  
 24  interface BrowseTask {
 25    name: string;
 26    steps: string[];
 27    judge: JudgeCriteria;
 28    set?: 'test';
 29    note?: string;
 30    _comment?: string;
 31  }
 32  
 33  type JudgeCriteria =
 34    | { type: 'contains'; value: string }
 35    | { type: 'arrayMinLength'; minLength: number }
 36    | { type: 'nonEmpty' }
 37    | { type: 'matchesPattern'; pattern: string };
 38  
 39  interface TaskResult {
 40    name: string;
 41    passed: boolean;
 42    duration: number;
 43    error?: string;
 44    layer: string;
 45  }
 46  
 47  // Layer classification by task name
 48  function getLayer(name: string): string {
 49    const l1 = ['zhihu-open-home', 'zhihu-get-title', 'zhihu-state', 'zhihu-get-url', 'zhihu-scroll-down',
 50      'zhihu-click-tab-hot', 'zhihu-back-navigation', 'zhihu-wait-page-load', 'zhihu-keys-escape', 'zhihu-screenshot'];
 51    const l2 = ['zhihu-feed-titles', 'zhihu-hot-list', 'zhihu-hot-metrics', 'zhihu-nav-tabs',
 52      'zhihu-feed-with-authors', 'zhihu-feed-types', 'zhihu-user-avatar', 'zhihu-search-input-exists'];
 53    const l3 = ['zhihu-question-title', 'zhihu-question-meta', 'zhihu-first-answer', 'zhihu-answer-votes',
 54      'zhihu-question-buttons', 'zhihu-multiple-answers', 'zhihu-question-description', 'zhihu-answer-count-number'];
 55    const l4 = ['zhihu-hot-to-question', 'zhihu-feed-to-question', 'zhihu-question-to-author',
 56      'zhihu-search-navigate', 'zhihu-topic-page', 'zhihu-user-profile', 'zhihu-question-and-back', 'zhihu-scroll-load-more'];
 57    const l5 = ['zhihu-upvote-button-find', 'zhihu-follow-question-find', 'zhihu-comment-button-find',
 58      'zhihu-bookmark-find', 'zhihu-write-answer-btn', 'zhihu-share-find'];
 59    const l6 = ['zhihu-hot-read-answer-author', 'zhihu-hot-to-author-profile', 'zhihu-multi-hot-topics',
 60      'zhihu-search-then-read', 'zhihu-question-scroll-answers', 'zhihu-compare-tabs', 'zhihu-user-answers', 'zhihu-topic-questions'];
 61    const l7 = ['zhihu-search-basic', 'zhihu-search-people', 'zhihu-search-topic',
 62      'zhihu-search-click-result', 'zhihu-search-filter-answers', 'zhihu-search-and-back'];
 63    const l8 = ['zhihu-full-browse-workflow', 'zhihu-deep-author-chain', 'zhihu-cross-question-compare',
 64      'zhihu-search-read-chain', 'zhihu-3-page-chain', 'zhihu-hot-scroll-deep-read'];
 65  
 66    if (l1.includes(name)) return 'L1-atomic';
 67    if (l2.includes(name)) return 'L2-feed';
 68    if (l3.includes(name)) return 'L3-question';
 69    if (l4.includes(name)) return 'L4-navigation';
 70    if (l5.includes(name)) return 'L5-write';
 71    if (l6.includes(name)) return 'L6-chain';
 72    if (l7.includes(name)) return 'L7-search';
 73    if (l8.includes(name)) return 'L8-complex';
 74    return 'unknown';
 75  }
 76  
 77  function judge(criteria: JudgeCriteria, output: string): boolean {
 78    try {
 79      switch (criteria.type) {
 80        case 'contains':
 81          return output.toLowerCase().includes(criteria.value.toLowerCase());
 82        case 'arrayMinLength': {
 83          try {
 84            const arr = JSON.parse(output);
 85            if (Array.isArray(arr)) return arr.length >= criteria.minLength;
 86          } catch { /* not JSON array */ }
 87          return false;
 88        }
 89        case 'nonEmpty':
 90          return output.trim().length > 0 && output.trim() !== 'null' && output.trim() !== 'undefined';
 91        case 'matchesPattern':
 92          return new RegExp(criteria.pattern).test(output);
 93        default:
 94          return false;
 95      }
 96    } catch {
 97      return false;
 98    }
 99  }
100  
/**
 * Runs a shell command synchronously from the parent of this script's
 * directory and returns its trimmed stdout.
 *
 * Best-effort by design: when the command fails (execSync throws on a
 * non-zero exit or a timeout), the stdout attached to the error is returned
 * instead, or '' when there is none. stderr is piped rather than inherited,
 * so it is not forwarded to the terminal.
 *
 * @param cmd Shell command line to execute.
 * @param timeout Maximum run time in milliseconds (default 30000).
 * @returns Trimmed stdout of the command, possibly partial or empty on failure.
 */
function runCommand(cmd: string, timeout = 30000): string {
  let captured: string;

  try {
    const stdout = execSync(cmd, {
      cwd: join(__dirname, '..'),
      timeout,
      encoding: 'utf-8',
      env: process.env,
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    captured = stdout.trim();
  } catch (err: any) {
    // Keep whatever the command printed before it failed.
    captured = err.stdout?.trim() ?? '';
  }

  return captured;
}
114  
/**
 * Executes every step of a task in order and judges the output of the last one.
 *
 * @param task Task to run.
 * @returns A result record with the pass/fail verdict, elapsed milliseconds,
 *   the task's layer, and on failure a short error: the first 150 characters
 *   of the final output, or the first 100 characters of an exception message.
 */
function runTask(task: BrowseTask): TaskResult {
  const startedAt = Date.now();

  // Assembles the record at completion time so `duration` covers the whole run.
  const report = (passed: boolean, error: string | undefined): TaskResult => ({
    name: task.name,
    passed,
    duration: Date.now() - startedAt,
    error,
    layer: getLayer(task.name),
  });

  try {
    // Earlier steps only set up state; the final step's output is what gets judged.
    let finalOutput = '';
    for (const command of task.steps) {
      finalOutput = runCommand(command);
    }

    if (judge(task.judge, finalOutput)) {
      return report(true, undefined);
    }
    return report(false, `Output: ${finalOutput.slice(0, 150)}`);
  } catch (err: any) {
    return report(false, err.message?.slice(0, 100));
  }
}
143  
/**
 * Reads the value that follows `flag` on the command line.
 *
 * @returns The value; `null` when the flag is absent; `undefined` when the
 *   flag is present but not followed by a value (end of args or another flag).
 */
function flagValue(args: readonly string[], flag: string): string | null | undefined {
  const at = args.indexOf(flag);
  if (at === -1) return null;
  const value = args[at + 1];
  if (value === undefined || value.startsWith('--')) return undefined;
  return value;
}

/** Type guard for task-file entries that are runnable tasks (not `_comment` rows). */
function isBrowseTask(entry: unknown): entry is BrowseTask {
  return typeof entry === 'object' && entry !== null && 'name' in entry && 'steps' in entry;
}

/**
 * Picks the next round number from the file names already in the results
 * directory: one more than the highest existing `zhihu-NNN.json`, so a gap
 * left by a deleted file can never cause an existing result to be overwritten.
 */
function nextRoundNumber(fileNames: readonly string[]): number {
  let highest = 0;
  for (const fileName of fileNames) {
    const match = /^zhihu-(\d+)\.json$/.exec(fileName);
    if (match) {
      highest = Math.max(highest, Number.parseInt(match[1] ?? '0', 10));
    }
  }
  return highest + 1;
}

/** Best-effort browser shutdown; a failure to close must not abort the run. */
function closeBrowser(): void {
  try {
    runCommand('opencli browser close');
  } catch {
    /* ignore */
  }
}

/**
 * CLI entry point: loads the task file, runs the selected tasks one by one,
 * prints a per-layer summary and writes the results to a new
 * `zhihu-NNN.json` file in the results directory.
 *
 * Flags: `--task <name>` runs a single task; `--layer <n>` runs the tasks
 * whose layer label starts with `L<n>`; `--task` takes precedence.
 * Exits with status 1 when a flag lacks its value, the task file is not a
 * JSON array, or the selection matches no task. Failing tasks do not change
 * the exit status; the final `SCORE=` line reports them.
 */
function main(): void {
  const args = process.argv.slice(2);
  const singleTask = flagValue(args, '--task');
  const layerFilter = flagValue(args, '--layer');

  // A flag without a value used to fall through and run the entire suite.
  if (singleTask === undefined) {
    console.error('--task requires a task name.');
    process.exit(1);
  }
  if (layerFilter === undefined) {
    console.error('--layer requires a layer number.');
    process.exit(1);
  }

  const raw: unknown = JSON.parse(readFileSync(TASKS_FILE, 'utf-8'));
  if (!Array.isArray(raw)) {
    console.error(`Task file ${TASKS_FILE} must contain a JSON array.`);
    process.exit(1);
  }
  const allTasks: BrowseTask[] = raw.filter(isBrowseTask);

  let tasks = allTasks;
  if (singleTask) {
    tasks = allTasks.filter(t => t.name === singleTask);
  } else if (layerFilter) {
    const prefix = `L${layerFilter}`;
    tasks = allTasks.filter(t => getLayer(t.name).startsWith(prefix));
  }

  if (tasks.length === 0) {
    if (singleTask) {
      console.error(`Task "${singleTask}" not found.`);
    } else if (layerFilter) {
      console.error(`No tasks for layer ${layerFilter}.`);
    } else {
      console.error(`No tasks found in ${TASKS_FILE}.`);
    }
    process.exit(1);
  }

  console.log(`\n🔬 Zhihu Test Suite — ${tasks.length} tasks\n`);

  const results: TaskResult[] = [];

  for (const [i, task] of tasks.entries()) {
    process.stdout.write(`  [${i + 1}/${tasks.length}] ${task.name}...`);

    const result = runTask(task);
    results.push(result);

    const icon = result.passed ? '✓' : '✗';
    console.log(` ${icon} (${(result.duration / 1000).toFixed(1)}s)`);

    // Close browser between tasks for clean state
    if (i < tasks.length - 1) {
      closeBrowser();
    }
  }

  // Final close
  closeBrowser();

  // Summary by layer, computed once and reused for console output and the saved file.
  const layers = [...new Set(results.map(r => r.layer))].sort();
  const layerScores = layers.map((layer): [string, string] => {
    const layerResults = results.filter(r => r.layer === layer);
    const layerPassed = layerResults.filter(r => r.passed).length;
    return [layer, `${layerPassed}/${layerResults.length}`];
  });
  const totalPassed = results.filter(r => r.passed).length;
  const totalDuration = results.reduce((sum, r) => sum + r.duration, 0);
  const score = `${totalPassed}/${results.length}`;
  const durationText = `${Math.round(totalDuration / 60000)}min`;

  console.log(`\n${'─'.repeat(50)}`);
  console.log(`  Total:  ${score}`);
  for (const [layer, layerScore] of layerScores) {
    console.log(`  ${layer}: ${layerScore}`);
  }
  console.log(`  Time:   ${durationText}`);

  const failures = results.filter(r => !r.passed);
  if (failures.length > 0) {
    console.log(`\n  Failures:`);
    for (const f of failures) {
      console.log(`    ✗ [${f.layer}] ${f.name}: ${f.error ?? 'unknown'}`);
    }
  }
  console.log('');

  // Save result
  mkdirSync(RESULTS_DIR, { recursive: true });
  const roundNum = String(nextRoundNumber(readdirSync(RESULTS_DIR))).padStart(3, '0');
  const resultPath = join(RESULTS_DIR, `zhihu-${roundNum}.json`);
  writeFileSync(resultPath, JSON.stringify({
    timestamp: new Date().toISOString(),
    score,
    layers: Object.fromEntries(layerScores),
    duration: durationText,
    tasks: results,
  }, null, 2), 'utf-8');
  console.log(`  Results saved to: ${resultPath}`);
  console.log(`\nSCORE=${score}`);
}
229  
230  main();