// src/browser/analyze.test.ts
  1  import { describe, it, expect } from 'vitest';
  2  import {
  3    analyzeSite,
  4    detectAntiBot,
  5    classifyPattern,
  6    findNearestAdapter,
  7    type PageSignals,
  8  } from './analyze.js';
  9  import type { CliCommand } from '../registry.js';
 10  
 11  function mkSignals(overrides: Partial<PageSignals> = {}): PageSignals {
 12    return {
 13      requestedUrl: 'https://example.com/',
 14      finalUrl: 'https://example.com/',
 15      cookieNames: [],
 16      networkEntries: [],
 17      initialState: {
 18        __INITIAL_STATE__: false,
 19        __NUXT__: false,
 20        __NEXT_DATA__: false,
 21        __APOLLO_STATE__: false,
 22      },
 23      title: 'Example',
 24      ...overrides,
 25    };
 26  }
 27  
 28  function mkCmd(site: string, name: string, domain?: string): CliCommand {
 29    return {
 30      site,
 31      name,
 32      description: '',
 33      domain,
 34      args: [],
 35    };
 36  }
 37  
 38  describe('detectAntiBot', () => {
 39    it('flags Aliyun WAF from cookie', () => {
 40      const v = detectAntiBot(mkSignals({ cookieNames: ['JSESSIONID', 'acw_sc__v2'] }));
 41      expect(v.detected).toBe(true);
 42      expect(v.vendor).toBe('aliyun_waf');
 43      expect(v.evidence).toContain('cookie:acw_sc__v2');
 44      expect(v.implication).toMatch(/browser context/i);
 45    });
 46  
 47    it('flags Aliyun WAF from challenge HTML body', () => {
 48      const v = detectAntiBot(
 49        mkSignals({
 50          networkEntries: [
 51            {
 52              url: 'https://x.com/',
 53              status: 200,
 54              contentType: 'text/html',
 55              bodyPreview: "var arg1 = 'A1B2C3D4E5F6A7B8C9D0E1F2A3B4C5D6';",
 56            },
 57          ],
 58        }),
 59      );
 60      expect(v.detected).toBe(true);
 61      expect(v.vendor).toBe('aliyun_waf');
 62    });
 63  
 64    it('flags Cloudflare from cf_clearance cookie', () => {
 65      const v = detectAntiBot(mkSignals({ cookieNames: ['cf_clearance'] }));
 66      expect(v.vendor).toBe('cloudflare');
 67      expect(v.implication).toMatch(/Cloudflare/i);
 68    });
 69  
 70    it('flags Akamai from _abck cookie', () => {
 71      const v = detectAntiBot(mkSignals({ cookieNames: ['_abck', 'bm_sz'] }));
 72      expect(v.vendor).toBe('akamai');
 73    });
 74  
 75    it('returns no-match verdict with actionable fallback advice', () => {
 76      const v = detectAntiBot(mkSignals());
 77      expect(v.detected).toBe(false);
 78      expect(v.vendor).toBeNull();
 79      expect(v.implication).toMatch(/Node-side COOKIE fetch first/);
 80    });
 81  });
 82  
 83  describe('classifyPattern', () => {
 84    it('returns A for JSON-heavy pages without SSR state', () => {
 85      const v = classifyPattern(
 86        mkSignals({
 87          networkEntries: [
 88            { url: 'https://x.com/api/a', status: 200, contentType: 'application/json', bodyPreview: '{}' },
 89            { url: 'https://x.com/api/b', status: 200, contentType: 'application/json;charset=utf-8', bodyPreview: '{}' },
 90          ],
 91        }),
 92      );
 93      expect(v.pattern).toBe('A');
 94      expect(v.json_responses).toBe(2);
 95    });
 96  
 97    it('returns B when __INITIAL_STATE__ is present, beating JSON signals', () => {
 98      const v = classifyPattern(
 99        mkSignals({
100          initialState: { __INITIAL_STATE__: true, __NUXT__: false, __NEXT_DATA__: false, __APOLLO_STATE__: false },
101          networkEntries: [
102            { url: 'https://x.com/api/a', status: 200, contentType: 'application/json', bodyPreview: '{}' },
103          ],
104        }),
105      );
106      expect(v.pattern).toBe('B');
107    });
108  
109    it('returns D when auth failures dominate', () => {
110      const v = classifyPattern(
111        mkSignals({
112          networkEntries: [
113            { url: 'https://x.com/api/a', status: 401, contentType: 'application/json', bodyPreview: '' },
114            { url: 'https://x.com/api/b', status: 403, contentType: 'application/json', bodyPreview: '' },
115          ],
116        }),
117      );
118      expect(v.pattern).toBe('D');
119      expect(v.auth_failures).toBe(2);
120    });
121  
122    it('returns C by default for static pages', () => {
123      const v = classifyPattern(mkSignals());
124      expect(v.pattern).toBe('C');
125    });
126  });
127  
128  describe('findNearestAdapter', () => {
129    it('matches by domain suffix', () => {
130      const reg = new Map<string, CliCommand>([
131        ['51job search', mkCmd('51job', 'search', '51job.com')],
132        ['51job detail', mkCmd('51job', 'detail', '51job.com')],
133        ['xueqiu search', mkCmd('xueqiu', 'search', 'xueqiu.com')],
134      ]);
135      const v = findNearestAdapter('https://jobs.51job.com/', reg);
136      expect(v?.site).toBe('51job');
137      expect(v?.example_commands).toContain('51job search');
138    });
139  
140    it('falls back to site-name containment when no domain is registered', () => {
141      const reg = new Map<string, CliCommand>([
142        ['51job search', mkCmd('51job', 'search')],
143      ]);
144      const v = findNearestAdapter('https://we.51job.com/', reg);
145      expect(v?.site).toBe('51job');
146    });
147  
148    it('returns null when no adapter matches', () => {
149      const reg = new Map<string, CliCommand>([
150        ['xueqiu search', mkCmd('xueqiu', 'search', 'xueqiu.com')],
151      ]);
152      const v = findNearestAdapter('https://random-site.io/', reg);
153      expect(v).toBeNull();
154    });
155  
156    it('prefers the site with the most commands', () => {
157      const reg = new Map<string, CliCommand>([
158        ['a search', mkCmd('a', 'search', 'a.com')],
159        ['b search', mkCmd('b', 'search', 'a.com')],
160        ['b detail', mkCmd('b', 'detail', 'a.com')],
161        ['b company', mkCmd('b', 'company', 'a.com')],
162      ]);
163      const v = findNearestAdapter('https://jobs.a.com/', reg);
164      expect(v?.site).toBe('b');
165    });
166  });
167  
168  describe('analyzeSite', () => {
169    it('recommends browser-context fetch when WAF is detected', () => {
170      const report = analyzeSite(
171        mkSignals({ cookieNames: ['acw_sc__v2'] }),
172        new Map(),
173      );
174      expect(report.anti_bot.vendor).toBe('aliyun_waf');
175      expect(report.recommended_next_step).toMatch(/browser context/i);
176    });
177  
178    it('recommends reading SSR state when Pattern B fires', () => {
179      const report = analyzeSite(
180        mkSignals({
181          initialState: { __INITIAL_STATE__: false, __NUXT__: true, __NEXT_DATA__: false, __APOLLO_STATE__: false },
182        }),
183        new Map(),
184      );
185      expect(report.pattern.pattern).toBe('B');
186      expect(report.recommended_next_step).toMatch(/__NUXT__|__INITIAL_STATE__|__NEXT_DATA__/);
187    });
188  
189    it('includes __APOLLO_STATE__ in Pattern B next-step guidance', () => {
190      const report = analyzeSite(
191        mkSignals({
192          initialState: { __INITIAL_STATE__: false, __NUXT__: false, __NEXT_DATA__: false, __APOLLO_STATE__: true },
193        }),
194        new Map(),
195      );
196      expect(report.pattern.pattern).toBe('B');
197      expect(report.recommended_next_step).toMatch(/__APOLLO_STATE__/);
198    });
199  
200    it('includes nearest_adapter when the registry has a match', () => {
201      const reg = new Map<string, CliCommand>([
202        ['51job search', mkCmd('51job', 'search', '51job.com')],
203      ]);
204      const report = analyzeSite(
205        mkSignals({ finalUrl: 'https://we.51job.com/' }),
206        reg,
207      );
208      expect(report.nearest_adapter?.site).toBe('51job');
209    });
210  });