// analyze.test.ts
//
// Unit tests for the site-analysis helpers exported by `./analyze.js`:
// `detectAntiBot`, `classifyPattern`, `findNearestAdapter`, and the
// `analyzeSite` entry point that combines them into a single report.
import { describe, it, expect } from 'vitest';
import {
  analyzeSite,
  detectAntiBot,
  classifyPattern,
  findNearestAdapter,
  type PageSignals,
} from './analyze.js';
import type { CliCommand } from '../registry.js';

/**
 * Builds a `PageSignals` fixture for a page with no cookies, no captured
 * network traffic, and every SSR-state flag switched off.
 *
 * @param overrides Fields that replace the baseline values (shallow merge, so
 *   passing `initialState` replaces the whole flag object).
 * @returns A complete `PageSignals` object.
 */
function mkSignals(overrides: Partial<PageSignals> = {}): PageSignals {
  return {
    requestedUrl: 'https://example.com/',
    finalUrl: 'https://example.com/',
    cookieNames: [],
    networkEntries: [],
    initialState: {
      __INITIAL_STATE__: false,
      __NUXT__: false,
      __NEXT_DATA__: false,
      __APOLLO_STATE__: false,
    },
    title: 'Example',
    ...overrides,
  };
}

/**
 * Builds a minimal `CliCommand` registry entry with an empty description and
 * no arguments.
 *
 * @param site Adapter (site) name the command belongs to.
 * @param name Command name within that site.
 * @param domain Optional domain for the adapter; left `undefined` when omitted.
 * @returns The command fixture.
 */
function mkCmd(site: string, name: string, domain?: string): CliCommand {
  return {
    site,
    name,
    description: '',
    domain,
    args: [],
  };
}

// Anti-bot vendor detection from cookie names and response body previews.
describe('detectAntiBot', () => {
  it('flags Aliyun WAF from cookie', () => {
    const verdict = detectAntiBot(mkSignals({ cookieNames: ['JSESSIONID', 'acw_sc__v2'] }));
    expect(verdict.detected).toBe(true);
    expect(verdict.vendor).toBe('aliyun_waf');
    expect(verdict.evidence).toContain('cookie:acw_sc__v2');
    expect(verdict.implication).toMatch(/browser context/i);
  });

  it('flags Aliyun WAF from challenge HTML body', () => {
    // No telltale cookie here: only the HTML body preview carries the signal.
    const verdict = detectAntiBot(
      mkSignals({
        networkEntries: [
          {
            url: 'https://x.com/',
            status: 200,
            contentType: 'text/html',
            bodyPreview: "var arg1 = 'A1B2C3D4E5F6A7B8C9D0E1F2A3B4C5D6';",
          },
        ],
      }),
    );
    expect(verdict.detected).toBe(true);
    expect(verdict.vendor).toBe('aliyun_waf');
  });

  it('flags Cloudflare from cf_clearance cookie', () => {
    const verdict = detectAntiBot(mkSignals({ cookieNames: ['cf_clearance'] }));
    // Assert `detected` as well, consistent with the Aliyun cases above.
    expect(verdict.detected).toBe(true);
    expect(verdict.vendor).toBe('cloudflare');
    expect(verdict.implication).toMatch(/Cloudflare/i);
  });

  it('flags Akamai from _abck cookie', () => {
    const verdict = detectAntiBot(mkSignals({ cookieNames: ['_abck', 'bm_sz'] }));
    // Assert `detected` as well, consistent with the Aliyun cases above.
    expect(verdict.detected).toBe(true);
    expect(verdict.vendor).toBe('akamai');
  });

  it('returns no-match verdict with actionable fallback advice', () => {
    const verdict = detectAntiBot(mkSignals());
    expect(verdict.detected).toBe(false);
    expect(verdict.vendor).toBeNull();
    expect(verdict.implication).toMatch(/Node-side COOKIE fetch first/);
  });
});

// Page-pattern classification (A / B / C / D) from network and SSR-state signals.
describe('classifyPattern', () => {
  it('returns A for JSON-heavy pages without SSR state', () => {
    const verdict = classifyPattern(
      mkSignals({
        networkEntries: [
          { url: 'https://x.com/api/a', status: 200, contentType: 'application/json', bodyPreview: '{}' },
          // A content type with a charset suffix is expected to count as JSON too.
          { url: 'https://x.com/api/b', status: 200, contentType: 'application/json;charset=utf-8', bodyPreview: '{}' },
        ],
      }),
    );
    expect(verdict.pattern).toBe('A');
    expect(verdict.json_responses).toBe(2);
  });

  it('returns B when __INITIAL_STATE__ is present, beating JSON signals', () => {
    const verdict = classifyPattern(
      mkSignals({
        initialState: { __INITIAL_STATE__: true, __NUXT__: false, __NEXT_DATA__: false, __APOLLO_STATE__: false },
        networkEntries: [
          { url: 'https://x.com/api/a', status: 200, contentType: 'application/json', bodyPreview: '{}' },
        ],
      }),
    );
    expect(verdict.pattern).toBe('B');
  });

  it('returns D when auth failures dominate', () => {
    const verdict = classifyPattern(
      mkSignals({
        networkEntries: [
          { url: 'https://x.com/api/a', status: 401, contentType: 'application/json', bodyPreview: '' },
          { url: 'https://x.com/api/b', status: 403, contentType: 'application/json', bodyPreview: '' },
        ],
      }),
    );
    expect(verdict.pattern).toBe('D');
    expect(verdict.auth_failures).toBe(2);
  });

  it('returns C by default for static pages', () => {
    const verdict = classifyPattern(mkSignals());
    expect(verdict.pattern).toBe('C');
  });
});

// Lookup of the closest existing adapter for a URL in the command registry.
describe('findNearestAdapter', () => {
  it('matches by domain suffix', () => {
    const registry = new Map<string, CliCommand>([
      ['51job search', mkCmd('51job', 'search', '51job.com')],
      ['51job detail', mkCmd('51job', 'detail', '51job.com')],
      ['xueqiu search', mkCmd('xueqiu', 'search', 'xueqiu.com')],
    ]);
    const match = findNearestAdapter('https://jobs.51job.com/', registry);
    expect(match?.site).toBe('51job');
    expect(match?.example_commands).toContain('51job search');
  });

  it('falls back to site-name containment when no domain is registered', () => {
    const registry = new Map<string, CliCommand>([
      ['51job search', mkCmd('51job', 'search')],
    ]);
    const match = findNearestAdapter('https://we.51job.com/', registry);
    expect(match?.site).toBe('51job');
  });

  it('returns null when no adapter matches', () => {
    const registry = new Map<string, CliCommand>([
      ['xueqiu search', mkCmd('xueqiu', 'search', 'xueqiu.com')],
    ]);
    const match = findNearestAdapter('https://random-site.io/', registry);
    expect(match).toBeNull();
  });

  it('prefers the site with the most commands', () => {
    // Sites `a` (1 command) and `b` (3 commands) are registered under the same domain.
    const registry = new Map<string, CliCommand>([
      ['a search', mkCmd('a', 'search', 'a.com')],
      ['b search', mkCmd('b', 'search', 'a.com')],
      ['b detail', mkCmd('b', 'detail', 'a.com')],
      ['b company', mkCmd('b', 'company', 'a.com')],
    ]);
    const match = findNearestAdapter('https://jobs.a.com/', registry);
    expect(match?.site).toBe('b');
  });
});

// End-to-end report: anti-bot verdict, pattern, next step, and nearest adapter.
describe('analyzeSite', () => {
  it('recommends browser-context fetch when WAF is detected', () => {
    const report = analyzeSite(
      mkSignals({ cookieNames: ['acw_sc__v2'] }),
      new Map(),
    );
    expect(report.anti_bot.vendor).toBe('aliyun_waf');
    expect(report.recommended_next_step).toMatch(/browser context/i);
  });

  it('recommends reading SSR state when Pattern B fires', () => {
    const report = analyzeSite(
      mkSignals({
        initialState: { __INITIAL_STATE__: false, __NUXT__: true, __NEXT_DATA__: false, __APOLLO_STATE__: false },
      }),
      new Map(),
    );
    expect(report.pattern.pattern).toBe('B');
    expect(report.recommended_next_step).toMatch(/__NUXT__|__INITIAL_STATE__|__NEXT_DATA__/);
  });

  it('includes __APOLLO_STATE__ in Pattern B next-step guidance', () => {
    const report = analyzeSite(
      mkSignals({
        initialState: { __INITIAL_STATE__: false, __NUXT__: false, __NEXT_DATA__: false, __APOLLO_STATE__: true },
      }),
      new Map(),
    );
    expect(report.pattern.pattern).toBe('B');
    expect(report.recommended_next_step).toMatch(/__APOLLO_STATE__/);
  });

  it('includes nearest_adapter when the registry has a match', () => {
    const registry = new Map<string, CliCommand>([
      ['51job search', mkCmd('51job', 'search', '51job.com')],
    ]);
    const report = analyzeSite(
      mkSignals({ finalUrl: 'https://we.51job.com/' }),
      registry,
    );
    expect(report.nearest_adapter?.site).toBe('51job');
  });
});