// analysis.ts
/**
 * Shared API analysis helpers used by both explore.ts and record.ts.
 *
 * Extracts common logic for:
 *   - URL pattern normalization
 *   - Array path discovery in JSON responses
 *   - Field role detection
 *   - Auth indicator inference
 *   - Capability name inference
 *   - Strategy inference
 *   - Noise URL filtering
 *   - Query param classification
 */

import {
  VOLATILE_PARAMS,
  SEARCH_PARAMS,
  PAGINATION_PARAMS,
  LIMIT_PARAMS,
  FIELD_ROLES,
} from './constants.js';

// ── URL pattern normalization ───────────────────────────────────────────────

// Each ID matcher refuses to stop in the middle of an alphanumeric run
// (`(?![A-Za-z0-9])`). Without that guard the numeric rule would rewrite only
// the leading digits of a segment such as `/1a2b3c4d5e`, leaving the variable
// tail in the pattern and hiding the segment from the hex rule.
const NUMERIC_ID_SEGMENT = /\/\d+(?![A-Za-z0-9])/g;
const HEX_ID_SEGMENT = /\/[0-9a-fA-F]{8,}(?![A-Za-z0-9])/g;
const BVID_SEGMENT = /\/BV[a-zA-Z0-9]{10}(?![A-Za-z0-9])/g;

/** Collect the query parameter names of `parsed` that are not listed in VOLATILE_PARAMS, in URL order. */
function stableQueryParamKeys(parsed: URL): string[] {
  const keys: string[] = [];
  parsed.searchParams.forEach((_value, key) => {
    if (!VOLATILE_PARAMS.has(key)) keys.push(key);
  });
  return keys;
}

/**
 * Normalize a full URL into a pattern (replace IDs, strip volatile params).
 *
 * The result is `host + normalized path`, followed by the sorted non-volatile
 * query parameter names rendered as `name={}` when there are any. Path
 * segments are rewritten in this order: numeric → `{id}`, hex run of 8+
 * characters → `{hex}`, `BV` + 10 alphanumerics → `{bvid}`.
 *
 * @param url Absolute URL to normalize.
 * @returns The pattern string, or `url` unchanged when it cannot be parsed.
 */
export function urlToPattern(url: string): string {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return url;
  }

  const normalizedPath = parsed.pathname
    .replace(NUMERIC_ID_SEGMENT, '/{id}')
    .replace(HEX_ID_SEGMENT, '/{hex}')
    .replace(BVID_SEGMENT, '/{bvid}');

  const keys = stableQueryParamKeys(parsed).sort();
  const query = keys.length ? '?' + keys.map(k => `${k}={}`).join('&') : '';
  return `${parsed.host}${normalizedPath}${query}`;
}

// ── Array discovery in JSON responses ───────────────────────────────────────

export interface ArrayDiscovery {
  /** Dot-joined object keys leading to the array; `''` when the root value is the array. */
  path: string;
  /** The discovered array itself (not a copy). */
  items: unknown[];
}

/**
 * Find the best (largest) array of objects in a JSON response body.
 *
 * An array qualifies when it has at least two elements and at least one of
 * them is a non-array object. Arrays that do not qualify are not searched
 * further. Object properties are searched recursively; among qualifying
 * arrays the longest wins, and on a tie the first one encountered is kept.
 *
 * @param obj   Parsed JSON value to search.
 * @param depth Current recursion depth; the search gives up below depth 5.
 * @returns The discovered array and its path, or `null` when none qualifies.
 */
export function findArrayPath(obj: unknown, depth = 0): ArrayDiscovery | null {
  if (depth > 5 || !obj || typeof obj !== 'object') return null;

  if (Array.isArray(obj)) {
    const hasObjectItem = obj.some(item => item && typeof item === 'object' && !Array.isArray(item));
    return obj.length >= 2 && hasObjectItem ? { path: '', items: obj } : null;
  }

  let best: ArrayDiscovery | null = null;
  for (const [key, value] of Object.entries(obj as Record<string, unknown>)) {
    const found = findArrayPath(value, depth + 1);
    if (!found) continue;
    if (!best || found.items.length > best.items.length) {
      best = { path: found.path ? `${key}.${found.path}` : key, items: found.items };
    }
  }
  return best;
}

// ── Field flattening & role detection ───────────────────────────────────────

/**
 * Flatten nested object keys up to maxDepth.
 *
 * Every key is emitted as a dot-joined path (prefixed with `prefix` when it is
 * non-empty). Nested plain objects are descended into; arrays are listed by
 * key but not descended into.
 *
 * @param obj      Value whose keys are listed; non-objects yield `[]`.
 * @param prefix   Path prefix for emitted names (`''` for none).
 * @param maxDepth Number of nesting levels to emit; `<= 0` yields `[]`.
 */
export function flattenFields(obj: unknown, prefix: string, maxDepth: number): string[] {
  if (maxDepth <= 0 || !obj || typeof obj !== 'object') return [];

  const names: string[] = [];
  for (const [key, value] of Object.entries(obj as Record<string, unknown>)) {
    const qualified = prefix ? `${prefix}.${key}` : key;
    names.push(qualified);
    if (value && typeof value === 'object' && !Array.isArray(value)) {
      names.push(...flattenFields(value, qualified, maxDepth - 1));
    }
  }
  return names;
}

/**
 * Detect semantic field roles (title, url, author, etc.) from sample fields.
 *
 * For each role in FIELD_ROLES, the first sample field whose last dot-separated
 * component (lowercased) appears in that role's alias list is recorded.
 *
 * @param sampleFields Field paths, e.g. as produced by `flattenFields`.
 * @returns Map from role name to the matching field path; roles without a match are absent.
 */
export function detectFieldRoles(sampleFields: string[]): Record<string, string> {
  const detected: Record<string, string> = {};
  for (const [role, aliases] of Object.entries(FIELD_ROLES)) {
    const match = sampleFields.find(field =>
      aliases.includes(field.split('.').pop()?.toLowerCase() ?? ''),
    );
    if (match !== undefined) detected[role] = match;
  }
  return detected;
}

// ── Capability name inference ───────────────────────────────────────────────

// `/me` must be a whole path segment; a plain substring test would also fire
// on `/media`, `/messages`, `/members`, ...
const ME_SEGMENT = /\/me(?:[/?#]|$)/;

/**
 * Infer a CLI capability name from a URL.
 *
 * A truthy `goal` is returned verbatim. Otherwise the lowercased URL (host and
 * query included) is checked for keyword substrings in a fixed order; the
 * first hit decides the name. If nothing matches, the last path segment that
 * is not a number, a hex run of 8+ characters, or a `v<digits>` version marker
 * is sanitized (non-alphanumerics → `_`, lowercased) and used.
 *
 * @param url  Request URL to classify.
 * @param goal Optional explicit capability name that overrides inference.
 * @returns The inferred name, or `'data'` when nothing usable is found.
 */
export function inferCapabilityName(url: string, goal?: string): string {
  if (goal) return goal;

  const lowered = url.toLowerCase();
  const mentions = (...keywords: string[]): boolean => keywords.some(k => lowered.includes(k));

  if (mentions('hot', 'popular', 'ranking', 'trending')) return 'hot';
  if (mentions('search')) return 'search';
  if (mentions('feed', 'timeline', 'dynamic')) return 'feed';
  if (mentions('comment', 'reply')) return 'comments';
  if (mentions('history')) return 'history';
  if (mentions('profile', 'userinfo') || ME_SEGMENT.test(lowered)) return 'me';
  if (mentions('favorite', 'collect', 'bookmark')) return 'favorite';

  try {
    const segments = new URL(url).pathname
      .split('/')
      .filter(s => s !== '' && !/^\d+$/.test(s) && !/^[0-9a-f]{8,}$/i.test(s) && !/^v\d+$/.test(s));
    const last = segments[segments.length - 1];
    if (last !== undefined) return last.replace(/[^a-z0-9]/gi, '_').toLowerCase();
  } catch {
    // Not an absolute URL: fall through to the generic name.
  }
  return 'data';
}

// ── Strategy inference ──────────────────────────────────────────────────────

/**
 * Infer auth strategy from detected indicators.
 *
 * @param authIndicators Indicator names such as `'signature'`, `'bearer'`, `'csrf'`.
 * @returns `'intercept'` when a signature is present, else `'header'` for
 *          bearer/csrf, else `'cookie'`.
 */
export function inferStrategy(authIndicators: string[]): string {
  if (authIndicators.includes('signature')) return 'intercept';
  if (authIndicators.includes('bearer') || authIndicators.includes('csrf')) return 'header';
  return 'cookie';
}

// ── Auth indicator detection ────────────────────────────────────────────────

/**
 * Detect auth indicators from HTTP headers.
 *
 * Header names are compared case-insensitively; header values are not read.
 *
 * @param headers Request headers, if available.
 * @returns Any of `'bearer'`, `'csrf'`, `'signature'`, in that order.
 */
export function detectAuthFromHeaders(headers?: Record<string, string>): string[] {
  if (!headers) return [];

  const names = Object.keys(headers).map(k => k.toLowerCase());
  const indicators: string[] = [];

  if (names.includes('authorization')) indicators.push('bearer');
  if (names.some(k => k.startsWith('x-csrf') || k.startsWith('x-xsrf'))) indicators.push('csrf');
  // The `x-s` prefix already covers `x-s-common`.
  // NOTE(review): it also matches every other header starting with `x-s`
  // (e.g. `x-session-id`) — confirm that breadth is intended.
  if (names.some(k => k.startsWith('x-s') || k === 'x-t')) indicators.push('signature');

  return indicators;
}

/**
 * Detect auth indicators from URL and response body (heuristic).
 *
 * Only the top-level keys of `body` are inspected. Each indicator appears at
 * most once in the result even when both the body and the URL suggest it.
 *
 * @param url  Request URL.
 * @param body Parsed response body, if any.
 * @returns Any of `'signature'`, `'bearer'`, in that order.
 */
export function detectAuthFromContent(url: string, body: unknown): string[] {
  const indicators = new Set<string>();

  if (body && typeof body === 'object') {
    const keys = Object.keys(body).map(k => k.toLowerCase());
    if (keys.some(k => k.includes('sign') || k === 'w_rid' || k.includes('token'))) {
      indicators.add('signature');
    }
  }
  if (url.includes('/wbi/') || url.includes('w_rid=')) indicators.add('signature');
  if (url.includes('bearer') || url.includes('access_token')) indicators.add('bearer');

  return [...indicators];
}

// ── Noise filtering ─────────────────────────────────────────────────────────

const NOISE_URL_PATTERN = /\/(track|log|analytics|beacon|pixel|ping|heartbeat|keep.?alive)\b/i;

/** Check whether a URL looks like tracking/telemetry noise rather than a business API. */
export function isNoiseUrl(url: string): boolean {
  return NOISE_URL_PATTERN.test(url);
}

// ── Query param classification ──────────────────────────────────────────────

/**
 * Extract non-volatile query params and classify them.
 *
 * @param url Absolute URL; an unparseable URL yields no params and all flags `false`.
 * @returns The non-volatile parameter names in URL order, plus flags telling
 *          whether any of them is a known search, pagination or limit parameter.
 */
export function classifyQueryParams(url: string): {
  params: string[];
  hasSearch: boolean;
  hasPagination: boolean;
  hasLimit: boolean;
} {
  let params: string[] = [];
  try {
    params = stableQueryParamKeys(new URL(url));
  } catch {
    // Unparseable URL: report no params rather than failing the caller.
  }
  return {
    params,
    hasSearch: params.some(p => SEARCH_PARAMS.has(p)),
    hasPagination: params.some(p => PAGINATION_PARAMS.has(p)),
    hasLimit: params.some(p => LIMIT_PARAMS.has(p)),
  };
}