// src/analysis.ts
/**
 * Shared API analysis helpers used by both explore.ts and record.ts.
 *
 * Extracts common logic for:
 *   - URL pattern normalization
 *   - Array path discovery in JSON responses
 *   - Field flattening and role detection
 *   - Capability name inference
 *   - Strategy inference
 *   - Auth indicator inference
 *   - Noise URL filtering
 *   - Query param classification
 */
 12  
 13  import {
 14    VOLATILE_PARAMS,
 15    SEARCH_PARAMS,
 16    PAGINATION_PARAMS,
 17    LIMIT_PARAMS,
 18    FIELD_ROLES,
 19  } from './constants.js';
 20  
 21  // ── URL pattern normalization ───────────────────────────────────────────────
 22  
 23  /** Normalize a full URL into a pattern (replace IDs, strip volatile params). */
 24  export function urlToPattern(url: string): string {
 25    try {
 26      const p = new URL(url);
 27      const pathNorm = p.pathname
 28        .replace(/\/\d+/g, '/{id}')
 29        .replace(/\/[0-9a-fA-F]{8,}/g, '/{hex}')
 30        .replace(/\/BV[a-zA-Z0-9]{10}/g, '/{bvid}');
 31      const params: string[] = [];
 32      p.searchParams.forEach((_v, k) => { if (!VOLATILE_PARAMS.has(k)) params.push(k); });
 33      return `${p.host}${pathNorm}${params.length ? '?' + params.sort().map(k => `${k}={}`).join('&') : ''}`;
 34    } catch { return url; }
 35  }
 36  
 37  // ── Array discovery in JSON responses ───────────────────────────────────────
 38  
 39  export interface ArrayDiscovery {
 40    path: string;
 41    items: unknown[];
 42  }
 43  
 44  /** Find the best (largest) array of objects in a JSON response body. */
 45  export function findArrayPath(obj: unknown, depth = 0): ArrayDiscovery | null {
 46    if (depth > 5 || !obj || typeof obj !== 'object') return null;
 47    if (Array.isArray(obj)) {
 48      if (obj.length >= 2 && obj.some(i => i && typeof i === 'object' && !Array.isArray(i))) {
 49        return { path: '', items: obj };
 50      }
 51      return null;
 52    }
 53    let best: ArrayDiscovery | null = null;
 54    for (const [key, val] of Object.entries(obj as Record<string, unknown>)) {
 55      const found = findArrayPath(val, depth + 1);
 56      if (found) {
 57        const fullPath = found.path ? `${key}.${found.path}` : key;
 58        const candidate = { path: fullPath, items: found.items };
 59        if (!best || candidate.items.length > best.items.length) best = candidate;
 60      }
 61    }
 62    return best;
 63  }
 64  
 65  // ── Field flattening & role detection ───────────────────────────────────────
 66  
 67  /** Flatten nested object keys up to maxDepth. */
 68  export function flattenFields(obj: unknown, prefix: string, maxDepth: number): string[] {
 69    if (maxDepth <= 0 || !obj || typeof obj !== 'object') return [];
 70    const names: string[] = [];
 71    const record = obj as Record<string, unknown>;
 72    for (const key of Object.keys(record)) {
 73      const full = prefix ? `${prefix}.${key}` : key;
 74      names.push(full);
 75      const val = record[key];
 76      if (val && typeof val === 'object' && !Array.isArray(val)) names.push(...flattenFields(val, full, maxDepth - 1));
 77    }
 78    return names;
 79  }
 80  
 81  /** Detect semantic field roles (title, url, author, etc.) from sample fields. */
 82  export function detectFieldRoles(sampleFields: string[]): Record<string, string> {
 83    const detectedFields: Record<string, string> = {};
 84    for (const [role, aliases] of Object.entries(FIELD_ROLES)) {
 85      for (const f of sampleFields) {
 86        if (aliases.includes(f.split('.').pop()?.toLowerCase() ?? '')) {
 87          detectedFields[role] = f;
 88          break;
 89        }
 90      }
 91    }
 92    return detectedFields;
 93  }
 94  
 95  // ── Capability name inference ───────────────────────────────────────────────
 96  
 97  /** Infer a CLI capability name from a URL. */
 98  export function inferCapabilityName(url: string, goal?: string): string {
 99    if (goal) return goal;
100    const u = url.toLowerCase();
101    if (u.includes('hot') || u.includes('popular') || u.includes('ranking') || u.includes('trending')) return 'hot';
102    if (u.includes('search')) return 'search';
103    if (u.includes('feed') || u.includes('timeline') || u.includes('dynamic')) return 'feed';
104    if (u.includes('comment') || u.includes('reply')) return 'comments';
105    if (u.includes('history')) return 'history';
106    if (u.includes('profile') || u.includes('userinfo') || u.includes('/me')) return 'me';
107    if (u.includes('favorite') || u.includes('collect') || u.includes('bookmark')) return 'favorite';
108    try {
109      const segs = new URL(url).pathname
110        .split('/')
111        .filter(s => s && !s.match(/^\d+$/) && !s.match(/^[0-9a-f]{8,}$/i) && !s.match(/^v\d+$/));
112      if (segs.length) return segs[segs.length - 1].replace(/[^a-z0-9]/gi, '_').toLowerCase();
113    } catch {}
114    return 'data';
115  }
116  
117  // ── Strategy inference ──────────────────────────────────────────────────────
118  
/**
 * Pick an auth strategy from detected indicators.
 *
 * @param authIndicators - Indicator names such as 'signature', 'bearer', 'csrf'.
 * @returns 'intercept' when 'signature' is present; otherwise 'header' when
 *          'bearer' or 'csrf' is present; otherwise 'cookie'.
 */
export function inferStrategy(authIndicators: string[]): string {
  const seen = new Set(authIndicators);
  if (seen.has('signature')) return 'intercept';
  return seen.has('bearer') || seen.has('csrf') ? 'header' : 'cookie';
}
125  
126  // ── Auth indicator detection ────────────────────────────────────────────────
127  
/**
 * Detect auth indicators from HTTP header names (values are not inspected).
 *
 * Header names are compared case-insensitively:
 *   - `authorization`                         -> 'bearer'
 *   - names starting with `x-csrf` / `x-xsrf` -> 'csrf'
 *   - names starting with `x-s`, or `x-t`     -> 'signature'
 *
 * @param headers - Header map; `undefined` yields an empty list.
 * @returns Indicators in the fixed order bearer, csrf, signature.
 */
export function detectAuthFromHeaders(headers?: Record<string, string>): string[] {
  if (!headers) return [];

  const names = Object.keys(headers).map(name => name.toLowerCase());
  const anyName = (matches: (name: string) => boolean): boolean => names.some(matches);

  const found: string[] = [];
  if (names.includes('authorization')) found.push('bearer');
  if (anyName(n => n.startsWith('x-csrf') || n.startsWith('x-xsrf'))) found.push('csrf');
  // 'x-s-common' is already covered by the 'x-s' prefix; kept to name the header explicitly.
  if (anyName(n => n.startsWith('x-s') || n === 'x-t' || n === 'x-s-common')) found.push('signature');
  return found;
}
138  
/**
 * Detect auth indicators from a URL and a response body (heuristic).
 *
 *   - 'signature': a top-level body key contains `sign` or `token`, or equals
 *     `w_rid` (keys compared lower-cased); or the URL contains `/wbi/` or `w_rid=`.
 *   - 'bearer': the URL contains `bearer` or `access_token`.
 *
 * Each indicator is reported at most once, even when both the body and the
 * URL point to it.
 *
 * @param url - Request URL (matched case-sensitively).
 * @param body - Parsed response body; only the top-level keys of an object are inspected.
 * @returns Indicators in the fixed order signature, bearer.
 */
export function detectAuthFromContent(url: string, body: unknown): string[] {
  const bodyKeys =
    body && typeof body === 'object'
      ? Object.keys(body as object).map(k => k.toLowerCase())
      : [];
  const signedBody = bodyKeys.some(k => k.includes('sign') || k === 'w_rid' || k.includes('token'));
  const signedUrl = url.includes('/wbi/') || url.includes('w_rid=');

  const indicators: string[] = [];
  // Single push for both sources so 'signature' is never duplicated.
  if (signedBody || signedUrl) indicators.push('signature');
  if (url.includes('bearer') || url.includes('access_token')) indicators.push('bearer');
  return indicators;
}
152  
153  // ── Noise filtering ─────────────────────────────────────────────────────────
154  
/**
 * Matches a `/` followed by a telemetry-style word (track, log, analytics,
 * beacon, pixel, ping, heartbeat, keepalive / keep-alive) that ends at a word
 * boundary, case-insensitively.
 */
const NOISE_URL_PATTERN = /\/(track|log|analytics|beacon|pixel|ping|heartbeat|keep.?alive)\b/i;

/**
 * Check whether a URL looks like tracking/telemetry noise rather than a business API.
 *
 * @param url - URL (or any string) to test; the whole string is searched.
 * @returns `true` when the noise pattern matches anywhere in `url`.
 */
export function isNoiseUrl(url: string): boolean {
  return NOISE_URL_PATTERN.exec(url) !== null;
}
161  
162  // ── Query param classification ──────────────────────────────────────────────
163  
/**
 * Extract the non-volatile query parameter names of a URL and classify them.
 *
 * Names listed in `VOLATILE_PARAMS` are skipped. An unparseable URL yields an
 * empty `params` list and all flags `false`.
 *
 * @param url - Absolute URL whose query string is inspected.
 * @returns `params` in query-string order, plus flags telling whether any of
 *          them is in `SEARCH_PARAMS`, `PAGINATION_PARAMS` or `LIMIT_PARAMS`.
 */
export function classifyQueryParams(url: string): {
  params: string[];
  hasSearch: boolean;
  hasPagination: boolean;
  hasLimit: boolean;
} {
  const params: string[] = [];
  try {
    const query = new URL(url).searchParams;
    query.forEach((_value, name) => {
      if (VOLATILE_PARAMS.has(name)) return;
      params.push(name);
    });
  } catch {
    // Unparseable URL: treat it as having no query parameters.
  }

  const hasSearch = params.some(name => SEARCH_PARAMS.has(name));
  const hasPagination = params.some(name => PAGINATION_PARAMS.has(name));
  const hasLimit = params.some(name => LIMIT_PARAMS.has(name));
  return { params, hasSearch, hasPagination, hasLimit };
}