/ src / snapshotFormatter.ts
snapshotFormatter.ts
  1  /**
  2   * Aria snapshot formatter: parses snapshot text into clean format.
  3   *
  4   * 4-pass pipeline:
  5   * 1. Parse & filter: strip annotations, metadata, noise, ads, boilerplate subtrees
  6   * 2. Deduplicate: generic/text parent match, heading+link, nested identical links
  7   * 3. Prune: empty containers (iterative bottom-up)
  8   * 4. Collapse: single-child containers
  9   */
 10  
 11  import type { SnapshotOptions } from './types.js';
 12  
 13  const DEFAULT_MAX_TEXT_LENGTH = 200;
 14  
 15  // Roles that are pure noise and should always be filtered
 16  const NOISE_ROLES = new Set([
 17    'none', 'presentation', 'separator', 'paragraph', 'tooltip', 'status',
 18  ]);
 19  
 20  // Roles whose entire subtree should be removed (footer boilerplate, etc.)
 21  const SUBTREE_NOISE_ROLES = new Set([
 22    'contentinfo',
 23  ]);
 24  
 25  // Roles considered interactive (clickable/typeable)
 26  const INTERACTIVE_ROLES = new Set([
 27    'button', 'link', 'textbox', 'checkbox', 'radio',
 28    'combobox', 'tab', 'menuitem', 'option', 'switch',
 29    'slider', 'spinbutton', 'searchbox',
 30  ]);
 31  
 32  // Structural landmark roles kept even in interactive mode
 33  const LANDMARK_ROLES = new Set([
 34    'main', 'navigation', 'banner', 'heading', 'search',
 35    'region', 'list', 'listitem', 'article', 'complementary',
 36    'group', 'toolbar', 'tablist',
 37  ]);
 38  
 39  // Container roles eligible for pruning and collapse
 40  const CONTAINER_ROLES = new Set([
 41    'list', 'listitem', 'group', 'toolbar', 'tablist',
 42    'navigation', 'region', 'complementary',
 43    'search', 'article', 'paragraph', 'figure',
 44  ]);
 45  
 46  // Decorator / separator text that adds no semantic value
 47  const DECORATOR_TEXT = new Set(['•', '·', '|', '—', '-', '/', '\\']);
 48  
 49  // Ad-related URL patterns
 50  const AD_URL_PATTERNS = [
 51    'googleadservices.com/pagead/',
 52    'alb.reddit.com/cr?',
 53    'doubleclick.net/',
 54    'cm.bilibili.com/cm/api/fees/',
 55  ];
 56  
 57  // Boilerplate button labels to filter (back-to-top, etc.)
 58  const BOILERPLATE_LABELS = [
 59    '回到顶部', 'back to top', 'scroll to top', 'go to top',
 60  ];
 61  
 62  /**
 63   * Parse role and text from a trimmed snapshot line.
 64   * Handles quoted labels and trailing text after colon correctly,
 65   * including lines wrapped in single quotes.
 66   */
 67  function parseLine(trimmed: string): { role: string; text: string; hasText: boolean; trailingText: string } {
 68    // Unwrap outer single quotes if present (snapshot wraps lines with special chars)
 69    let line = trimmed;
 70    if (line.startsWith("'") && line.endsWith("':")) {
 71      line = line.slice(1, -2) + ':';
 72    } else if (line.startsWith("'") && line.endsWith("'")) {
 73      line = line.slice(1, -1);
 74    }
 75  
 76    // Role is the first word
 77    const roleMatch = line.match(/^([a-zA-Z]+)\b/);
 78    const role = roleMatch ? roleMatch[1].toLowerCase() : '';
 79  
 80    // Extract quoted text content (the semantic label)
 81    const textMatch = line.match(/"([^"]*)"/);
 82    const text = textMatch ? textMatch[1] : '';
 83  
 84    // For trailing text: strip annotations and quoted strings first, then check after last colon
 85    // This avoids matching colons inside quoted labels like "Account: user@email.com"
 86    let stripped = line;
 87    // Remove all quoted strings
 88    stripped = stripped.replace(/"[^"]*"/g, '""');
 89    // Remove all bracket annotations
 90    stripped = stripped.replace(/\[[^\]]*\]/g, '');
 91  
 92    const colonIdx = stripped.lastIndexOf(':');
 93    let trailingText = '';
 94    if (colonIdx !== -1) {
 95      const afterColon = stripped.slice(colonIdx + 1).trim();
 96      if (afterColon.length > 0) {
 97        // Get the actual trailing text from original line at same position
 98        const origColonIdx = line.lastIndexOf(':');
 99        if (origColonIdx !== -1) {
100          trailingText = line.slice(origColonIdx + 1).trim();
101        }
102      }
103    }
104  
105    return { role, text, hasText: text.length > 0 || trailingText.length > 0, trailingText };
106  }
107  
/**
 * Remove every `[...]` annotation that sits outside double-quoted strings.
 *
 * Outer single quotes (`'...'` or `'...':`) are removed first. The line is
 * then cut at each double quote; only the segments at even positions (the
 * unquoted ones) have their annotations removed. Finally runs of two or more
 * whitespace characters are squeezed to one space and the result is trimmed.
 *
 * @param content - Snapshot line without indentation or list marker.
 * @returns The line with annotations removed.
 */
function stripAnnotations(content: string): string {
  const singleQuoted = content.startsWith("'");

  let unwrapped = content;
  if (singleQuoted && content.endsWith("':")) {
    unwrapped = `${content.slice(1, -2)}:`;
  } else if (singleQuoted && content.endsWith("'")) {
    unwrapped = content.slice(1, -1);
  }

  return unwrapped
    .split('"')
    .map((segment, position) =>
      // Odd positions are the insides of double-quoted strings: leave them alone
      position % 2 === 0 ? segment.replace(/\s*\[[^\]]*\]/g, '') : segment,
    )
    .join('"')
    .replace(/\s{2,}/g, ' ')
    .trim();
}
133  
/**
 * Tell whether a line is a property line of the form `/name: ...`
 * (a slash, one or more ASCII letters, then a colon), such as `/url: ...`.
 *
 * @param trimmed - Snapshot line without indentation or list marker.
 */
function isMetadataLine(trimmed: string): boolean {
  const metadataPrefix = /^\/[a-zA-Z]+:/;
  return trimmed.match(metadataPrefix) !== null;
}
140  
/**
 * Tell whether `text`, once surrounding whitespace is removed, is exactly one
 * of the separator glyphs listed in DECORATOR_TEXT.
 */
function isDecoratorText(text: string): boolean {
  const candidate = text.trim();
  return DECORATOR_TEXT.has(candidate);
}
147  
/**
 * Tell whether a node looks like an advertisement.
 *
 * The label and trailing text are searched case-insensitively for the words
 * "sponsored", "advertisement" or "广告", and case-sensitively for any of the
 * tracking-URL fragments in AD_URL_PATTERNS.
 *
 * @param text - Quoted label of the node.
 * @param trailingText - Text following the node's colon.
 */
function isAdNode(text: string, trailingText: string): boolean {
  const haystack = `${text} ${trailingText}`.toLowerCase();

  const mentionsAd = ['sponsored', 'advertisement', '广告'].some(word => haystack.includes(word));
  if (mentionsAd) {
    return true;
  }

  // Tracking URLs are matched against the raw (not lower-cased) strings
  return AD_URL_PATTERNS.some(
    fragment => text.includes(fragment) || trailingText.includes(fragment),
  );
}
161  
/**
 * Tell whether a label belongs to boilerplate UI (back-to-top buttons and the
 * like): true when the lower-cased label contains any BOILERPLATE_LABELS entry.
 */
function isBoilerplateNode(text: string): boolean {
  const lowered = text.toLowerCase();
  for (const label of BOILERPLATE_LABELS) {
    if (lowered.includes(label)) {
      return true;
    }
  }
  return false;
}
169  
/**
 * Decide whether a single node should be dropped as noise.
 *
 * A node is noise when its role is in NOISE_ROLES, when it is a `generic` or
 * `img` node without any text, or when it is a `generic`/`text` node whose
 * text (trailing text preferred over the label) is only a separator glyph.
 *
 * @param role - Lower-cased role of the node.
 * @param hasText - Whether the node has a label or trailing text.
 * @param text - Quoted label of the node.
 * @param trailingText - Text following the node's colon.
 */
function isNoiseNode(role: string, hasText: boolean, text: string, trailingText: string): boolean {
  if (NOISE_ROLES.has(role)) {
    return true;
  }

  if (!hasText) {
    // Text-less wrappers and images without alt text carry nothing
    return role === 'generic' || role === 'img';
  }

  const isPlainTextRole = role === 'generic' || role === 'text';
  return isPlainTextRole && isDecoratorText(trailingText || text);
}
186  
/** One surviving snapshot line, carried through the formatting passes. */
interface Entry {
  /** Nesting level: leading-space count of the source line divided by two, rounded down. */
  depth: number;
  /** Lower-cased first word of the line ('' when the line does not start with a letter). */
  role: string;
  /** Content of the first double-quoted string on the line, or ''. */
  text: string;
  /** Text following the line's structural colon, or ''. */
  trailingText: string;
  /** Text that is emitted for this entry (annotations stripped, possibly truncated or ref-prefixed). */
  content: string;
  /** Whether `role` is one of INTERACTIVE_ROLES. */
  isInteractive: boolean;
  /** Whether `role` is one of LANDMARK_ROLES. */
  isLandmark: boolean;
}
196  
/** True for a container-role entry that has neither a label nor trailing text of its own. */
function isBareContainer(entry: Entry): boolean {
  return CONTAINER_ROLES.has(entry.role) && !entry.text && !entry.trailingText;
}

/** Pass 1: turn raw lines into entries, dropping noise nodes and ad/boilerplate/footer subtrees. */
function parseEntries(raw: string, opts: SnapshotOptions, maxTextLen: number): Entry[] {
  const entries: Entry[] = [];
  let refCounter = 0;
  // Depth of the node whose subtree is currently being skipped, or -1 when not skipping
  let skipUntilDepth = -1;

  for (const line of raw.split('\n')) {
    if (!line.trim()) continue;

    const indent = line.length - line.trimStart().length;
    const depth = Math.floor(indent / 2);

    // Inside a skipped subtree: ignore deeper lines, stop skipping at the first
    // line that is back at (or above) the depth of the skipped node.
    if (skipUntilDepth >= 0) {
      if (depth > skipUntilDepth) continue;
      skipUntilDepth = -1;
    }

    let content = line.trimStart();
    if (content.startsWith('- ')) content = content.slice(2);
    // parseLine detects outer single quotes with endsWith(), so a trailing '\r'
    // (CRLF input) or trailing spaces would defeat the unwrapping.
    content = content.trimEnd();

    if (isMetadataLine(content)) continue;
    if (opts.maxDepth !== undefined && depth > opts.maxDepth) continue;

    const { role, text, hasText, trailingText } = parseLine(content);

    // Noise nodes are dropped on their own; their children are still visited
    if (isNoiseNode(role, hasText, text, trailingText)) continue;

    // Footer roles, ads and boilerplate are dropped together with their subtree
    if (SUBTREE_NOISE_ROLES.has(role) || isAdNode(text, trailingText) || isBoilerplateNode(text)) {
      skipUntilDepth = depth;
      continue;
    }

    content = stripAnnotations(content);

    const isInteractive = INTERACTIVE_ROLES.has(role);
    const isLandmark = LANDMARK_ROLES.has(role);
    if (opts.interactive && !isInteractive && !isLandmark && !hasText) continue;

    if (opts.compact) {
      content = content.replace(/\s*\[.*?\]\s*/g, ' ').replace(/\s+/g, ' ').trim();
    }
    if (maxTextLen > 0 && content.length > maxTextLen) {
      // Do not cut between the two halves of a UTF-16 surrogate pair
      let cut = maxTextLen;
      const lastUnit = content.charCodeAt(cut - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut--;
      content = content.slice(0, cut) + '…';
    }
    if (isInteractive) {
      // Refs are numbered here, before deduplication, so the numbers that
      // reach the output are increasing but not necessarily contiguous.
      refCounter++;
      content = `[@${refCounter}] ${content}`;
    }

    entries.push({ depth, content, role, text, trailingText, isInteractive, isLandmark });
  }

  return entries;
}

/** Pass 2: drop entries that merely repeat the label of their parent or of their only link child. */
function dedupeEntries(parsed: Entry[]): Entry[] {
  const deduped: Entry[] = [];

  for (let i = 0; i < parsed.length; i++) {
    const entry = parsed[i];

    // generic/text child whose text equals the label of its parent
    if (entry.role === 'generic' || entry.role === 'text') {
      let parent: Entry | undefined;
      for (let j = deduped.length - 1; j >= 0; j--) {
        if (deduped[j].depth < entry.depth) { parent = deduped[j]; break; }
        // A kept sibling precedes this entry: no parent comparison is made
        if (deduped[j].depth === entry.depth) break;
      }
      if (parent) {
        const childText = entry.trailingText || entry.text;
        if (childText && parent.text && childText === parent.text) continue;
      }
    }

    const next = parsed[i + 1];
    const nextIsSameLabelChildLink =
      next !== undefined &&
      next.role === 'link' &&
      next.text === entry.text &&
      next.depth === entry.depth + 1;

    // heading followed by a child link with the identical label: keep the heading only
    if (entry.role === 'heading' && entry.text && nextIsSameLabelChildLink) {
      deduped.push(entry);
      i++; // skip the link, preserve its children
      continue;
    }

    // link wrapping a child link with the identical label: keep the child only
    if (entry.role === 'link' && entry.text && nextIsSameLabelChildLink) {
      continue;
    }

    deduped.push(entry);
  }

  return deduped;
}

/** Pass 3: repeatedly remove bare containers that have no entry nested beneath them. */
function pruneEmptyContainers(entries: Entry[]): Entry[] {
  let current = entries;
  let changed = true;

  // Removing a container can leave its own parent empty, so iterate to a fixed point
  while (changed) {
    changed = false;
    const kept: Entry[] = [];
    for (let i = 0; i < current.length; i++) {
      const entry = current[i];
      const following = current[i + 1];
      const hasChildren = following !== undefined && following.depth > entry.depth;
      if (isBareContainer(entry) && !hasChildren) {
        changed = true;
        continue;
      }
      kept.push(entry);
    }
    current = kept;
  }

  return current;
}

/** Pass 4: merge a bare container with its only descendant into one `container > child` line. */
function collapseSingleChildContainers(entries: Entry[]): Entry[] {
  const collapsed: Entry[] = [];

  for (let i = 0; i < entries.length; i++) {
    const entry = entries[i];
    const child = entries[i + 1];
    const afterChild = entries[i + 2];

    // Collapse only when the direct child immediately follows the container and
    // is the container's sole descendant. Filtering can leave deeper entries
    // between a container and its only direct child; merging in that situation
    // would drop those entries and emit the child twice.
    if (
      isBareContainer(entry) &&
      child !== undefined &&
      child.depth === entry.depth + 1 &&
      (afterChild === undefined || afterChild.depth <= entry.depth)
    ) {
      collapsed.push({
        ...entry,
        content: entry.content.replace(/:$/, '') + ' > ' + child.content,
        role: child.role,
        text: child.text,
        trailingText: child.trailingText,
        isInteractive: child.isInteractive,
      });
      i++; // the child has been merged into the container line
      continue;
    }

    collapsed.push(entry);
  }

  return collapsed;
}

/**
 * Convert raw aria snapshot text into a compact, cleaned-up outline.
 *
 * Runs four passes: parse and filter, deduplicate, prune empty containers,
 * collapse single-child containers. Each surviving entry is emitted on its own
 * line, indented by two spaces per depth level; interactive entries are
 * prefixed with a `[@n]` reference.
 *
 * @param raw - Snapshot text, one node per line, nested by two-space indentation.
 * @param opts - Formatting options read by this function:
 *   `maxTextLength` (line cap, default 200; a value of 0 or less disables
 *   truncation), `maxDepth` (lines deeper than this are dropped),
 *   `interactive` (drop text-less entries that are neither interactive nor
 *   landmarks) and `compact` (also remove bracketed segments that remain after
 *   annotation stripping and collapse whitespace).
 * @returns The formatted outline, or '' when `raw` is empty or not a string.
 */
export function formatSnapshot(raw: string, opts: SnapshotOptions = {}): string {
  if (!raw || typeof raw !== 'string') return '';

  const maxTextLen = opts.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH;

  const parsed = parseEntries(raw, opts, maxTextLen);
  const deduped = dedupeEntries(parsed);
  const pruned = pruneEmptyContainers(deduped);
  const collapsed = collapseSingleChildContainers(pruned);

  return collapsed.map(e => '  '.repeat(e.depth) + e.content).join('\n');
}