snapshotFormatter.ts
/**
 * Aria snapshot formatter: parses snapshot text into a clean, compact format.
 *
 * Four-pass pipeline:
 *   1. Parse & filter: strip annotations, metadata, noise, ads, boilerplate subtrees
 *   2. Deduplicate: generic/text matching parent label, heading+link, nested identical links
 *   3. Prune: empty containers (iterated until stable)
 *   4. Collapse: containers whose whole subtree is a single direct child
 */

import type { SnapshotOptions } from './types.js';

const DEFAULT_MAX_TEXT_LENGTH = 200;

// String repeated once per depth level when rendering the final output.
// NOTE(review): a single space is what the source shows; confirm this is the
// intended indent width (input depth is derived from two-space indentation).
const INDENT_UNIT = ' ';

// Roles that are pure noise and should always be filtered
const NOISE_ROLES = new Set([
  'none', 'presentation', 'separator', 'paragraph', 'tooltip', 'status',
]);

// Roles whose entire subtree should be removed (footer boilerplate, etc.)
const SUBTREE_NOISE_ROLES = new Set([
  'contentinfo',
]);

// Roles considered interactive (clickable/typeable)
const INTERACTIVE_ROLES = new Set([
  'button', 'link', 'textbox', 'checkbox', 'radio',
  'combobox', 'tab', 'menuitem', 'option', 'switch',
  'slider', 'spinbutton', 'searchbox',
]);

// Structural landmark roles kept even in interactive mode
const LANDMARK_ROLES = new Set([
  'main', 'navigation', 'banner', 'heading', 'search',
  'region', 'list', 'listitem', 'article', 'complementary',
  'group', 'toolbar', 'tablist',
]);

// Container roles eligible for pruning and collapse.
// Note: 'paragraph' is also in NOISE_ROLES, so paragraph entries are dropped
// during parsing and never reach the prune/collapse passes.
const CONTAINER_ROLES = new Set([
  'list', 'listitem', 'group', 'toolbar', 'tablist',
  'navigation', 'region', 'complementary',
  'search', 'article', 'paragraph', 'figure',
]);

// Decorator / separator text that adds no semantic value
const DECORATOR_TEXT = new Set(['•', '·', '|', '—', '-', '/', '\\']);

// Ad-related URL patterns
const AD_URL_PATTERNS = [
  'googleadservices.com/pagead/',
  'alb.reddit.com/cr?',
  'doubleclick.net/',
  'cm.bilibili.com/cm/api/fees/',
];

// Boilerplate button labels to filter (back-to-top, etc.)
const BOILERPLATE_LABELS = [
  '回到顶部', 'back to top', 'scroll to top', 'go to top',
];

/**
 * Remove the outer single quotes the snapshot puts around lines that contain
 * special characters. A wrapped line ending in `':` keeps its trailing colon.
 */
function unwrapOuterQuotes(raw: string): string {
  if (!raw.startsWith("'")) return raw;
  if (raw.endsWith("':")) return raw.slice(1, -2) + ':';
  if (raw.endsWith("'")) return raw.slice(1, -1);
  return raw;
}

/**
 * Find the index of the first colon that is outside double-quoted strings and
 * outside `[...]` annotations, i.e. the colon separating the node header from
 * its trailing text. Returns -1 when there is no such colon.
 */
function findSeparatorColon(line: string): number {
  let inQuotes = false;
  let bracketDepth = 0;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (ch === '"') {
      inQuotes = !inQuotes;
      continue;
    }
    if (inQuotes) continue;
    if (ch === '[') {
      bracketDepth++;
    } else if (ch === ']') {
      if (bracketDepth > 0) bracketDepth--;
    } else if (ch === ':' && bracketDepth === 0) {
      return i;
    }
  }
  return -1;
}

/**
 * Parse role and text from a trimmed snapshot line.
 *
 * @param trimmed - Line content without indentation or the leading `- `.
 * @returns `role` (lower-cased first word, `''` if none), `text` (contents of
 *   the first double-quoted string, `''` if none), `trailingText` (everything
 *   after the header/value separator colon, trimmed; `''` when nothing but
 *   annotations follows it) and `hasText` (true when either is non-empty).
 */
function parseLine(trimmed: string): { role: string; text: string; hasText: boolean; trailingText: string } {
  const line = unwrapOuterQuotes(trimmed);

  // Role is the first word
  const roleMatch = /^([a-zA-Z]+)\b/.exec(line);
  const role = roleMatch ? roleMatch[1].toLowerCase() : '';

  // Extract quoted text content (the semantic label)
  const textMatch = /"([^"]*)"/.exec(line);
  const text = textMatch ? textMatch[1] : '';

  // Trailing text is sliced from the ORIGINAL line at the separator colon, so
  // colons inside quoted labels ("Account: user@email.com") or inside the
  // value itself ("Time: 10:30") cannot shift the cut position.
  let trailingText = '';
  const separatorIdx = findSeparatorColon(line);
  if (separatorIdx !== -1) {
    const candidate = line.slice(separatorIdx + 1);
    // Ignore a tail that consists only of bracket annotations.
    const meaningful = candidate
      .replace(/"[^"]*"/g, '""')
      .replace(/\[[^\]]*\]/g, '')
      .trim();
    if (meaningful.length > 0) trailingText = candidate.trim();
  }

  return { role, text, hasText: text.length > 0 || trailingText.length > 0, trailingText };
}

/**
 * Strip ALL bracket annotations from a content line, preserving the contents
 * of double-quoted strings. Outer single quotes are unwrapped first, and runs
 * of whitespace left behind are collapsed to one space.
 */
function stripAnnotations(content: string): string {
  // Odd-indexed segments are inside double quotes and must stay untouched.
  const segments = unwrapOuterQuotes(content).split('"');
  for (let i = 0; i < segments.length; i += 2) {
    segments[i] = segments[i].replace(/\s*\[[^\]]*\]/g, '');
  }
  return segments.join('"').replace(/\s{2,}/g, ' ').trim();
}

/**
 * Check if a line is a metadata-only line (like /url: ...).
 */
function isMetadataLine(trimmed: string): boolean {
  return /^\/[a-zA-Z]+:/.test(trimmed);
}

/**
 * Check if text content is purely decorative (separators, dots, etc.)
 */
function isDecoratorText(text: string): boolean {
  return DECORATOR_TEXT.has(text.trim());
}

/**
 * Check if a node is ad-related based on its label and trailing text:
 * either an ad keyword (case-insensitive) or a known ad-tracking URL fragment.
 */
function isAdNode(text: string, trailingText: string): boolean {
  const t = (text + ' ' + trailingText).toLowerCase();
  if (t.includes('sponsored') || t.includes('advertisement')) return true;
  if (t.includes('广告')) return true;
  return AD_URL_PATTERNS.some(pattern => text.includes(pattern) || trailingText.includes(pattern));
}

/**
 * Check if a node is boilerplate UI (back-to-top, etc.)
 */
function isBoilerplateNode(text: string): boolean {
  const t = text.toLowerCase();
  return BOILERPLATE_LABELS.some(label => t.includes(label));
}

/**
 * Check if a node is noise that should be filtered (only the node itself is
 * dropped; its children are still processed).
 */
function isNoiseNode(role: string, hasText: boolean, text: string, trailingText: string): boolean {
  if (NOISE_ROLES.has(role)) return true;
  // generic without text is a wrapper
  if (role === 'generic' && !hasText) return true;
  // img without alt text is noise
  if (role === 'img' && !hasText) return true;
  // Decorator-only text nodes
  if ((role === 'generic' || role === 'text') && hasText) {
    if (isDecoratorText(trailingText || text)) return true;
  }
  return false;
}

/** One surviving snapshot line together with the facts later passes need. */
interface Entry {
  depth: number;
  content: string;
  role: string;
  text: string;
  trailingText: string;
  isInteractive: boolean;
  isLandmark: boolean;
}

/** True for a container-role entry that carries neither a label nor trailing text. */
function isEmptyLabelContainer(entry: Entry): boolean {
  return CONTAINER_ROLES.has(entry.role) && !entry.text && !entry.trailingText;
}

/**
 * Cut `value` to at most `maxLen` UTF-16 code units and append an ellipsis.
 * A non-positive `maxLen` disables truncation. The cut never lands between the
 * two halves of a surrogate pair.
 */
function truncateContent(value: string, maxLen: number): string {
  if (maxLen <= 0 || value.length <= maxLen) return value;
  let end = maxLen;
  const lastUnit = value.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end--; // don't keep a lone high surrogate
  return value.slice(0, end) + '…';
}

/** Pass 1: parse lines, drop noise/metadata/ad/boilerplate, and number interactive entries. */
function parseEntries(lines: readonly string[], opts: SnapshotOptions, maxTextLen: number): Entry[] {
  const entries: Entry[] = [];
  let refCounter = 0;
  // When >= 0, every deeper line belongs to a subtree that is being skipped.
  let skipUntilDepth = -1;

  for (const line of lines) {
    if (!line.trim()) continue;

    const indent = line.length - line.trimStart().length;
    const depth = Math.floor(indent / 2);

    // Subtree skip zone (noise roles, ads, boilerplate)
    if (skipUntilDepth >= 0) {
      if (depth > skipUntilDepth) continue;
      skipUntilDepth = -1;
    }

    let content = line.trimStart();
    if (content.startsWith('- ')) content = content.slice(2);
    if (isMetadataLine(content)) continue;
    if (opts.maxDepth !== undefined && depth > opts.maxDepth) continue;

    const { role, text, hasText, trailingText } = parseLine(content);

    if (isNoiseNode(role, hasText, text, trailingText)) continue;

    // Subtree noise roles (contentinfo footer, etc.), ads and boilerplate:
    // drop the node and everything nested under it.
    if (SUBTREE_NOISE_ROLES.has(role) || isAdNode(text, trailingText) || isBoilerplateNode(text)) {
      skipUntilDepth = depth;
      continue;
    }

    content = stripAnnotations(content);

    const isInteractive = INTERACTIVE_ROLES.has(role);
    const isLandmark = LANDMARK_ROLES.has(role);
    if (opts.interactive && !isInteractive && !isLandmark && !hasText) continue;

    if (opts.compact) {
      content = content.replace(/\s*\[.*?\]\s*/g, ' ').replace(/\s+/g, ' ').trim();
    }
    content = truncateContent(content, maxTextLen);
    if (isInteractive) {
      refCounter++;
      content = `[@${refCounter}] ${content}`;
    }

    entries.push({ depth, content, role, text, trailingText, isInteractive, isLandmark });
  }

  return entries;
}

/** Pass 2: remove entries that merely repeat the label of an adjacent parent/child. */
function dedupeEntries(parsed: readonly Entry[]): Entry[] {
  const deduped: Entry[] = [];

  for (let i = 0; i < parsed.length; i++) {
    const entry = parsed[i];
    const next = parsed[i + 1];
    const nextIsSameLabelChildLink =
      next !== undefined &&
      next.role === 'link' &&
      next.text === entry.text &&
      next.depth === entry.depth + 1;

    // Dedup: generic/text child matching parent label.
    // The backwards search stops at the first already-kept sibling, so only a
    // node with no kept sibling before it is compared against its parent.
    if (entry.role === 'generic' || entry.role === 'text') {
      let parent: Entry | undefined;
      for (let j = deduped.length - 1; j >= 0; j--) {
        if (deduped[j].depth < entry.depth) { parent = deduped[j]; break; }
        if (deduped[j].depth === entry.depth) break;
      }
      if (parent) {
        const childText = entry.trailingText || entry.text;
        if (childText && parent.text && childText === parent.text) continue;
      }
    }

    // Dedup: heading + child link with identical label
    if (entry.role === 'heading' && entry.text && nextIsSameLabelChildLink) {
      deduped.push(entry);
      i++; // skip the link, preserve its children
      continue;
    }

    // Dedup: nested identical links (skip parent, keep child)
    if (entry.role === 'link' && entry.text && nextIsSameLabelChildLink) continue;

    deduped.push(entry);
  }

  return deduped;
}

/** Pass 3: repeatedly drop unlabeled containers with no descendants until nothing changes. */
function pruneEmptyContainers(entries: Entry[]): Entry[] {
  let current = entries;
  let changed = true;

  while (changed) {
    changed = false;
    const kept: Entry[] = [];
    for (let i = 0; i < current.length; i++) {
      const entry = current[i];
      if (isEmptyLabelContainer(entry)) {
        // Entries are in document order, so a descendant, if any, comes right after.
        const following = current[i + 1];
        const hasChildren = following !== undefined && following.depth > entry.depth;
        if (!hasChildren) { changed = true; continue; }
      }
      kept.push(entry);
    }
    current = kept;
  }

  return current;
}

/**
 * Pass 4: merge an unlabeled container with its only child into one line
 * (`container > child`), rendered at the container's depth.
 *
 * A container is collapsed only when the entry immediately after it is a
 * direct child (depth + 1) AND that child is the container's entire subtree.
 * Requiring adjacency matters because earlier filtering can leave deeper
 * "orphan" entries between a container and its child; skipping by position
 * in that situation would drop the orphan and emit the child twice.
 */
function collapseSingleChildContainers(entries: readonly Entry[]): Entry[] {
  const collapsed: Entry[] = [];

  for (let i = 0; i < entries.length; i++) {
    const entry = entries[i];
    const child = entries[i + 1];

    if (isEmptyLabelContainer(entry) && child !== undefined && child.depth === entry.depth + 1) {
      const afterChild = entries[i + 2];
      const childIsWholeSubtree = afterChild === undefined || afterChild.depth <= entry.depth;

      if (childIsWholeSubtree) {
        collapsed.push({
          ...entry,
          content: entry.content.replace(/:$/, '') + ' > ' + child.content,
          role: child.role,
          text: child.text,
          trailingText: child.trailingText,
          isInteractive: child.isInteractive,
        });
        i++; // the child has been merged into this line
        continue;
      }
    }

    collapsed.push(entry);
  }

  return collapsed;
}

/**
 * Convert raw aria snapshot text into a cleaned-up outline.
 *
 * @param raw - Snapshot text, one node per line, nesting expressed by
 *   two-space indentation. LF and CRLF line endings are both accepted.
 * @param opts - Optional settings read by this function: `maxTextLength`
 *   (per-line truncation, default 200, non-positive disables), `maxDepth`
 *   (drop deeper lines), `interactive` (drop unlabeled nodes that are neither
 *   interactive nor landmarks) and `compact` (strip remaining bracketed
 *   segments and collapse whitespace).
 * @returns The formatted lines joined with `\n`; `''` for empty or non-string
 *   input. Interactive entries are prefixed with `[@n]`, numbered in parse
 *   order before deduplication, so numbers may have gaps.
 */
export function formatSnapshot(raw: string, opts: SnapshotOptions = {}): string {
  if (!raw || typeof raw !== 'string') return '';

  const maxTextLen = opts.maxTextLength ?? DEFAULT_MAX_TEXT_LENGTH;

  const parsed = parseEntries(raw.split(/\r?\n/), opts, maxTextLen);
  const deduped = dedupeEntries(parsed);
  const pruned = pruneEmptyContainers(deduped);
  const collapsed = collapseSingleChildContainers(pruned);

  return collapsed.map(e => INDENT_UNIT.repeat(e.depth) + e.content).join('\n');
}