html-tree.ts
/**
 * Client-side HTML → structured tree serializer.
 *
 * Returned as a JS string that gets passed to `page.evaluate`. The expression
 * walks the DOM subtree rooted at the first selector match (or documentElement
 * when no selector is given) and emits a compact `{tag, attrs, text, children}`
 * tree for agents to consume instead of re-parsing raw HTML.
 *
 * Text handling: `text` is the concatenated text of direct text children only,
 * whitespace-collapsed. Nested element text is left inside `children[].text`.
 * Ordering between text and elements is not preserved — agents that need it
 * should fall back to raw HTML mode.
 *
 * Budget knobs let the caller bound the output on large pages — previously an
 * unscoped `get html --as json` could return a giant tree. Callers set any
 * combination of `depth` / `childrenMax` / `textMax`; each hit is reported in
 * the `truncated` envelope so agents know to narrow their selector or raise
 * the budget. Budgets are whole numbers: fractional values are rounded down
 * before being embedded in the expression.
 *
 * Compound controls (date / time / datetime-local / month / week / select /
 * file) gain a `compound` field so agents inspecting the JSON tree see the
 * full contract — date format, full option list (up to cap) with selections
 * preserved for options beyond the cap, file `accept` and `multiple`. Without
 * this wiring agents repeatedly guess values on these controls from the raw
 * attributes, which is the failure mode compound.ts was built to eliminate.
 */

import { COMPOUND_INFO_JS, type CompoundInfo } from './compound.js';

export interface BuildHtmlTreeJsOptions {
  /** CSS selector to scope the tree; unscoped (omitted, null or empty) = documentElement. */
  selector?: string | null;
  /**
   * Max depth below the root (0 = root only, no children). Omit = unlimited.
   * Fractional values are rounded down; negative or non-finite values mean unlimited.
   */
  depth?: number | null;
  /**
   * Max element children per node before the rest get dropped. Omit = unlimited.
   * Fractional values are rounded down; negative or non-finite values mean unlimited.
   */
  childrenMax?: number | null;
  /**
   * Max chars of direct text per node before truncation. Omit = unlimited.
   * Fractional values are rounded down; negative or non-finite values mean unlimited.
   */
  textMax?: number | null;
}

/**
 * Renders one budget option as a JS literal for embedding in the page expression.
 *
 * Anything that is not a finite, non-negative number becomes the literal
 * `null`, which the in-page code treats as "no limit". Valid values are
 * floored so the in-page loop bounds and the `children_dropped` arithmetic
 * always work on integers; without this a budget such as 2.5 would keep three
 * children and report a fractional drop count.
 */
function budgetLiteral(value: number | null | undefined): string {
  if (typeof value !== 'number' || !Number.isFinite(value) || value < 0) {
    return 'null';
  }
  return String(Math.floor(value));
}

/**
 * Returns a JS expression string. When evaluated in a page context the
 * expression resolves to either
 *   `{selector, matched, tree, truncated}` on success, or
 *   `{selector, invalidSelector: true, reason}` when `querySelectorAll`
 *   throws for an unparseable selector.
 *
 * `truncated` is only present when at least one budget was hit. `tree` is
 * `null` when the selector matched nothing; otherwise it is built from the
 * first match and `matched` carries the total match count.
 *
 * Callers must branch on `invalidSelector` to convert it into the CLI's
 * `invalid_selector` structured error; otherwise the browser-level exception
 * would bubble out of `page.evaluate` and bypass the structured-error
 * contract that agents rely on.
 *
 * @param opts Selector scope and optional output budgets.
 * @returns Source text of an immediately-invoked arrow function expression.
 */
export function buildHtmlTreeJs(opts: BuildHtmlTreeJsOptions = {}): string {
  // JSON.stringify yields a safely quoted JS string literal for the selector.
  const selectorLiteral = opts.selector ? JSON.stringify(opts.selector) : 'null';
  const depthLiteral = budgetLiteral(opts.depth);
  const childrenMaxLiteral = budgetLiteral(opts.childrenMax);
  const textMaxLiteral = budgetLiteral(opts.textMax);
  return `(() => {
  ${COMPOUND_INFO_JS}
  const selector = ${selectorLiteral};
  const maxDepth = ${depthLiteral};
  const maxChildren = ${childrenMaxLiteral};
  const maxText = ${textMaxLiteral};
  let matches;
  if (selector) {
    try { matches = document.querySelectorAll(selector); }
    catch (e) {
      return { selector: selector, invalidSelector: true, reason: (e && e.message) || String(e) };
    }
  } else {
    matches = [document.documentElement];
  }
  const matched = matches.length;
  const root = matches[0] || null;
  const trunc = { depth: false, children_dropped: 0, text_truncated: 0 };
  function serialize(el, depth) {
    if (!el || el.nodeType !== 1) return null;
    const attrs = {};
    for (const a of el.attributes) attrs[a.name] = a.value;
    // Direct text nodes only; nested element text stays on the child nodes.
    let text = '';
    for (const n of el.childNodes) {
      if (n.nodeType === 3) text += n.nodeValue;
    }
    text = text.replace(/\\s+/g, ' ').trim();
    if (maxText !== null && text.length > maxText) {
      text = text.slice(0, maxText);
      trunc.text_truncated++;
    }
    const children = [];
    if (maxDepth === null || depth < maxDepth) {
      const childEls = [];
      for (const n of el.childNodes) if (n.nodeType === 1) childEls.push(n);
      const keep = maxChildren === null ? childEls.length : Math.min(childEls.length, maxChildren);
      for (let i = 0; i < keep; i++) {
        const child = serialize(childEls[i], depth + 1);
        if (child) children.push(child);
      }
      if (maxChildren !== null && childEls.length > maxChildren) {
        trunc.children_dropped += childEls.length - maxChildren;
      }
    } else {
      // Depth budget hit: flag the truncation if this node has any element child we are skipping.
      for (const n of el.childNodes) if (n.nodeType === 1) { trunc.depth = true; break; }
    }
    const node = { tag: el.tagName.toLowerCase(), attrs, text, children };
    const compound = compoundInfoOf(el);
    if (compound) node.compound = compound;
    return node;
  }
  const tree = root ? serialize(root, 0) : null;
  // Only report the budgets that were actually hit.
  const truncatedOut = {};
  if (trunc.depth) truncatedOut.depth = true;
  if (trunc.children_dropped > 0) truncatedOut.children_dropped = trunc.children_dropped;
  if (trunc.text_truncated > 0) truncatedOut.text_truncated = trunc.text_truncated;
  const envelope = { selector: selector, matched: matched, tree: tree };
  if (Object.keys(truncatedOut).length > 0) envelope.truncated = truncatedOut;
  return envelope;
})()`;
}

/** One serialized element in the tree produced by the `buildHtmlTreeJs` expression. */
export interface HtmlNode {
  /** Lower-cased tag name. */
  tag: string;
  /** Attribute name → value for every attribute on the element. */
  attrs: Record<string, string>;
  /** Whitespace-collapsed text of direct text children, possibly cut to `textMax`. */
  text: string;
  /** Serialized element children, possibly limited by `depth` / `childrenMax`. */
  children: HtmlNode[];
  /**
   * Rich view for date/select/file controls. Omitted for non-compound elements
   * so agents can rely on `compound != null` as a signal.
   */
  compound?: CompoundInfo;
}

export interface HtmlTreeTruncationInfo {
  /** At least one element child was dropped because depth budget was hit. */
  depth?: true;
  /** Count of element children dropped across the tree due to `childrenMax`. */
  children_dropped?: number;
  /** Count of nodes whose `text` was cut to `textMax`. */
  text_truncated?: number;
}

/**
 * Success envelope of the `buildHtmlTreeJs` expression.
 *
 * NOTE(review): the `{selector, invalidSelector: true, reason}` envelope that
 * the expression returns for an unparseable selector is not described by this
 * interface — callers need to check for `invalidSelector` before treating a
 * result as an `HtmlTreeResult`.
 */
export interface HtmlTreeResult {
  /** The selector that was evaluated, or `null` when unscoped. */
  selector: string | null;
  /** Number of elements the selector matched (1 when unscoped). */
  matched: number;
  /** Tree for the first match, or `null` when nothing matched. */
  tree: HtmlNode | null;
  /** Present only when at least one budget was hit. */
  truncated?: HtmlTreeTruncationInfo;
}