/ src / browser / html-tree.ts
html-tree.ts
  1  /**
  2   * Client-side HTML → structured tree serializer.
  3   *
  4   * Returned as a JS string that gets passed to `page.evaluate`. The expression
  5   * walks the DOM subtree rooted at the first selector match (or documentElement
  6   * when no selector is given) and emits a compact `{tag, attrs, text, children}`
  7   * tree for agents to consume instead of re-parsing raw HTML.
  8   *
  9   * Text handling: `text` is the concatenated text of direct text children only,
 10   * whitespace-collapsed. Nested element text is left inside `children[].text`.
 11   * Ordering between text and elements is not preserved — agents that need it
 12   * should fall back to raw HTML mode.
 13   *
 14   * Budget knobs let the caller bound the output on large pages — previously an
 15   * unscoped `get html --as json` could return a giant tree. Callers set any
 16   * combination of `depth` / `childrenMax` / `textMax`; each hit is reported in
 17   * the `truncated` envelope so agents know to narrow their selector or raise
 18   * the budget.
 19   *
 20   * Compound controls (date / time / datetime-local / month / week / select /
 21   * file) gain a `compound` field so agents inspecting the JSON tree see the
 22   * full contract — date format, full option list (up to cap) with selections
 23   * preserved for options beyond the cap, file `accept` and `multiple`. Without
 24   * this wiring agents repeatedly guess values on these controls from the raw
 25   * attributes, which is the failure mode compound.ts was built to eliminate.
 26   */
 27  
 28  import { COMPOUND_INFO_JS, type CompoundInfo } from './compound.js';
 29  
 30  export interface BuildHtmlTreeJsOptions {
 31      /** CSS selector to scope the tree; unscoped = documentElement */
 32      selector?: string | null;
 33      /** Max depth below the root (0 = root only, no children). Omit = unlimited. */
 34      depth?: number | null;
 35      /** Max element children per node before the rest get dropped. Omit = unlimited. */
 36      childrenMax?: number | null;
 37      /** Max chars of direct text per node before truncation. Omit = unlimited. */
 38      textMax?: number | null;
 39  }
 40  
 41  /**
 42   * Returns a JS expression string. When evaluated in a page context the
 43   * expression resolves to either
 44   *   `{selector, matched, tree, truncated}` on success, or
 45   *   `{selector, invalidSelector: true, reason}` when `querySelectorAll`
 46   *   throws a `SyntaxError` for an unparseable selector.
 47   *
 48   * Callers must branch on `invalidSelector` to convert it into the CLI's
 49   * `invalid_selector` structured error; otherwise the browser-level exception
 50   * would bubble out of `page.evaluate` and bypass the structured-error
 51   * contract that agents rely on.
 52   */
 53  export function buildHtmlTreeJs(opts: BuildHtmlTreeJsOptions = {}): string {
 54      const selectorLiteral = opts.selector ? JSON.stringify(opts.selector) : 'null';
 55      const depthLiteral = Number.isFinite(opts.depth as number) && (opts.depth as number) >= 0
 56          ? String(opts.depth)
 57          : 'null';
 58      const childrenMaxLiteral = Number.isFinite(opts.childrenMax as number) && (opts.childrenMax as number) >= 0
 59          ? String(opts.childrenMax)
 60          : 'null';
 61      const textMaxLiteral = Number.isFinite(opts.textMax as number) && (opts.textMax as number) >= 0
 62          ? String(opts.textMax)
 63          : 'null';
 64      return `(() => {
 65    ${COMPOUND_INFO_JS}
 66    const selector = ${selectorLiteral};
 67    const maxDepth = ${depthLiteral};
 68    const maxChildren = ${childrenMaxLiteral};
 69    const maxText = ${textMaxLiteral};
 70    let matches;
 71    if (selector) {
 72      try { matches = document.querySelectorAll(selector); }
 73      catch (e) {
 74        return { selector: selector, invalidSelector: true, reason: (e && e.message) || String(e) };
 75      }
 76    } else {
 77      matches = [document.documentElement];
 78    }
 79    const matched = matches.length;
 80    const root = matches[0] || null;
 81    const trunc = { depth: false, children_dropped: 0, text_truncated: 0 };
 82    function serialize(el, depth) {
 83      if (!el || el.nodeType !== 1) return null;
 84      const attrs = {};
 85      for (const a of el.attributes) attrs[a.name] = a.value;
 86      let text = '';
 87      for (const n of el.childNodes) {
 88        if (n.nodeType === 3) text += n.nodeValue;
 89      }
 90      text = text.replace(/\\s+/g, ' ').trim();
 91      if (maxText !== null && text.length > maxText) {
 92        text = text.slice(0, maxText);
 93        trunc.text_truncated++;
 94      }
 95      const children = [];
 96      if (maxDepth === null || depth < maxDepth) {
 97        const childEls = [];
 98        for (const n of el.childNodes) if (n.nodeType === 1) childEls.push(n);
 99        const keep = maxChildren === null ? childEls.length : Math.min(childEls.length, maxChildren);
100        for (let i = 0; i < keep; i++) {
101          const child = serialize(childEls[i], depth + 1);
102          if (child) children.push(child);
103        }
104        if (maxChildren !== null && childEls.length > maxChildren) {
105          trunc.children_dropped += childEls.length - maxChildren;
106        }
107      } else {
108        // Budget hit: we're at max depth. Count any element children we would have visited.
109        for (const n of el.childNodes) if (n.nodeType === 1) { trunc.depth = true; break; }
110      }
111      const node = { tag: el.tagName.toLowerCase(), attrs, text, children };
112      const compound = compoundInfoOf(el);
113      if (compound) node.compound = compound;
114      return node;
115    }
116    const tree = root ? serialize(root, 0) : null;
117    const truncatedOut = {};
118    if (trunc.depth) truncatedOut.depth = true;
119    if (trunc.children_dropped > 0) truncatedOut.children_dropped = trunc.children_dropped;
120    if (trunc.text_truncated > 0) truncatedOut.text_truncated = trunc.text_truncated;
121    const envelope = { selector: selector, matched: matched, tree: tree };
122    if (Object.keys(truncatedOut).length > 0) envelope.truncated = truncatedOut;
123    return envelope;
124  })()`;
125  }
126  
/**
 * One serialized element in the tree produced by the page-side serializer.
 */
export interface HtmlNode {
    /** Lower-cased element tag name. */
    tag: string;
    /** Attribute name → attribute value, as read from the element. */
    attrs: { [name: string]: string };
    /**
     * Concatenated, whitespace-collapsed text of the element's direct text
     * children only; nested element text lives in `children[].text`.
     */
    text: string;
    /** Serialized element children (possibly cut short by the budgets). */
    children: Array<HtmlNode>;
    /**
     * Rich view for date/select/file controls. Left out for non-compound
     * elements, so agents can treat `compound != null` as the signal.
     */
    compound?: CompoundInfo;
}
138  
/**
 * Report of which budgets were hit while serializing. Each field is present
 * only when the corresponding limit actually dropped or cut something.
 */
export interface HtmlTreeTruncationInfo {
    /**
     * Set when some node sitting at the depth limit had at least one element
     * child that was therefore not serialized.
     */
    depth?: true;
    /**
     * Total number of direct element children dropped because of
     * `childrenMax`, summed over every serialized node.
     */
    children_dropped?: number;
    /** Number of nodes whose `text` was shortened to `textMax`. */
    text_truncated?: number;
}
147  
/**
 * Success envelope produced by evaluating the `buildHtmlTreeJs` expression.
 *
 * This models the success shape only; the `{selector, invalidSelector, reason}`
 * shape returned for an unparseable selector is not described by this type.
 */
export interface HtmlTreeResult {
    /** The selector that was used, or `null` when the tree was unscoped. */
    selector: string | null;
    /** Number of elements the selector matched (1 when unscoped). */
    matched: number;
    /** Serialization of the first match, or `null` when nothing matched. */
    tree: HtmlNode | null;
    /** Present only when at least one budget was hit. */
    truncated?: HtmlTreeTruncationInfo;
}