From e40bf6f517331fc9952c3c9f2683b7e02ffb9735 Mon Sep 17 00:00:00 2001 From: Sean McGuire <75873287+seanmcguire12@users.noreply.github.com> Date: Fri, 7 Feb 2025 10:30:44 -0800 Subject: [PATCH] Annotations for Scrollable elements in a11y-dom hybrid (#463) * include backendDOMNodeId * skip ax nodeId if negative * replace role with dom tag name if none or generic * add xpath to AXNode type * revert unnecessary changed lines * revert more unnecessary changed lines * changeset * add getScrollableElementXpaths & expose it on the window * call browser-side scrollable elems fn, inject into observe output * changeset * speedup * prettier * prune before updating roles * take xpath out of AXnode type * find scrollable elems --------- Co-authored-by: Miguel --- .changeset/early-tables-type.md | 5 ++ lib/a11y/utils.ts | 98 +++++++++++++++++++++++++++++---- lib/dom/global.d.ts | 1 + lib/dom/process.ts | 74 +++++++++++++++++-------- 4 files changed, 144 insertions(+), 34 deletions(-) create mode 100644 .changeset/early-tables-type.md diff --git a/.changeset/early-tables-type.md b/.changeset/early-tables-type.md new file mode 100644 index 00000000..33cb5f3f --- /dev/null +++ b/.changeset/early-tables-type.md @@ -0,0 +1,5 @@ +--- +"@browserbasehq/stagehand": patch +--- + +include 'Scrollable' annotations in a11y-dom hybrid diff --git a/lib/a11y/utils.ts b/lib/a11y/utils.ts index a8345cb4..0558e8f8 100644 --- a/lib/a11y/utils.ts +++ b/lib/a11y/utils.ts @@ -241,6 +241,9 @@ export async function getAccessibilityTree( await page.enableCDP("Accessibility"); try { + // Identify which elements are scrollable and get their backendNodeIds + const scrollableBackendIds = await findScrollableElementIds(page); + // Fetch the full accessibility tree from Chrome DevTools Protocol const { nodes } = await page.sendCDP<{ nodes: AXNode[] }>( "Accessibility.getFullAXTree", @@ -249,16 +252,28 @@ export async function getAccessibilityTree( // Transform into hierarchical structure const hierarchicalTree 
= await buildHierarchicalTree( - nodes.map((node) => ({ - role: node.role?.value, - name: node.name?.value, - description: node.description?.value, - value: node.value?.value, - nodeId: node.nodeId, - backendDOMNodeId: node.backendDOMNodeId, - parentId: node.parentId, - childIds: node.childIds, - })), + nodes.map((node) => { + let roleValue = node.role?.value || ""; + + if (scrollableBackendIds.has(node.backendDOMNodeId)) { + if (roleValue === "generic" || roleValue === "none") { + roleValue = "scrollable"; + } else { + roleValue = roleValue ? `scrollable, ${roleValue}` : "scrollable"; + } + } + + return { + role: roleValue, + name: node.name?.value, + description: node.description?.value, + value: node.value?.value, + nodeId: node.nodeId, + backendDOMNodeId: node.backendDOMNodeId, + parentId: node.parentId, + childIds: node.childIds, + }; + }), page, logger, ); @@ -360,6 +375,69 @@ export async function getXPathByResolvedObjectId( return result.value || ""; } +/** + * `findScrollableElementIds` is a function that identifies elements in + * the browser that are deemed "scrollable". At a high level, it does the + * following: + * - Calls the browser-side `window.getScrollableElementXpaths()` function, + * which returns a list of XPaths for scrollable containers. + * - Iterates over the returned list of XPaths, locating each element in the DOM + * using `stagehandPage.sendCDP(...)` + * - During each iteration, we call `Runtime.evaluate` to run `document.evaluate(...)` + * with each XPath, obtaining a `RemoteObject` reference if it exists. + * - Then, for each valid object reference, we call `DOM.describeNode` to retrieve + * the element’s `backendNodeId`. + * - Collects all resulting `backendNodeId`s in a Set and returns them. + * + * @param stagehandPage - A StagehandPage instance with built-in CDP helpers. + * @returns A Promise that resolves to a Set of unique `backendNodeId`s corresponding + * to scrollable elements in the DOM. 
+ */ +export async function findScrollableElementIds( + stagehandPage: StagehandPage, +): Promise<Set<number>> { + // get the xpaths of the scrollable elements + const xpaths = await stagehandPage.page.evaluate(() => { + return window.getScrollableElementXpaths(); + }); + + const scrollableBackendIds = new Set<number>(); + + for (const xpath of xpaths) { + if (!xpath) continue; + + // evaluate the XPath in the stagehandPage + const { result } = await stagehandPage.sendCDP<{ + result?: { objectId?: string }; + }>("Runtime.evaluate", { + expression: ` + (function() { + const res = document.evaluate(${JSON.stringify( + xpath, + )}, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null); + return res.singleNodeValue; + })(); + `, + returnByValue: false, + }); + + // if we have an objectId, call DOM.describeNode to get backendNodeId + if (result?.objectId) { + const { node } = await stagehandPage.sendCDP<{ + node?: { backendNodeId?: number }; + }>("DOM.describeNode", { + objectId: result.objectId, + }); + + if (node?.backendNodeId) { + scrollableBackendIds.add(node.backendNodeId); + } + } + } + + return scrollableBackendIds; +} + export async function performPlaywrightMethod( stagehandPage: Page, logger: (logLine: LogLine) => void, diff --git a/lib/dom/global.d.ts b/lib/dom/global.d.ts index ee0696b9..318e4482 100644 --- a/lib/dom/global.d.ts +++ b/lib/dom/global.d.ts @@ -36,5 +36,6 @@ declare global { width: number; height: number; }>; + getScrollableElementXpaths: (topN?: number) => Promise<string[]>; } } diff --git a/lib/dom/process.ts b/lib/dom/process.ts index 338b51de..9a0a4d4b 100644 --- a/lib/dom/process.ts +++ b/lib/dom/process.ts @@ -11,18 +11,24 @@ export function isTextNode(node: Node): node is Text { return node.nodeType === Node.TEXT_NODE && Boolean(node.textContent?.trim()); } -function getMainScrollableElement(): HTMLElement { +/** + * Finds and returns a list of scrollable elements on the page, + * ordered from the element with the largest scrollHeight to the smallest. 
+ * + * @param topN Optional maximum number of scrollable elements to return. + * If not provided, all found scrollable elements are returned. + * @returns An array of HTMLElements sorted by descending scrollHeight. + */ +export function getScrollableElements(topN?: number): HTMLElement[] { + // Get the root element const docEl = document.documentElement; - let mainScrollable: HTMLElement = docEl; - - // 1) Compute how “scrollable” the root is - // i.e. total scrollHeight - visible clientHeight - const rootScrollDiff = docEl.scrollHeight - docEl.clientHeight; - // Keep track of the “largest” scroll diff found so far. - let maxScrollDiff = rootScrollDiff; + // 1) Initialize an array to hold all scrollable elements. + // Always include the root element as a fallback. + const scrollableElements: HTMLElement[] = [docEl]; - // 2) Scan all elements to find if any
<div> has a larger scrollable diff + // 2) Scan all elements to find potential scrollable containers. + // A candidate must have a scrollable overflow style and extra scrollable content. const allElements = document.querySelectorAll<HTMLElement>("*"); for (const elem of allElements) { const style = window.getComputedStyle(elem); @@ -33,25 +39,44 @@ function getMainScrollableElement(): HTMLElement { if (isPotentiallyScrollable) { const candidateScrollDiff = elem.scrollHeight - elem.clientHeight; - // Only pick this
<div> if it has strictly more vertical “scrollable distance” than our current best - if (candidateScrollDiff > maxScrollDiff) { - maxScrollDiff = candidateScrollDiff; - mainScrollable = elem; + // Only consider this element if it actually has extra scrollable content + // and it can truly scroll. + if (candidateScrollDiff > 0 && canElementScroll(elem)) { + scrollableElements.push(elem); } } } - // 3) Verify the chosen element truly scrolls - if (mainScrollable !== docEl) { - if (!canElementScroll(mainScrollable)) { - console.log( - "Stagehand (Browser Process): Unable to scroll candidate. Fallback to <html>.", - ); - mainScrollable = docEl; - } + // 3) Sort the scrollable elements from largest scrollHeight to smallest. + scrollableElements.sort((a, b) => b.scrollHeight - a.scrollHeight); + + // 4) If a topN limit is specified, return only the first topN elements. + if (topN !== undefined) { + return scrollableElements.slice(0, topN); } - return mainScrollable; + // Return all found scrollable elements if no limit is provided. + return scrollableElements; +} + +/** + * Calls getScrollableElements, then for each element calls generateXPaths, + * and returns the first XPath for each. 
+ * + * @param topN (optional) integer limit on how many scrollable elements to process + * @returns string[] list of XPaths (1 for each scrollable element) + */ +export async function getScrollableElementXpaths( + topN?: number, +): Promise<string[]> { + const scrollableElems = getScrollableElements(topN); + const xpaths = []; + for (const elem of scrollableElems) { + const allXPaths = await generateXPaths(elem); + const firstXPath = allXPaths?.[0] || ""; + xpaths.push(firstXPath); + } + return xpaths; } export async function processDom(chunksSeen: Array<number>) { @@ -80,7 +105,8 @@ export async function processDom(chunksSeen: Array<number>) { export async function processAllOfDom() { console.log("Stagehand (Browser Process): Processing all of DOM"); - const mainScrollable = getMainScrollableElement(); + const mainScrollableElements = getScrollableElements(1); + const mainScrollable = mainScrollableElements[0]; const container = mainScrollable === document.documentElement @@ -481,7 +507,7 @@ window.restoreDOM = restoreDOM; window.createTextBoundingBoxes = createTextBoundingBoxes; window.getElementBoundingBoxes = getElementBoundingBoxes; window.createStagehandContainer = createStagehandContainer; - +window.getScrollableElementXpaths = getScrollableElementXpaths; const leafElementDenyList = ["SVG", "IFRAME", "SCRIPT", "STYLE", "LINK"]; const interactiveElementTypes = [