Skip to content

Commit

Permalink
Annotations for Scrollable elements in a11y-dom hybrid (#463)
Browse files Browse the repository at this point in the history
* include backendDOMNodeId

* skip ax nodeId if negative

* replace role with dom tag name if none or generic

* add xpath to AXNode type

* revert unnecessary changed lines

* revert more unnecessary changed lines

* changeset

* add getScrollableElementXpaths & expose it on the window

* call browser-side scrollable elems fn, inject into observe output

* changeset

* speedup

* prettier

* prune before updating roles

* take xpath out of AXnode type

* find scrollable elems

---------

Co-authored-by: Miguel <[email protected]>
  • Loading branch information
seanmcguire12 and miguelg719 authored Feb 7, 2025
1 parent f72123d commit e40bf6f
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 34 deletions.
5 changes: 5 additions & 0 deletions .changeset/early-tables-type.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": patch
---

include 'Scrollable' annotations in a11y-dom hybrid
98 changes: 88 additions & 10 deletions lib/a11y/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,9 @@ export async function getAccessibilityTree(
await page.enableCDP("Accessibility");

try {
// Identify which elements are scrollable and get their backendNodeIds
const scrollableBackendIds = await findScrollableElementIds(page);

// Fetch the full accessibility tree from Chrome DevTools Protocol
const { nodes } = await page.sendCDP<{ nodes: AXNode[] }>(
"Accessibility.getFullAXTree",
Expand All @@ -249,16 +252,28 @@ export async function getAccessibilityTree(

// Transform into hierarchical structure
const hierarchicalTree = await buildHierarchicalTree(
nodes.map((node) => ({
role: node.role?.value,
name: node.name?.value,
description: node.description?.value,
value: node.value?.value,
nodeId: node.nodeId,
backendDOMNodeId: node.backendDOMNodeId,
parentId: node.parentId,
childIds: node.childIds,
})),
nodes.map((node) => {
let roleValue = node.role?.value || "";

if (scrollableBackendIds.has(node.backendDOMNodeId)) {
if (roleValue === "generic" || roleValue === "none") {
roleValue = "scrollable";
} else {
roleValue = roleValue ? `scrollable, ${roleValue}` : "scrollable";
}
}

return {
role: roleValue,
name: node.name?.value,
description: node.description?.value,
value: node.value?.value,
nodeId: node.nodeId,
backendDOMNodeId: node.backendDOMNodeId,
parentId: node.parentId,
childIds: node.childIds,
};
}),
page,
logger,
);
Expand Down Expand Up @@ -360,6 +375,69 @@ export async function getXPathByResolvedObjectId(
return result.value || "";
}

/**
 * `findScrollableElementIds` identifies elements in the browser that are
 * deemed "scrollable" and returns their CDP `backendNodeId`s.
 *
 * At a high level, it does the following:
 * - Calls the browser-side `window.getScrollableElementXpaths()` function,
 *   which returns a list of XPaths for scrollable containers.
 * - Iterates over the returned XPaths (empty entries are skipped) and, for
 *   each one, calls `Runtime.evaluate` to run `document.evaluate(...)`,
 *   obtaining a remote object reference to the first match if one exists.
 * - Skips remote objects that are reported as something other than a DOM
 *   node, because `DOM.describeNode` can only describe nodes.
 * - For each remaining reference, calls `DOM.describeNode` to retrieve the
 *   element's `backendNodeId` and collects it in a Set.
 * - Releases every remote object handle it obtained, so repeated calls do not
 *   accumulate handles in the page.
 *
 * @param stagehandPage - A StagehandPage instance with built-in CDP helpers.
 * @returns A Promise that resolves to a Set of unique `backendNodeId`s
 * corresponding to scrollable elements in the DOM.
 */
export async function findScrollableElementIds(
  stagehandPage: StagehandPage,
): Promise<Set<number>> {
  // get the xpaths of the scrollable elements
  const xpaths = await stagehandPage.page.evaluate(() => {
    return window.getScrollableElementXpaths();
  });

  const scrollableBackendIds = new Set<number>();

  for (const xpath of xpaths) {
    if (!xpath) continue;

    // evaluate the XPath in the stagehandPage; JSON.stringify safely embeds
    // the XPath as a string literal inside the evaluated expression
    const { result } = await stagehandPage.sendCDP<{
      result?: { objectId?: string; subtype?: string };
    }>("Runtime.evaluate", {
      expression: `
        (function() {
          const res = document.evaluate(${JSON.stringify(
            xpath,
          )}, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
          return res.singleNodeValue;
        })();
      `,
      returnByValue: false,
    });

    // no objectId means the XPath matched nothing (null has no handle)
    const objectId = result?.objectId;
    if (!objectId) continue;

    try {
      // A handle that is explicitly reported as a non-node (for example the
      // error object produced when the XPath evaluation throws) cannot be
      // described as a DOM node, so skip it instead of failing the whole scan.
      const subtype = result?.subtype;
      if (subtype !== undefined && subtype !== "node") continue;

      // call DOM.describeNode to get the backendNodeId
      const { node } = await stagehandPage.sendCDP<{
        node?: { backendNodeId?: number };
      }>("DOM.describeNode", {
        objectId,
      });

      if (node?.backendNodeId) {
        scrollableBackendIds.add(node.backendNodeId);
      }
    } finally {
      // The backendNodeId stays valid without the handle, so release it.
      // Cleanup is best-effort: a failed release must not mask the outcome
      // of the lookup itself.
      await stagehandPage
        .sendCDP<Record<string, unknown>>("Runtime.releaseObject", {
          objectId,
        })
        .catch(() => undefined);
    }
  }

  return scrollableBackendIds;
}

export async function performPlaywrightMethod(
stagehandPage: Page,
logger: (logLine: LogLine) => void,
Expand Down
1 change: 1 addition & 0 deletions lib/dom/global.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,6 @@ declare global {
width: number;
height: number;
}>;
getScrollableElementXpaths: (topN?: number) => Promise<string[]>;
}
}
74 changes: 50 additions & 24 deletions lib/dom/process.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,24 @@ export function isTextNode(node: Node): node is Text {
return node.nodeType === Node.TEXT_NODE && Boolean(node.textContent?.trim());
}

function getMainScrollableElement(): HTMLElement {
/**
* Finds and returns a list of scrollable elements on the page,
* ordered from the element with the largest scrollHeight to the smallest.
*
* @param topN Optional maximum number of scrollable elements to return.
* If not provided, all found scrollable elements are returned.
* @returns An array of HTMLElements sorted by descending scrollHeight.
*/
export function getScrollableElements(topN?: number): HTMLElement[] {
// Get the root <html> element
const docEl = document.documentElement;
let mainScrollable: HTMLElement = docEl;

// 1) Compute how “scrollable” the root <html> is
// i.e. total scrollHeight - visible clientHeight
const rootScrollDiff = docEl.scrollHeight - docEl.clientHeight;

// Keep track of the “largest” scroll diff found so far.
let maxScrollDiff = rootScrollDiff;
// 1) Initialize an array to hold all scrollable elements.
// Always include the root <html> element as a fallback.
const scrollableElements: HTMLElement[] = [docEl];

// 2) Scan all elements to find if any <div> has a larger scrollable diff
// 2) Scan all elements to find potential scrollable containers.
// A candidate must have a scrollable overflow style and extra scrollable content.
const allElements = document.querySelectorAll<HTMLElement>("*");
for (const elem of allElements) {
const style = window.getComputedStyle(elem);
Expand All @@ -33,25 +39,44 @@ function getMainScrollableElement(): HTMLElement {

if (isPotentiallyScrollable) {
const candidateScrollDiff = elem.scrollHeight - elem.clientHeight;
// Only pick this <div> if it has strictly more vertical “scrollable distance” than our current best
if (candidateScrollDiff > maxScrollDiff) {
maxScrollDiff = candidateScrollDiff;
mainScrollable = elem;
// Only consider this element if it actually has extra scrollable content
// and it can truly scroll.
if (candidateScrollDiff > 0 && canElementScroll(elem)) {
scrollableElements.push(elem);
}
}
}

// 3) Verify the chosen element truly scrolls
if (mainScrollable !== docEl) {
if (!canElementScroll(mainScrollable)) {
console.log(
"Stagehand (Browser Process): Unable to scroll candidate. Fallback to <html>.",
);
mainScrollable = docEl;
}
// 3) Sort the scrollable elements from largest scrollHeight to smallest.
scrollableElements.sort((a, b) => b.scrollHeight - a.scrollHeight);

// 4) If a topN limit is specified, return only the first topN elements.
if (topN !== undefined) {
return scrollableElements.slice(0, topN);
}

return mainScrollable;
// Return all found scrollable elements if no limit is provided.
return scrollableElements;
}

/**
 * Resolves one XPath per scrollable element on the page.
 *
 * The elements come from `getScrollableElements(topN)`; for each of them
 * `generateXPaths` is awaited in turn and its first entry is kept. When no
 * first entry is available, an empty string is stored in that position, so
 * the output always has one entry per scrollable element.
 *
 * @param topN (optional) integer limit on how many scrollable elements to process
 * @returns string[] list of XPaths (1 for each scrollable element)
 */
export async function getScrollableElementXpaths(
  topN?: number,
): Promise<string[]> {
  const collected: string[] = [];

  // Elements are processed one at a time, in the order they were returned.
  for (const element of getScrollableElements(topN)) {
    const candidates = await generateXPaths(element);
    collected.push(candidates?.[0] || "");
  }

  return collected;
}

export async function processDom(chunksSeen: Array<number>) {
Expand Down Expand Up @@ -80,7 +105,8 @@ export async function processDom(chunksSeen: Array<number>) {
export async function processAllOfDom() {
console.log("Stagehand (Browser Process): Processing all of DOM");

const mainScrollable = getMainScrollableElement();
const mainScrollableElements = getScrollableElements(1);
const mainScrollable = mainScrollableElements[0];

const container =
mainScrollable === document.documentElement
Expand Down Expand Up @@ -481,7 +507,7 @@ window.restoreDOM = restoreDOM;
window.createTextBoundingBoxes = createTextBoundingBoxes;
window.getElementBoundingBoxes = getElementBoundingBoxes;
window.createStagehandContainer = createStagehandContainer;

window.getScrollableElementXpaths = getScrollableElementXpaths;
const leafElementDenyList = ["SVG", "IFRAME", "SCRIPT", "STYLE", "LINK"];

const interactiveElementTypes = [
Expand Down

0 comments on commit e40bf6f

Please sign in to comment.