Skip to content

Commit

Permalink
browser: fix inconsistent automation errors (#799)
Browse files Browse the repository at this point in the history
- The extension relied on `chrome.windows.getCurrent()` to get the
current window in view. However, when the DevTools view is undocked, it
is treated as the active window. This leads to errors where the service
worker tries to send events to the DevTools view instead of the content
window.
- There was an error where some code paths tried to access
`chrome.storage` APIs in the content script, causing the script to crash.
  • Loading branch information
hillary-mutisya authored Mar 7, 2025
1 parent 071aee4 commit 9c1dbb4
Show file tree
Hide file tree
Showing 9 changed files with 260 additions and 126 deletions.
7 changes: 6 additions & 1 deletion ts/packages/agents/browser/src/agent/browserConnector.mts
Original file line number Diff line number Diff line change
Expand Up @@ -236,14 +236,19 @@ export class BrowserConnector {
return this.sendActionToBrowser(clickAction);
}

async enterTextIn(textValue: string, cssSelector?: string) {
async enterTextIn(
textValue: string,
cssSelector?: string,
submitForm?: boolean,
) {
let actionName = cssSelector ? "enterTextInElement" : "enterTextOnPage";

const textAction = {
actionName: actionName,
parameters: {
value: textValue,
cssSelector: cssSelector,
submitForm: submitForm,
},
};

Expand Down
17 changes: 11 additions & 6 deletions ts/packages/agents/browser/src/agent/discovery/actionHandler.mts
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,16 @@ export async function handleSchemaDiscoveryAction(

console.timeEnd(timerName);

const selected = response.data as UserActionsList;
const uniqueItems = new Map(
selected.actions.map((action) => [action.actionName, action]),
);

message =
"Possible user actions: \n" +
JSON.stringify(response.data, null, 2);
JSON.stringify(Array.from(uniqueItems.values()), null, 2);

const selected = response.data as UserActionsList;
const actionNames = [
...new Set(selected.actions.map((action) => action.actionName)),
];
const actionNames = [...new Set(uniqueItems.keys())];

const { schema, typeDefinitions } = await getDynamicSchema(actionNames);
message += `\n =========== \n Discovered actions schema: \n ${schema} `;
Expand Down Expand Up @@ -144,7 +146,10 @@ export async function handleSchemaDiscoveryAction(
}, 500);
}

return { schema: response.data, typeDefinitions: typeDefinitions };
return {
schema: Array.from(uniqueItems.values()),
typeDefinitions: typeDefinitions,
};
}

async function handleRegisterSiteSchema(action: any) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,12 @@ export type Button = {
cssSelector: string;
};

// A page element described by its title and a CSS selector
export type Element = {
title: string;
// CSS Selector for the element
cssSelector: string;
};

export type Link = {
title: string;
// CSS Selector for the link
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,16 @@ export type EnterText = {
};
};

// This is used on pages where the user can type anywhere in the document body
// and the page captures input, so no target input element is specified
export type EnterTextAtPageScope = {
actionName: "EnterTextAtPageScope";
parameters: {
// the shortName of the UserIntentParameter to use for this value
textParameter: string;
};
};

export type SelectValueFromDropdown = {
actionName: "selectValueFromDropdown";
parameters: {
Expand All @@ -71,11 +81,11 @@ export type SelectValueFromDropdown = {
};
};

export type ClickOnButton = {
actionName: "clickOnButton";
export type ClickOnElement = {
actionName: "clickOnElement";
parameters: {
// the shortName of the UserIntentParameter to use for this value
buttonTextParameter: string;
elementTextParameter: string;
};
};

Expand All @@ -91,7 +101,7 @@ export type PageManipulationActions =
| SelectElementByText
| EnterText
| SelectValueFromDropdown
| ClickOnButton
| ClickOnElement
| ClickOnLink;

export type PageManipulationActionsList = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@ import {
} from "./schema/userActionsPool.mjs";
import { handleCommerceAction } from "../commerce/actionHandler.mjs";
import {
Button,
DropdownControl,
Element,
NavigationLink,
TextInput,
} from "./schema/pageComponents.mjs";
import {
PageManipulationActions,
Expand Down Expand Up @@ -167,20 +168,21 @@ export function createTempAgentForSchema(

await followLink(link?.linkCssSelector);
break;
case "clickOnButton":
const buttonParameter = targetIntent.parameters.find(
case "clickOnElement":
const elementParameter = targetIntent.parameters.find(
(param) =>
param.shortName ==
step.parameters.buttonTextParameter,
step.parameters.elementTextParameter,
);
const button = (await getComponentFromPage(
"Button",
`button text ${buttonParameter?.name}`,
)) as Button;
await browser.clickOn(button.cssSelector);
await browser.awaitPageInteraction();
await browser.awaitPageLoad();

const element = (await getComponentFromPage(
"Element",
`element text ${elementParameter?.name}`,
)) as Element;
if (element !== undefined) {
await browser.clickOn(element.cssSelector);
await browser.awaitPageInteraction();
await browser.awaitPageLoad();
}
break;
case "enterText":
const textParameter = targetIntent.parameters.find(
Expand All @@ -190,15 +192,15 @@ export function createTempAgentForSchema(
const textElement = (await getComponentFromPage(
"TextInput",
`input label ${textParameter?.name}`,
)) as Button;
)) as TextInput;

const userProvidedTextValue =
action.parameters[step.parameters.textParameter];

if (userProvidedTextValue !== undefined) {
await browser.enterTextIn(
userProvidedTextValue,
textElement.cssSelector,
textElement?.cssSelector,
);
}
break;
Expand Down
91 changes: 59 additions & 32 deletions ts/packages/agents/browser/src/extension/contentScript.ts
Original file line number Diff line number Diff line change
Expand Up @@ -544,8 +544,13 @@ let actionIndex = 1;
let recordedActionHtml: string = "";
let recordedActionScreenshot: string = "";

function startRecording() {
async function startRecording() {
if (recording) return;

await chrome.runtime.sendMessage({
type: "clearRecordedActions",
});

recording = true;
recordedActions = [];
actionIndex = 1;
Expand All @@ -559,8 +564,6 @@ function startRecording() {
document.addEventListener("input", recordInput, true);
// document.addEventListener("scroll", recordScroll, true);
document.addEventListener("keyup", recordTextEntry, true);

saveRecordedActions();
}

// Stop recording and return data
Expand All @@ -587,9 +590,6 @@ async function stopRecording() {
recordedActionScreenshot,
recordedActionHtml,
});
chrome.storage.session.remove("recordedActions");
chrome.storage.session.remove("recordedActionScreenshot");
chrome.storage.session.remove("recordedActionHtml");
}

// Record click events
Expand Down Expand Up @@ -634,21 +634,55 @@ function recordInput(event: Event) {
saveRecordedActions();
}

function recordTextEntry(event: Event) {
const target = event.target as HTMLInputElement | HTMLTextAreaElement;
if (target.tagName === "INPUT" || target.tagName === "TEXTAREA") {
function recordTextEntry(event: KeyboardEvent) {
const target = event.target as HTMLElement;
if (
target.tagName === "INPUT" ||
target.tagName === "TEXTAREA" ||
target.isContentEditable
) {
let value = target.textContent;
if (
target instanceof HTMLInputElement ||
target instanceof HTMLTextAreaElement
) {
value = target.value;
}

const action = {
id: actionIndex++,
type: "textInput",
timestamp: Date.now(),
tag: target.tagName,
selector: getCSSSelector(target),
boundingBox: getBoundingBox(target),
value: target.value, // Capture final text value
value: value, // Capture final text value
};

recordedActions.push(action);
}
if (target.tagName === "BODY") {
if (
recordedActions.length > 0 &&
recordedActions[recordedActions.length - 1].type ===
"pageLevelTextInput"
) {
// accumulate entered text value
recordedActions[recordedActions.length - 1].value += event.key;
} else {
const action = {
id: actionIndex++,
type: "pageLevelTextInput",
timestamp: Date.now(),
tag: target.tagName,
selector: "body",
boundingBox: getBoundingBox(target),
value: event.key,
};

recordedActions.push(action);
}
}

saveRecordedActions();
}
Expand Down Expand Up @@ -730,33 +764,15 @@ async function captureAnnotatedScreenshot() {
};
}

function saveRecordedActions() {
chrome.storage.session.set({
async function saveRecordedActions() {
await chrome.runtime.sendMessage({
type: "saveRecordedActions",
recordedActions,
recordedActionScreenshot,
recordedActionHtml,
});
}

// Restore actions if page is refreshed
chrome.storage.session.get("recordedActions", (data) => {
if (data !== undefined && data.recordedActions) {
recordedActions = data.recordedActions;
}
});

chrome.storage.session.get("recordedActionScreenshot", (data) => {
if (data !== undefined && data.recordedActionScreenshot) {
recordedActionScreenshot = data.recordedActionScreenshot;
}
});

chrome.storage.session.get("recordedActionHtml", (data) => {
if (data !== undefined && data.recordedActionHtml) {
recordedActionHtml = data.recordedActionHtml;
}
});

// Detect navigation and push it as an action
window.addEventListener("beforeunload", recordNavigation);
window.addEventListener("popstate", recordNavigation);
Expand Down Expand Up @@ -966,7 +982,7 @@ async function handleScriptAction(
break;
}
case "startRecording": {
startRecording();
await startRecording();
sendResponse({});
break;
}
Expand Down Expand Up @@ -1021,4 +1037,15 @@ window.addEventListener(

// When the DOM is ready, ask the extension runtime for any previously saved
// recording state and copy it into the module-level recording variables.
document.addEventListener("DOMContentLoaded", async () => {
    console.log("Content Script initialized");

    // Restore actions e.g. if page is refreshed
    try {
        const restoredData = await chrome.runtime.sendMessage({
            type: "getRecordedActions",
        });

        if (restoredData) {
            // Fall back to the empty defaults when a field is missing so that
            // later recordedActions.push(...) calls never see undefined.
            recordedActions = restoredData.recordedActions ?? [];
            recordedActionScreenshot =
                restoredData.recordedActionScreenshot ?? "";
            recordedActionHtml = restoredData.recordedActionHtml ?? "";
        }
    } catch (e: unknown) {
        // sendMessage rejects when nothing handles the message; keep the
        // current (empty) recording state instead of leaving an unhandled
        // promise rejection in the async listener.
        console.error("Failed to restore recorded actions", e);
    }
});
Loading

0 comments on commit 9c1dbb4

Please sign in to comment.