Skip to content

Commit

Permalink
Infer action sequence from page events recording (#779)
Browse files Browse the repository at this point in the history
  • Loading branch information
hillary-mutisya authored Mar 4, 2025
1 parent a141884 commit 6bc52ea
Show file tree
Hide file tree
Showing 5 changed files with 214 additions and 10 deletions.
37 changes: 32 additions & 5 deletions ts/packages/agents/browser/src/agent/discovery/actionHandler.mts
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ export async function handleSchemaDiscoveryAction(
async function handleGetIntentFromReccording(action: any) {
const timerName = `Getting intent schema`;
console.time(timerName);
const response = await agent.getIntentSchemaFromRecording(
const intentResponse = await agent.getIntentSchemaFromRecording(
action.parameters.recordedActionName,
action.parameters.recordedActionDescription,
action.parameters.recordedActionSteps,
Expand All @@ -283,17 +283,44 @@ export async function handleSchemaDiscoveryAction(
"",
);

if (!response.success) {
if (!intentResponse.success) {
console.error("Attempt to process recorded action failed");
console.error(response.message);
console.error(intentResponse.message);
message = "Action could not be completed";
return;
}

console.timeEnd(timerName);
message = "Intent schema: \n" + JSON.stringify(response.data, null, 2);
message =
"Intent schema: \n" + JSON.stringify(intentResponse.data, null, 2);

return response.data;
const timerName2 = `Getting action schema`;
console.time(timerName2);
const stepsResponse = await agent.getActionStepsSchemaFromRecording(
action.parameters.recordedActionName,
action.parameters.recordedActionDescription,
intentResponse.data,
action.parameters.recordedActionSteps,
action.parameters.htmlFragments,
// action.parameters.screenshot,
"",
);

if (!stepsResponse.success) {
console.error("Attempt to process recorded action failed");
console.error(stepsResponse.message);
message = "Action could not be completed";
return {
intent: intentResponse.data,
};
}

console.timeEnd(timerName2);

return {
intent: intentResponse.data,
actions: stepsResponse.data,
};
}

//
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ export type UserIntentParameter = {
name: string;
type: string;
// The default value for the parameter. If this value is set based on a HTML
// page, check whether the target element has a default value
// page, check whether the target element has a default value. For dropdown elements, use the
// selected value for this entry
defaultValue?: any;
description: string;
// Indicates whether a parameter is required. If a parameter has a default value
Expand All @@ -23,3 +24,67 @@ export type UserIntent = {
// a concise list of the parameters that should be captured from the user in order to implement this action
parameters: UserIntentParameter[];
};

// Describes a named web plan: a plan name, a free-text description, and a
// parameters object carrying an action name and a steps-list identifier.
// NOTE(review): presumably stepsListId refers to a stored
// PageManipulationActionsList - confirm against the code that consumes this type.
export type WebPlan = {
webPlanName: string;
description: string;
parameters: {
actionName: string;
stepsListId: string;
};
};

// A page manipulation step tagged with actionName "selectElementByText".
export type SelectElementByText = {
actionName: "selectElementByText";
parameters: {
// the shortName of the UserIntentParameter to use for this value
text: string;
// optional. NOTE(review): presumably restricts the kind of element to match - confirm
elementType?: string;
};
};

// A page manipulation step tagged with actionName "enterText".
export type EnterText = {
actionName: "enterText";
parameters: {
// the shortName of the UserIntentParameter to use for this value
text: string;
};
};

// A page manipulation step tagged with actionName "selectValueFromDropdown".
export type SelectValueFromDropdown = {
actionName: "selectValueFromDropdown";
parameters: {
// the shortName of the UserIntentParameter to use for this value
valueTextParameter: string;
};
};

// A page manipulation step tagged with actionName "clickOnButton".
export type ClickOnButton = {
actionName: "clickOnButton";
parameters: {
// the shortName of the UserIntentParameter to use for this value
buttonText: string;
};
};

// A page manipulation step tagged with actionName "ClickOnLink".
export type ClickOnLink = {
actionName: "ClickOnLink";
parameters: {
// the shortName of the UserIntentParameter to use for this value
linkTextParameter: string;
};
};

// Union of every step type that may appear in PageManipulationActionsList.steps.
// Each member carries a distinct actionName literal that identifies the step type.
export type PageManipulationActions =
| SelectElementByText
| EnterText
| SelectValueFromDropdown
| ClickOnButton
| ClickOnLink;

// A named plan made up of a list of page manipulation steps.
// NOTE(review): intentSchemaName presumably names the UserIntent schema whose
// parameters the steps refer to - confirm against the producer of this type.
export type PageManipulationActionsList = {
planName: string;
description: string;
intentSchemaName: string;
steps: PageManipulationActions[];
};
96 changes: 96 additions & 0 deletions ts/packages/agents/browser/src/agent/discovery/translator.mts
Original file line number Diff line number Diff line change
Expand Up @@ -750,4 +750,100 @@ export class SchemaDiscoveryAgent<T extends object> {
]);
return response;
}

/**
 * Asks the bootstrap translator for a "PageManipulationActionsList" that
 * describes the steps of a recorded user action.
 *
 * The schema text is read from recordedActions.mts (resolved relative to this
 * module), combined with the prefix, screenshot and HTML prompt sections, the
 * user's description of the task, the intent schema and - when supplied - the
 * recorded steps. All sections are JSON-stringified into a single user message.
 *
 * @param recordedActionName name of the recorded action, quoted in the prompt
 * @param recordedActionDescription user-provided description of the task
 * @param intentSchema value serialized into the prompt as the action's parameters
 * @param recordedActionSteps recorded steps; the steps section is omitted when falsy
 * @param fragments passed to both the screenshot and the HTML prompt section builders
 * @param screenshot passed to the screenshot prompt section builder
 * @returns whatever the bootstrap translator's translate() call resolves to
 */
async getActionStepsSchemaFromRecording(
    recordedActionName: string,
    recordedActionDescription: string,
    intentSchema?: any,
    recordedActionSteps?: string,
    fragments?: HtmlFragments[],
    screenshot?: string,
) {
    // Locate the schema source file relative to this module and load it as text.
    const relativeSchemaPath = path.join(
        path.join("..", "..", ".."),
        "./src/agent/discovery/schema/recordedActions.mts",
    );
    const schemaFileUrl = new URL(relativeSchemaPath, import.meta.url);
    const schemaText = await fs.promises.readFile(
        fileURLToPath(schemaFileUrl),
        "utf8",
    );

    const translator = this.getBootstrapTranslator(
        "PageManipulationActionsList",
        schemaText,
    );

    const screenshotParts = getScreenshotPromptSection(screenshot, fragments);
    const htmlParts = getHtmlPromptSection(fragments);
    const prefixParts = getBootstrapPrefixPromptSection();

    // Always present: the task description plus the serialized intent schema.
    const taskPart = {
        type: "text",
        text: `
The user provided an example of how they would complete the ${recordedActionName} action on the webpage.
They provided a description of the task below:
'''
${recordedActionDescription}
'''
Here is a JSON representation of the parameters that a user can provide when invoking the ${recordedActionName} action.
'''
${JSON.stringify(intentSchema, undefined, 2)}
'''
`,
    };

    // Only present when the caller supplied recorded steps.
    const recordedStepsParts = recordedActionSteps
        ? [
              {
                  type: "text",
                  text: `
Here are the recorded steps that the user went through on the webpage to complete the action.
'''
${recordedActionSteps}
'''
`,
              },
          ]
        : [];

    const allParts = [
        ...prefixParts,
        ...screenshotParts,
        ...htmlParts,
        {
            type: "text",
            text: `
Examine the layout information provided as well as the user action information. Based on this
generate a SINGLE "${translator.validator.getTypeName()}" response using the typescript schema below.
'''
${translator.validator.getSchemaText()}
'''
`,
        },
        taskPart,
        ...recordedStepsParts,
        {
            type: "text",
            text: `
The following is the COMPLETE JSON response object with 2 spaces of indentation and no properties with the value undefined:
`,
        },
    ];

    // The whole prompt travels as one JSON-encoded user message.
    return await translator.translate("", [
        { role: "user", content: JSON.stringify(allParts) },
    ]);
}
}
5 changes: 4 additions & 1 deletion ts/packages/agents/browser/src/extension/serviceWorker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1548,7 +1548,10 @@ chrome.runtime.onMessage.addListener(
},
});

sendResponse({ schema: schemaResult });
sendResponse({
intent: schemaResult.intent,
actions: schemaResult.actions,
});
break;
}
case "startRecording": {
Expand Down
19 changes: 16 additions & 3 deletions ts/packages/agents/browser/src/extension/sidepanel.ts
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ function renderTimeline(action: any, index: number) {
<a class="nav-link" data-bs-toggle="tab" href="#intentTab${index}">Intent</a>
</li>
<li class="nav-item">
<a class="nav-link" data-bs-toggle="tab" href="#planTab${index}">Plan</a>
<a class="nav-link" data-bs-toggle="tab" href="#planTab${index}">Actions</a>
</li>
</ul>
<button id="processAction" class="btn btn-sm btn-outline-primary" style="border:0px" title="Process Action">
Expand Down Expand Up @@ -321,6 +321,10 @@ function renderTimeline(action: any, index: number) {
"#intentContent",
)! as HTMLElement;

const actionsViewContainer = timelineHeader.querySelector(
"#planContent",
)! as HTMLElement;

processActionButton.style.display = "block";
processActionButton.addEventListener("click", () =>
getIntentFromRecording(
Expand Down Expand Up @@ -353,11 +357,20 @@ function renderTimeline(action: any, index: number) {
}

const card = document.createElement("div");
const intentSchema = JSON.stringify(response.intent, null, 2);

card.innerHTML = `
<pre class="card-text"><code class="language-json">${JSON.stringify(response.schema, null, 2)}</code></pre>
<pre class="card-text"><code class="language-json">${intentSchema}</code></pre>
`;

intentViewContainer.replaceChildren(card);

const actionsCard = document.createElement("div");
actionsCard.innerHTML = `
<pre class="card-text"><code class="language-json">${JSON.stringify(response.actions, null, 2)}</code></pre>
`;

intentViewContainer.appendChild(card);
actionsViewContainer.replaceChildren(actionsCard);
}

userActionsListContainer.appendChild(timelineHeader);
Expand Down

0 comments on commit 6bc52ea

Please sign in to comment.