Infer action sequence from page events recording (#779)

microsoft · Mar 4, 2025 · 6bc52ea · 6bc52ea
1 parent a141884
commit 6bc52ea
Show file tree

Hide file tree

Showing 5 changed files with 214 additions and 10 deletions.
diff --git a/ts/packages/agents/browser/src/agent/discovery/actionHandler.mts b/ts/packages/agents/browser/src/agent/discovery/actionHandler.mts
@@ -274,7 +274,7 @@ export async function handleSchemaDiscoveryAction(
     async function handleGetIntentFromReccording(action: any) {
         const timerName = `Getting intent schema`;
         console.time(timerName);
-        const response = await agent.getIntentSchemaFromRecording(
+        const intentResponse = await agent.getIntentSchemaFromRecording(
             action.parameters.recordedActionName,
             action.parameters.recordedActionDescription,
             action.parameters.recordedActionSteps,
@@ -283,17 +283,44 @@ export async function handleSchemaDiscoveryAction(
             "",
         );
 
-        if (!response.success) {
+        if (!intentResponse.success) {
             console.error("Attempt to process recorded action failed");
-            console.error(response.message);
+            console.error(intentResponse.message);
             message = "Action could not be completed";
             return;
         }
 
         console.timeEnd(timerName);
-        message = "Intent schema: \n" + JSON.stringify(response.data, null, 2);
+        message =
+            "Intent schema: \n" + JSON.stringify(intentResponse.data, null, 2);
 
-        return response.data;
+        const timerName2 = `Getting action schema`;
+        console.time(timerName2);
+        const stepsResponse = await agent.getActionStepsSchemaFromRecording(
+            action.parameters.recordedActionName,
+            action.parameters.recordedActionDescription,
+            intentResponse.data,
+            action.parameters.recordedActionSteps,
+            action.parameters.htmlFragments,
+            // action.parameters.screenshot,
+            "",
+        );
+
+        if (!stepsResponse.success) {
+            console.error("Attempt to process recorded action failed");
+            console.error(stepsResponse.message);
+            message = "Action could not be completed";
+            return {
+                intent: intentResponse.data,
+            };
+        }
+
+        console.timeEnd(timerName2);
+
+        return {
+            intent: intentResponse.data,
+            actions: stepsResponse.data,
+        };
     }
 
     //

diff --git a/ts/packages/agents/browser/src/agent/discovery/schema/recordedActions.mts b/ts/packages/agents/browser/src/agent/discovery/schema/recordedActions.mts
@@ -9,7 +9,8 @@ export type UserIntentParameter = {
     name: string;
     type: string;
     // The default value for the parameter. If this value is set based on a HTML
-    // page, check whether the target element has a default value
+    // page, check whether the target element has a default value. For dropdown elements, use the
+    // selected value for this entry
     defaultValue?: any;
     description: string;
     // Indicates whether a parameter is required. If a parameter has a default value
@@ -23,3 +24,67 @@ export type UserIntent = {
     // a concise list of the parameters that should be captured from the user in order to implement this action
     parameters: UserIntentParameter[];
 };
+
+export type WebPlan = {
+    webPlanName: string;
+    description: string;
+    parameters: {
+        actionName: string;
+        stepsListId: string;
+    };
+};
+
+export type SelectElementByText = {
+    actionName: "selectElementByText";
+    parameters: {
+        // the shortName of the UserIntentParameter to use for this value
+        text: string;
+        elementType?: string;
+    };
+};
+
+export type EnterText = {
+    actionName: "enterText";
+    parameters: {
+        // the shortName of the UserIntentParameter to use for this value
+        text: string;
+    };
+};
+
+export type SelectValueFromDropdown = {
+    actionName: "selectValueFromDropdown";
+    parameters: {
+        // the shortName of the UserIntentParameter to use for this value
+        valueTextParameter: string;
+    };
+};
+
+export type ClickOnButton = {
+    actionName: "clickOnButton";
+    parameters: {
+        // the shortName of the UserIntentParameter to use for this value
+        buttonText: string;
+    };
+};
+
+export type ClickOnLink = {
+    actionName: "clickOnLink";
+    parameters: {
+        // the shortName of the UserIntentParameter to use for this value
+        linkTextParameter: string;
+    };
+};
+
+export type PageManipulationActions =
+    | SelectElementByText
+    | EnterText
+    | SelectValueFromDropdown
+    | ClickOnButton
+    | ClickOnLink;
+
+export type PageManipulationActionsList = {
+    planName: string;
+    description: string;
+    intentSchemaName: string;
+    steps: PageManipulationActions[];
+};
diff --git a/ts/packages/agents/browser/src/agent/discovery/translator.mts b/ts/packages/agents/browser/src/agent/discovery/translator.mts
@@ -750,4 +750,100 @@ export class SchemaDiscoveryAgent<T extends object> {
         ]);
         return response;
     }
+
+    /**
+     * Asks the bootstrap translator for a single "PageManipulationActionsList"
+     * describing a recorded user action.
+     *
+     * The prompt is assembled, in order, from: the bootstrap prefix sections, the
+     * screenshot sections, the HTML sections, an instruction block embedding the
+     * validator's type name and schema text, the request sections (task
+     * description + intent schema JSON, then the recorded steps when provided),
+     * and a closing instruction about the JSON response format.
+     *
+     * @param recordedActionName - Name of the recorded action; interpolated into the prompt.
+     * @param recordedActionDescription - Task description; interpolated into the prompt.
+     * @param intentSchema - Serialized into the prompt with JSON.stringify. When omitted,
+     *   JSON.stringify returns undefined and the template renders the text "undefined".
+     * @param recordedActionSteps - Recorded steps text; its prompt section is skipped when falsy.
+     * @param fragments - Passed to both getScreenshotPromptSection and getHtmlPromptSection.
+     * @param screenshot - Passed to getScreenshotPromptSection.
+     * @returns The value resolved by bootstrapTranslator.translate, unmodified.
+     */
+    async getActionStepsSchemaFromRecording(
+        recordedActionName: string,
+        recordedActionDescription: string,
+        intentSchema?: any,
+        recordedActionSteps?: string,
+        fragments?: HtmlFragments[],
+        screenshot?: string,
+    ) {
+        // The schema source file is resolved relative to this module's URL and
+        // read from disk as UTF-8 text on every call.
+        const packageRoot = path.join("..", "..", "..");
+        const resultsSchema = await fs.promises.readFile(
+            fileURLToPath(
+                new URL(
+                    path.join(
+                        packageRoot,
+                        "./src/agent/discovery/schema/recordedActions.mts",
+                    ),
+                    import.meta.url,
+                ),
+            ),
+            "utf8",
+        );
+
+        // NOTE(review): the whole file text is handed over as the schema, so
+        // comments in recordedActions.mts presumably reach the model — confirm
+        // before rewording them.
+        const bootstrapTranslator = this.getBootstrapTranslator(
+            "PageManipulationActionsList",
+            resultsSchema,
+        );
+
+        const screenshotSection = getScreenshotPromptSection(
+            screenshot,
+            fragments,
+        );
+        const htmlSection = getHtmlPromptSection(fragments);
+        const prefixSection = getBootstrapPrefixPromptSection();
+        // Request-specific sections: task description + intent parameters
+        // first, then the optional recorded steps.
+        let requestSection = [];
+        requestSection.push({
+            type: "text",
+            text: `
+               
+            The user provided an example of how they would complete the ${recordedActionName} action on the webpage. 
+            They provided a description of the task below:
+            '''
+            ${recordedActionDescription}
+            '''
+
+            Here is a JSON representation of the parameters that a user can provide when invoking the ${recordedActionName} action.
+
+            '''
+            ${JSON.stringify(intentSchema, undefined, 2)}
+            '''
+
+            `,
+        });
+
+        // Only mention recorded steps when the caller supplied a non-empty value.
+        if (recordedActionSteps) {
+            requestSection.push({
+                type: "text",
+                text: `
+               
+            Here are the recorded steps that the user went through on the webpage to complete the action.
+            '''
+            ${recordedActionSteps}
+            '''
+            `,
+            });
+        }
+
+        const promptSections = [
+            ...prefixSection,
+            ...screenshotSection,
+            ...htmlSection,
+            {
+                type: "text",
+                text: `
+        Examine the layout information provided as well as the user action information. Based on this
+        generate a SINGLE "${bootstrapTranslator.validator.getTypeName()}" response using the typescript schema below.
+                
+        '''
+        ${bootstrapTranslator.validator.getSchemaText()}
+        '''
+        `,
+            },
+            ...requestSection,
+            {
+                type: "text",
+                text: `
+        The following is the COMPLETE JSON response object with 2 spaces of indentation and no properties with the value undefined:            
+        `,
+            },
+        ];
+
+        // The section array is serialized to one JSON string and sent as the
+        // content of a single user message; the request argument itself is "".
+        const response = await bootstrapTranslator.translate("", [
+            { role: "user", content: JSON.stringify(promptSections) },
+        ]);
+        return response;
+    }
 }
diff --git a/ts/packages/agents/browser/src/extension/serviceWorker.ts b/ts/packages/agents/browser/src/extension/serviceWorker.ts
@@ -1548,7 +1548,10 @@ chrome.runtime.onMessage.addListener(
                         },
                     });
 
-                    sendResponse({ schema: schemaResult });
+                    sendResponse({
+                        intent: schemaResult.intent,
+                        actions: schemaResult.actions,
+                    });
                     break;
                 }
                 case "startRecording": {

diff --git a/ts/packages/agents/browser/src/extension/sidepanel.ts b/ts/packages/agents/browser/src/extension/sidepanel.ts
@@ -270,7 +270,7 @@ function renderTimeline(action: any, index: number) {
                                             <a class="nav-link" data-bs-toggle="tab" href="#intentTab${index}">Intent</a>
                                             </li>
                                             <li class="nav-item">
-                                            <a class="nav-link" data-bs-toggle="tab" href="#planTab${index}">Plan</a>
+                                            <a class="nav-link" data-bs-toggle="tab" href="#planTab${index}">Actions</a>
                                             </li>
                                         </ul>
                                     <button id="processAction" class="btn btn-sm btn-outline-primary" style="border:0px" title="Process Action">
@@ -321,6 +321,10 @@ function renderTimeline(action: any, index: number) {
         "#intentContent",
     )! as HTMLElement;
 
+    const actionsViewContainer = timelineHeader.querySelector(
+        "#planContent",
+    )! as HTMLElement;
+
     processActionButton.style.display = "block";
     processActionButton.addEventListener("click", () =>
         getIntentFromRecording(
@@ -353,11 +357,20 @@ function renderTimeline(action: any, index: number) {
         }
 
         const card = document.createElement("div");
+        const intentSchema = JSON.stringify(response.intent, null, 2);
+
         card.innerHTML = `        
-            <pre class="card-text"><code class="language-json">${JSON.stringify(response.schema, null, 2)}</code></pre>
+            <pre class="card-text"><code class="language-json">${intentSchema}</code></pre>
+        `;
+
+        intentViewContainer.replaceChildren(card);
+
+        const actionsCard = document.createElement("div");
+        actionsCard.innerHTML = `        
+            <pre class="card-text"><code class="language-json">${JSON.stringify(response.actions, null, 2)}</code></pre>
         `;
 
-        intentViewContainer.appendChild(card);
+        actionsViewContainer.replaceChildren(actionsCard);
     }
 
     userActionsListContainer.appendChild(timelineHeader);