From f59b5acb4229fd8cddb048618e1f8050f0c8f334 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Tue, 7 Jan 2025 14:36:07 -0700
Subject: [PATCH 1/6] fix: updates based on live API

---
 src/agentTester.ts                            | 34 +++++++++----------
 src/index.ts                                  |  2 +-
 test/agentTester.test.ts                      | 20 +++++------
 ...ions_runs_4KBSM000000003F4AQ_results.json} |  0
 4 files changed, 28 insertions(+), 28 deletions(-)
 rename test/mocks/{einstein_ai-evaluations_runs_4KBSM000000003F4AQ_details.json => einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json} (100%)
diff --git a/src/agentTester.ts b/src/agentTester.ts
index 41c9dd6..f97c64c 100644
--- a/src/agentTester.ts
+++ b/src/agentTester.ts
@@ -53,7 +53,7 @@ export type TestCaseResult = {
   }>;
 };
 
-export type AgentTestDetailsResponse = {
+export type AgentTestResultsResponse = {
   status: TestStatus;
   startTime: string;
   endTime?: string;
@@ -106,7 +106,7 @@ export class AgentTester {
    *
    * @param {string} jobId
    * @param {Duration} timeout
-   * @returns {Promise<AgentTestDetailsResponse>}
+   * @returns {Promise<AgentTestResultsResponse>}
    */
   public async poll(
     jobId: string,
@@ -117,17 +117,17 @@ export class AgentTester {
     } = {
       timeout: Duration.minutes(5),
     }
-  ): Promise<AgentTestDetailsResponse> {
+  ): Promise<AgentTestResultsResponse> {
     const frequency = env.getNumber('SF_AGENT_TEST_POLLING_FREQUENCY_MS', 1000);
     const lifecycle = Lifecycle.getInstance();
     const client = await PollingClient.create({
       poll: async (): Promise<StatusResult> => {
         // NOTE: we don't actually need to call the status API here since all the same information is present on the
         // details API. We could just call the details API and check the status there.
-        const [detailsResponse, statusResponse] = await Promise.all([this.details(jobId), this.status(jobId)]);
-        const totalTestCases = detailsResponse.testSet.testCases.length;
-        const failingTestCases = detailsResponse.testSet.testCases.filter((tc) => tc.status === 'ERROR').length;
-        const passingTestCases = detailsResponse.testSet.testCases.filter(
+        const [resultsResponse, statusResponse] = await Promise.all([this.results(jobId), this.status(jobId)]);
+        const totalTestCases = resultsResponse.testSet.testCases.length;
+        const failingTestCases = resultsResponse.testSet.testCases.filter((tc) => tc.status === 'ERROR').length;
+        const passingTestCases = resultsResponse.testSet.testCases.filter(
           (tc) => tc.status === 'COMPLETED' && tc.expectationResults.every((r) => r.result === 'Passed')
         ).length;
 
@@ -139,7 +139,7 @@ export class AgentTester {
             failingTestCases,
             passingTestCases,
           });
-          return { payload: detailsResponse, completed: true };
+          return { payload: resultsResponse, completed: true };
         }
 
         await lifecycle.emit('AGENT_TEST_POLLING_EVENT', {
@@ -155,19 +155,19 @@ export class AgentTester {
       timeout,
     });
 
-    return client.subscribe<AgentTestDetailsResponse>();
+    return client.subscribe<AgentTestResultsResponse>();
   }
 
   /**
    * Request test run details
    *
    * @param {string} jobId
-   * @returns {Promise<AgentTestDetailsResponse>}
+   * @returns {Promise<AgentTestResultsResponse>}
    */
-  public async details(jobId: string): Promise<AgentTestDetailsResponse> {
-    const url = `/einstein/ai-evaluations/runs/${jobId}/details`;
+  public async results(jobId: string): Promise<AgentTestResultsResponse> {
+    const url = `/einstein/ai-evaluations/runs/${jobId}/results`;
 
-    return this.maybeMock.request<AgentTestDetailsResponse>('GET', url);
+    return this.maybeMock.request<AgentTestResultsResponse>('GET', url);
   }
 
   /**
@@ -246,7 +246,7 @@ function makeSimpleTable(data: Record<string, string>, title: string): string {
   return `${title}\n${table}`;
 }
 
-export async function humanFormat(details: AgentTestDetailsResponse): Promise<string> {
+export async function humanFormat(details: AgentTestResultsResponse): Promise<string> {
   const { Ux } = await import('@salesforce/sf-plugins-core');
   const ux = new Ux();
 
@@ -312,11 +312,11 @@ export async function humanFormat(details: AgentTestDetailsResponse): Promise<st
   return tables.join('\n') + `\n${resultsTable}\n\n${failedTestCasesTable}\n`;
 }
 
-export async function jsonFormat(details: AgentTestDetailsResponse): Promise<string> {
+export async function jsonFormat(details: AgentTestResultsResponse): Promise<string> {
   return Promise.resolve(JSON.stringify(details, null, 2));
 }
 
-export async function junitFormat(details: AgentTestDetailsResponse): Promise<string> {
+export async function junitFormat(details: AgentTestResultsResponse): Promise<string> {
   // eslint-disable-next-line import/no-extraneous-dependencies
   const { XMLBuilder } = await import('fast-xml-parser');
   const builder = new XMLBuilder({
@@ -369,7 +369,7 @@ export async function junitFormat(details: AgentTestDetailsResponse): Promise<st
   return `<?xml version="1.0" encoding="UTF-8"?>\n${suites}`.trim();
 }
 
-export async function tapFormat(details: AgentTestDetailsResponse): Promise<string> {
+export async function tapFormat(details: AgentTestResultsResponse): Promise<string> {
   const lines: string[] = [];
   let expectationCount = 0;
   for (const testCase of details.testSet.testCases) {
diff --git a/src/index.ts b/src/index.ts
index 3a5aa53..0d8f5ca 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -20,7 +20,7 @@ export {
   jsonFormat,
   junitFormat,
   tapFormat,
-  type AgentTestDetailsResponse,
+  type AgentTestResultsResponse,
   type AgentTestStartResponse,
   type AgentTestStatusResponse,
   type TestCaseResult,
diff --git a/test/agentTester.test.ts b/test/agentTester.test.ts
index 205a780..6d1e2c0 100644
--- a/test/agentTester.test.ts
+++ b/test/agentTester.test.ts
@@ -8,7 +8,7 @@ import { readFile } from 'node:fs/promises';
 import { expect } from 'chai';
 import { MockTestOrgData, TestContext } from '@salesforce/core/testSetup';
 import { Connection } from '@salesforce/core';
-import { AgentTestDetailsResponse, AgentTester, humanFormat, junitFormat, tapFormat } from '../src/agentTester';
+import { AgentTestResultsResponse, AgentTester, humanFormat, junitFormat, tapFormat } from '../src/agentTester';
 
 describe('AgentTester', () => {
   const $$ = new TestContext();
@@ -62,11 +62,11 @@ describe('AgentTester', () => {
     });
   });
 
-  describe('details', () => {
-    it('should return details of completed test run', async () => {
+  describe('results', () => {
+    it('should return results of completed test run', async () => {
       const tester = new AgentTester(connection);
       await tester.start('suiteId');
-      const output = await tester.details('4KBSM000000003F4AQ');
+      const output = await tester.results('4KBSM000000003F4AQ');
       // TODO: make this assertion more meaningful
       expect(output).to.be.ok;
     });
@@ -84,8 +84,8 @@ describe('AgentTester', () => {
 
 describe('humanFormat', () => {
   it('should transform test results to human readable format', async () => {
-    const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_details.json', 'utf8');
-    const input = JSON.parse(raw) as AgentTestDetailsResponse;
+    const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json', 'utf8');
+    const input = JSON.parse(raw) as AgentTestResultsResponse;
     const output = await humanFormat(input);
     expect(output).to.be.ok;
   });
@@ -93,8 +93,8 @@ describe('humanFormat', () => {
 
 describe('junitFormatter', () => {
   it('should transform test results to JUnit format', async () => {
-    const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_details.json', 'utf8');
-    const input = JSON.parse(raw) as AgentTestDetailsResponse;
+    const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json', 'utf8');
+    const input = JSON.parse(raw) as AgentTestResultsResponse;
     const output = await junitFormat(input);
     expect(output).to.deep.equal(`<?xml version="1.0" encoding="UTF-8"?>
 <testsuites name="Copilot_for_Salesforce" tests="2" failures="1" time="20000">
@@ -112,8 +112,8 @@ describe('junitFormatter', () => {
 
 describe('tapFormatter', () => {
   it('should transform test results to TAP format', async () => {
-    const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_details.json', 'utf8');
-    const input = JSON.parse(raw) as AgentTestDetailsResponse;
+    const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json', 'utf8');
+    const input = JSON.parse(raw) as AgentTestResultsResponse;
     const output = await tapFormat(input);
     expect(output).to.deep.equal(`Tap Version 14
 1..6
diff --git a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_details.json b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json
similarity index 100%
rename from test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_details.json
rename to test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json

From e198c77a6287550571593d04aa548b0db042e885 Mon Sep 17 00:00:00 2001
From: svc-cli-bot <Svc_cli_bot@salesforce.com>
Date: Tue, 7 Jan 2025 21:37:42 +0000
Subject: [PATCH 2/6] chore(release): 0.5.10-dev.0 [skip ci]

---
 package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/package.json b/package.json
index e025492..4abfbac 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "@salesforce/agents",
   "description": "Client side APIs for working with Salesforce agents",
-  "version": "0.5.9",
+  "version": "0.5.10-dev.0",
   "license": "BSD-3-Clause",
   "author": "Salesforce",
   "main": "lib/index",

From 7757c162313559169ccc31fb88367114bf8b8b86 Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Fri, 10 Jan 2025 09:56:56 -0700
Subject: [PATCH 3/6] fix: api updates

---
 CONTRIBUTING.md                               | 22 +++-----
 src/agentTester.ts                            | 53 +++++++++++--------
 ...tions_runs_4KBSM000000003F4AQ_results.json | 20 ++++---
 3 files changed, 47 insertions(+), 48 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 73226ad..5f994e9 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,16 +1,16 @@
 ## Contributing
 
 1. Familiarize yourself with the codebase by reading the docs, in
-   particular the [developing](./contributing/developing.md) doc.
-1. Create a new issue before starting your project so that we can keep track of
+   particular the [developing](./developing.md) doc.
+2. Create a new issue before starting your project so that we can keep track of
    what you're trying to add/fix. That way, we can also offer suggestions or
    let you know if there is already an effort in progress.
-1. Fork this repository.
-1. Set up your environment using the information in the [developing](./contributing/developing.md) doc.
-1. Create a _topic_ branch in your fork based on the correct branch (usually the **develop** branch, see [Branches section](./contributing/developing.md)). Note: this step is recommended but technically not required if contributing using a fork.
-1. Edit the code in your fork.
-1. Sign the CLA (see [CLA](#cla)).
-1. Send us a pull request when you're done. We'll review your code, suggest any
+3. Fork this repository.
+4. Set up your environment using the information in the [developing](./developing.md) doc.
+5. Create a _topic_ branch in your fork based on the correct branch (usually the **develop** branch, see [Branches section](./developing.md)). Note: this step is recommended but technically not required if contributing using a fork.
+6. Edit the code in your fork.
+7. Sign the CLA (see [CLA](#cla)).
+8. Send us a pull request when you're done. We'll review your code, suggest any
    needed changes, and merge it in.
 
 ## Pull Requests
@@ -31,9 +31,3 @@ Agreement. You can do so by going to <https://cla.salesforce.com/sign-cla>.
 ### Merging Pull Requests
 
 Pull request merging is restricted to squash and merge only.
-
-## Helpful Resources
-
-- All of the files in the [contributing](./contributing) folder have useful information, particularly the previously-mentioned [developing](./contributing/developing.md) doc.
-- The [Source-Deploy-Retrieve Handbook](./HANDBOOK.md) contains an overview of all of the code in this project. This easy-to-read document can serve as an introduction and overview of the code and concepts, or as a reference for what a given module accomplishes and why it was designed.
-- The [API documentation](https://forcedotcom.github.io/source-deploy-retrieve/) has details on using the classes and methods.
diff --git a/src/agentTester.ts b/src/agentTester.ts
index f97c64c..9b307a0 100644
--- a/src/agentTester.ts
+++ b/src/agentTester.ts
@@ -25,14 +25,14 @@ export type AgentTestStatusResponse = {
 
 export type TestCaseResult = {
   status: TestStatus;
-  number: string;
   utterance: string;
+  utterence: string;
   startTime: string;
   endTime?: string;
   generatedData: {
     type: 'AGENT';
     actionsSequence: string[];
-    outcome: 'Success' | 'Failure';
+    outcome: string;
     topic: string;
     inputTokensCount: string;
     outputTokensCount: string;
@@ -42,7 +42,7 @@ export type TestCaseResult = {
     actualValue: string;
     expectedValue: string;
     score: number;
-    result: 'Passed' | 'Failed';
+    result: 'PASS' | 'FAIL';
     metricLabel: 'Accuracy' | 'Precision';
     metricExplainability: string;
     status: TestStatus;
@@ -122,19 +122,19 @@ export class AgentTester {
     const lifecycle = Lifecycle.getInstance();
     const client = await PollingClient.create({
       poll: async (): Promise<StatusResult> => {
-        // NOTE: we don't actually need to call the status API here since all the same information is present on the
-        // details API. We could just call the details API and check the status there.
-        const [resultsResponse, statusResponse] = await Promise.all([this.results(jobId), this.status(jobId)]);
+        const resultsResponse = await this.results(jobId);
         const totalTestCases = resultsResponse.testSet.testCases.length;
-        const failingTestCases = resultsResponse.testSet.testCases.filter((tc) => tc.status === 'ERROR').length;
         const passingTestCases = resultsResponse.testSet.testCases.filter(
-          (tc) => tc.status === 'COMPLETED' && tc.expectationResults.every((r) => r.result === 'Passed')
+          (tc) => tc.status === 'COMPLETED' && tc.expectationResults.every((r) => r.result === 'PASS')
+        ).length;
+        const failingTestCases = resultsResponse.testSet.testCases.filter(
+          (tc) => ['ERROR', 'COMPLETED'].includes(tc.status) && tc.expectationResults.some((r) => r.result === 'FAIL')
         ).length;
 
-        if (statusResponse.status.toLowerCase() === 'completed') {
+        if (resultsResponse.status.toLowerCase() === 'completed') {
           await lifecycle.emit('AGENT_TEST_POLLING_EVENT', {
             jobId,
-            status: statusResponse.status,
+            status: resultsResponse.status,
             totalTestCases,
             failingTestCases,
             passingTestCases,
@@ -144,7 +144,7 @@ export class AgentTester {
 
         await lifecycle.emit('AGENT_TEST_POLLING_EVENT', {
           jobId,
-          status: statusResponse.status,
+          status: resultsResponse.status,
           totalTestCases,
           failingTestCases,
           passingTestCases,
@@ -252,13 +252,16 @@ export async function humanFormat(details: AgentTestResultsResponse): Promise<st
 
   const tables: string[] = [];
   for (const testCase of details.testSet.testCases) {
+    const number = details.testSet.testCases.indexOf(testCase) + 1;
     const table = ux.makeTable({
-      title: `${ansis.bold(`Test Case #${testCase.number}`)}\n${ansis.dim('Utterance')}: ${testCase.utterance}`,
+      title: `${ansis.bold(`Test Case #${number}`)}\n${ansis.dim('Utterance')}: ${
+        testCase.utterance ?? testCase.utterence
+      }`,
       overflow: 'wrap',
       columns: ['test', 'result', { key: 'expected', width: '40%' }, { key: 'actual', width: '40%' }],
       data: testCase.expectationResults.map((r) => ({
         test: humanFriendlyName(r.name),
-        result: r.result === 'Passed' ? ansis.green('Pass') : ansis.red('Fail'),
+        result: r.result === 'PASS' ? ansis.green('Pass') : ansis.red('Fail'),
         expected: r.expectedValue,
         actual: r.actualValue,
       })),
@@ -269,19 +272,19 @@ export async function humanFormat(details: AgentTestResultsResponse): Promise<st
 
   const topicPassCount = details.testSet.testCases.reduce((acc, tc) => {
     const topic = tc.expectationResults.find((r) => r.name === 'topic_sequence_match');
-    return topic?.result === 'Passed' ? acc + 1 : acc;
+    return topic?.result === 'PASS' ? acc + 1 : acc;
   }, 0);
   const topicPassPercent = (topicPassCount / details.testSet.testCases.length) * 100;
 
   const actionPassCount = details.testSet.testCases.reduce((acc, tc) => {
     const action = tc.expectationResults.find((r) => r.name === 'action_sequence_match');
-    return action?.result === 'Passed' ? acc + 1 : acc;
+    return action?.result === 'PASS' ? acc + 1 : acc;
   }, 0);
   const actionPassPercent = (actionPassCount / details.testSet.testCases.length) * 100;
 
   const outcomePassCount = details.testSet.testCases.reduce((acc, tc) => {
     const outcome = tc.expectationResults.find((r) => r.name === 'bot_response_rating');
-    return outcome?.result === 'Passed' ? acc + 1 : acc;
+    return outcome?.result === 'PASS' ? acc + 1 : acc;
   }, 0);
   const outcomePassPercent = (outcomePassCount / details.testSet.testCases.length) * 100;
 
@@ -300,9 +303,9 @@ export async function humanFormat(details: AgentTestResultsResponse): Promise<st
   const failedTestCases = details.testSet.testCases.filter((tc) => tc.status === 'ERROR');
   const failedTestCasesObj = Object.fromEntries(
     Object.entries(failedTestCases).map(([, tc]) => [
-      `Test Case #${tc.number}`,
+      `Test Case #${failedTestCases.indexOf(tc) + 1}`,
       tc.expectationResults
-        .filter((r) => r.result === 'Failed')
+        .filter((r) => r.result === 'FAIL')
         .map((r) => humanFriendlyName(r.name))
         .join(', '),
     ])
@@ -326,7 +329,9 @@ export async function junitFormat(details: AgentTestResultsResponse): Promise<st
   });
 
   const testCount = details.testSet.testCases.length;
-  const failureCount = details.testSet.testCases.filter((tc) => tc.status === 'ERROR').length;
+  const failureCount = details.testSet.testCases.filter(
+    (tc) => ['ERROR', 'COMPLETED'].includes(tc.status) && tc.expectationResults.some((r) => r.result === 'FAIL')
+  ).length;
   const time = details.testSet.testCases.reduce((acc, tc) => {
     if (tc.endTime && tc.startTime) {
       return acc + new Date(tc.endTime).getTime() - new Date(tc.startTime).getTime();
@@ -351,12 +356,12 @@ export async function junitFormat(details: AgentTestResultsResponse): Promise<st
           : 0;
 
         return {
-          $name: `${details.testSet.name}.${testCase.number}`,
+          $name: `${details.testSet.name}.${details.testSet.testCases.indexOf(testCase) + 1}`,
           $time: testCaseTime,
           $assertions: testCase.expectationResults.length,
           failure: testCase.expectationResults
             .map((r) => {
-              if (r.result === 'Failed') {
+              if (r.result === 'FAIL') {
                 return { $message: r.errorMessage ?? 'Unknown error', $name: r.name };
               }
             })
@@ -374,9 +379,11 @@ export async function tapFormat(details: AgentTestResultsResponse): Promise<stri
   let expectationCount = 0;
   for (const testCase of details.testSet.testCases) {
     for (const result of testCase.expectationResults) {
-      const status = result.result === 'Passed' ? 'ok' : 'not ok';
+      const status = result.result === 'PASS' ? 'ok' : 'not ok';
       expectationCount++;
-      lines.push(`${status} ${expectationCount} ${details.testSet.name}.${testCase.number}`);
+      lines.push(
+        `${status} ${expectationCount} ${details.testSet.name}.${details.testSet.testCases.indexOf(testCase) + 1}`
+      );
       if (status === 'not ok') {
         lines.push('  ---');
         lines.push(`  message: ${result.errorMessage ?? 'Unknown error'}`);
diff --git a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json
index b84e3ed..a8a42a4 100644
--- a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json
+++ b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json
@@ -9,8 +9,7 @@
     "testCases": [
       {
         "status": "COMPLETED",
-        "number": 1,
-        "utterance": "Summarize account Acme",
+        "utterence": "Summarize account Acme",
         "startTime": "2024-11-28T12:00:10Z",
         "endTime": "2024-11-28T12:00:20Z",
         "generatedData": {
@@ -27,7 +26,7 @@
             "actualValue": "GeneralCRM",
             "expectedValue": "GeneralCRM",
             "score": 1.0,
-            "result": "Passed",
+            "result": "PASS",
             "metricLabel": "Accuracy",
             "metricExplainability": "Measures the correctness of the result.",
             "status": "Completed",
@@ -41,7 +40,7 @@
             "actualValue": "[\"IdentifyRecordByName\",\"SummarizeRecord\"]",
             "expectedValue": "[\"IdentifyRecordByName\",\"SummarizeRecord\"]",
             "score": 1.0,
-            "result": "Passed",
+            "result": "PASS",
             "metricLabel": "Precision",
             "metricExplainability": "Measures the precision of the result.",
             "status": "Completed",
@@ -55,7 +54,7 @@
             "actualValue": "Here is the summary of the account Acme. How else can I assist you? Acme is a customer since 2019. They have 3 open opportunities and 2 open cases.",
             "expectedValue": "Summary of account details are shown",
             "score": 0.9,
-            "result": "Passed",
+            "result": "PASS",
             "metricLabel": "Precision",
             "metricExplainability": "Measures the precision of the result.",
             "status": "Completed",
@@ -67,10 +66,9 @@
         ]
       },
       {
-        "status": "ERROR",
-        "number": 2,
+        "status": "COMPLETED",
         "startTime": "2024-11-28T12:00:30Z",
-        "utterance": "Summarize the open cases and Activities of acme from sep to nov 2024",
+        "utterence": "Summarize the open cases and Activities of acme from sep to nov 2024",
         "endTime": "2024-11-28T12:00:40Z",
         "generatedData": {
           "type": "AGENT",
@@ -86,7 +84,7 @@
             "actualValue": "GeneralCRM",
             "expectedValue": "GeneralCRM",
             "score": 1,
-            "result": "Passed",
+            "result": "PASS",
             "metricLabel": "Accuracy",
             "metricExplainability": "Measures the correctness of the result.",
             "status": "Completed",
@@ -100,7 +98,7 @@
             "actualValue": "[\"IdentifyRecordByName\",\"QueryRecords\"]",
             "expectedValue": "[\"IdentifyRecordByName\",\"QueryRecords\",\"GetActivitiesTimeline\"]",
             "score": 0.5,
-            "result": "Failed",
+            "result": "FAIL",
             "metricLabel": "Precision",
             "metricExplainability": "Measures the precision of the result.",
             "status": "Completed",
@@ -114,7 +112,7 @@
             "actualValue": "It looks like I am unable to find the information you are looking for due to access restrictions. How else can I assist you?",
             "expectedValue": "Summary of open cases and activities associated with timeline",
             "score": 0.1,
-            "result": "Failed",
+            "result": "FAIL",
             "metricLabel": "Precision",
             "metricExplainability": "Measures the precision of the result.",
             "status": "Completed",

From 5c6147e0edf2eab1250cfd571829f3551ab2ccaf Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Tue, 14 Jan 2025 10:40:02 -0700
Subject: [PATCH 4/6] chore: clean up

---
 src/agentTester.ts                            | 55 ++++++++++++-------
 src/index.ts                                  |  5 +-
 test/agentTester.test.ts                      | 14 ++---
 ...tions_runs_4KBSM000000003F4AQ_results.json |  4 +-
 4 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/src/agentTester.ts b/src/agentTester.ts
index 9b307a0..32129ba 100644
--- a/src/agentTester.ts
+++ b/src/agentTester.ts
@@ -26,7 +26,6 @@ export type AgentTestStatusResponse = {
 export type TestCaseResult = {
   status: TestStatus;
   utterance: string;
-  utterence: string;
   startTime: string;
   endTime?: string;
   generatedData: {
@@ -246,7 +245,25 @@ function makeSimpleTable(data: Record<string, string>, title: string): string {
   return `${title}\n${table}`;
 }
 
-export async function humanFormat(details: AgentTestResultsResponse): Promise<string> {
+export async function convertTestResultsToFormat(
+  results: AgentTestResultsResponse,
+  format: 'human' | 'json' | 'junit' | 'tap'
+): Promise<string> {
+  switch (format) {
+    case 'human':
+      return humanFormat(results);
+    case 'json':
+      return jsonFormat(results);
+    case 'junit':
+      return junitFormat(results);
+    case 'tap':
+      return tapFormat(results);
+    default:
+      throw new Error(`Unsupported format: ${format as string}`);
+  }
+}
+
+async function humanFormat(details: AgentTestResultsResponse): Promise<string> {
   const { Ux } = await import('@salesforce/sf-plugins-core');
   const ux = new Ux();
 
@@ -254,9 +271,7 @@ export async function humanFormat(details: AgentTestResultsResponse): Promise<st
   for (const testCase of details.testSet.testCases) {
     const number = details.testSet.testCases.indexOf(testCase) + 1;
     const table = ux.makeTable({
-      title: `${ansis.bold(`Test Case #${number}`)}\n${ansis.dim('Utterance')}: ${
-        testCase.utterance ?? testCase.utterence
-      }`,
+      title: `${ansis.bold(`Test Case #${number}`)}\n${ansis.dim('Utterance')}: ${testCase.utterance}`,
       overflow: 'wrap',
       columns: ['test', 'result', { key: 'expected', width: '40%' }, { key: 'actual', width: '40%' }],
       data: testCase.expectationResults.map((r) => ({
@@ -315,11 +330,11 @@ export async function humanFormat(details: AgentTestResultsResponse): Promise<st
   return tables.join('\n') + `\n${resultsTable}\n\n${failedTestCasesTable}\n`;
 }
 
-export async function jsonFormat(details: AgentTestResultsResponse): Promise<string> {
-  return Promise.resolve(JSON.stringify(details, null, 2));
+async function jsonFormat(results: AgentTestResultsResponse): Promise<string> {
+  return Promise.resolve(JSON.stringify(results, null, 2));
 }
 
-export async function junitFormat(details: AgentTestResultsResponse): Promise<string> {
+async function junitFormat(results: AgentTestResultsResponse): Promise<string> {
   // eslint-disable-next-line import/no-extraneous-dependencies
   const { XMLBuilder } = await import('fast-xml-parser');
   const builder = new XMLBuilder({
@@ -328,11 +343,11 @@ export async function junitFormat(details: AgentTestResultsResponse): Promise<st
     ignoreAttributes: false,
   });
 
-  const testCount = details.testSet.testCases.length;
-  const failureCount = details.testSet.testCases.filter(
+  const testCount = results.testSet.testCases.length;
+  const failureCount = results.testSet.testCases.filter(
     (tc) => ['ERROR', 'COMPLETED'].includes(tc.status) && tc.expectationResults.some((r) => r.result === 'FAIL')
   ).length;
-  const time = details.testSet.testCases.reduce((acc, tc) => {
+  const time = results.testSet.testCases.reduce((acc, tc) => {
     if (tc.endTime && tc.startTime) {
       return acc + new Date(tc.endTime).getTime() - new Date(tc.startTime).getTime();
     }
@@ -341,22 +356,22 @@ export async function junitFormat(details: AgentTestResultsResponse): Promise<st
 
   const suites = builder.build({
     testsuites: {
-      $name: details.subjectName,
+      $name: results.subjectName,
       $tests: testCount,
       $failures: failureCount,
       $time: time,
       property: [
-        { $name: 'status', $value: details.status },
-        { $name: 'start-time', $value: details.startTime },
-        { $name: 'end-time', $value: details.endTime },
+        { $name: 'status', $value: results.status },
+        { $name: 'start-time', $value: results.startTime },
+        { $name: 'end-time', $value: results.endTime },
       ],
-      testsuite: details.testSet.testCases.map((testCase) => {
+      testsuite: results.testSet.testCases.map((testCase) => {
         const testCaseTime = testCase.endTime
           ? new Date(testCase.endTime).getTime() - new Date(testCase.startTime).getTime()
           : 0;
 
         return {
-          $name: `${details.testSet.name}.${details.testSet.testCases.indexOf(testCase) + 1}`,
+          $name: `${results.testSet.name}.${results.testSet.testCases.indexOf(testCase) + 1}`,
           $time: testCaseTime,
           $assertions: testCase.expectationResults.length,
           failure: testCase.expectationResults
@@ -374,15 +389,15 @@ export async function junitFormat(details: AgentTestResultsResponse): Promise<st
   return `<?xml version="1.0" encoding="UTF-8"?>\n${suites}`.trim();
 }
 
-export async function tapFormat(details: AgentTestResultsResponse): Promise<string> {
+async function tapFormat(results: AgentTestResultsResponse): Promise<string> {
   const lines: string[] = [];
   let expectationCount = 0;
-  for (const testCase of details.testSet.testCases) {
+  for (const testCase of results.testSet.testCases) {
     for (const result of testCase.expectationResults) {
       const status = result.result === 'PASS' ? 'ok' : 'not ok';
       expectationCount++;
       lines.push(
-        `${status} ${expectationCount} ${details.testSet.name}.${details.testSet.testCases.indexOf(testCase) + 1}`
+        `${status} ${expectationCount} ${results.testSet.name}.${results.testSet.testCases.indexOf(testCase) + 1}`
       );
       if (status === 'not ok') {
         lines.push('  ---');
diff --git a/src/index.ts b/src/index.ts
index 0d8f5ca..60534a1 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -16,10 +16,7 @@ export {
 export { Agent, AgentCreateLifecycleStages } from './agent';
 export {
   AgentTester,
-  humanFormat,
-  jsonFormat,
-  junitFormat,
-  tapFormat,
+  convertTestResultsToFormat,
   type AgentTestResultsResponse,
   type AgentTestStartResponse,
   type AgentTestStatusResponse,
diff --git a/test/agentTester.test.ts b/test/agentTester.test.ts
index 6d1e2c0..c8ed309 100644
--- a/test/agentTester.test.ts
+++ b/test/agentTester.test.ts
@@ -8,7 +8,7 @@ import { readFile } from 'node:fs/promises';
 import { expect } from 'chai';
 import { MockTestOrgData, TestContext } from '@salesforce/core/testSetup';
 import { Connection } from '@salesforce/core';
-import { AgentTestResultsResponse, AgentTester, humanFormat, junitFormat, tapFormat } from '../src/agentTester';
+import { AgentTestResultsResponse, AgentTester, convertTestResultsToFormat } from '../src/agentTester';
 
 describe('AgentTester', () => {
   const $$ = new TestContext();
@@ -82,20 +82,20 @@ describe('AgentTester', () => {
   });
 });
 
-describe('humanFormat', () => {
+describe('human format', () => {
   it('should transform test results to human readable format', async () => {
     const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json', 'utf8');
     const input = JSON.parse(raw) as AgentTestResultsResponse;
-    const output = await humanFormat(input);
+    const output = await convertTestResultsToFormat(input, 'human');
     expect(output).to.be.ok;
   });
 });
 
-describe('junitFormatter', () => {
+describe('junit formatter', () => {
   it('should transform test results to JUnit format', async () => {
     const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json', 'utf8');
     const input = JSON.parse(raw) as AgentTestResultsResponse;
-    const output = await junitFormat(input);
+    const output = await convertTestResultsToFormat(input, 'junit');
     expect(output).to.deep.equal(`<?xml version="1.0" encoding="UTF-8"?>
 <testsuites name="Copilot_for_Salesforce" tests="2" failures="1" time="20000">
   <property name="status" value="COMPLETED"></property>
@@ -110,11 +110,11 @@ describe('junitFormatter', () => {
   });
 });
 
-describe('tapFormatter', () => {
+describe('tap formatter', () => {
   it('should transform test results to TAP format', async () => {
     const raw = await readFile('./test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json', 'utf8');
     const input = JSON.parse(raw) as AgentTestResultsResponse;
-    const output = await tapFormat(input);
+    const output = await convertTestResultsToFormat(input, 'tap');
     expect(output).to.deep.equal(`Tap Version 14
 1..6
 ok 1 CRM_Sanity_v1.1
diff --git a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json
index a8a42a4..4e00471 100644
--- a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json
+++ b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json
@@ -9,7 +9,7 @@
     "testCases": [
       {
         "status": "COMPLETED",
-        "utterence": "Summarize account Acme",
+        "utterance": "Summarize account Acme",
         "startTime": "2024-11-28T12:00:10Z",
         "endTime": "2024-11-28T12:00:20Z",
         "generatedData": {
@@ -68,7 +68,7 @@
       {
         "status": "COMPLETED",
         "startTime": "2024-11-28T12:00:30Z",
-        "utterence": "Summarize the open cases and Activities of acme from sep to nov 2024",
+        "utterance": "Summarize the open cases and Activities of acme from sep to nov 2024",
         "endTime": "2024-11-28T12:00:40Z",
         "generatedData": {
           "type": "AGENT",

From 7a1641c5a53e2abb1e15372405a20415f66bf41e Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Fri, 17 Jan 2025 09:47:24 -0700
Subject: [PATCH 5/6] chore: more API changes

---
 src/agentTester.ts                            | 64 ++++++++++++-------
 src/maybe-mock.ts                             |  2 +-
 test/agentTester.test.ts                      |  4 +-
 .../1.json                                    |  2 +-
 .../2.json                                    |  2 +-
 .../3.json                                    |  2 +-
 ...tions_runs_4KBSM000000003F4AQ_results.json |  6 +-
 7 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/src/agentTester.ts b/src/agentTester.ts
index 32129ba..da8fefe 100644
--- a/src/agentTester.ts
+++ b/src/agentTester.ts
@@ -9,7 +9,7 @@ import { Duration, env } from '@salesforce/kit';
 import ansis from 'ansis';
 import { MaybeMock } from './maybe-mock';
 
-export type TestStatus = 'NEW' | 'IN_PROGRESS' | 'COMPLETED' | 'ERROR';
+export type TestStatus = 'New' | 'InProgress' | 'Completed' | 'Error';
 
 export type AgentTestStartResponse = {
   aiEvaluationId: string;
@@ -41,7 +41,7 @@ export type TestCaseResult = {
     actualValue: string;
     expectedValue: string;
     score: number;
-    result: 'PASS' | 'FAIL';
+    result: 'PASS' | 'FAILURE';
     metricLabel: 'Accuracy' | 'Precision';
     metricExplainability: string;
     status: TestStatus;
@@ -121,16 +121,38 @@ export class AgentTester {
     const lifecycle = Lifecycle.getInstance();
     const client = await PollingClient.create({
       poll: async (): Promise<StatusResult> => {
-        const resultsResponse = await this.results(jobId);
-        const totalTestCases = resultsResponse.testSet.testCases.length;
-        const passingTestCases = resultsResponse.testSet.testCases.filter(
-          (tc) => tc.status === 'COMPLETED' && tc.expectationResults.every((r) => r.result === 'PASS')
-        ).length;
-        const failingTestCases = resultsResponse.testSet.testCases.filter(
-          (tc) => ['ERROR', 'COMPLETED'].includes(tc.status) && tc.expectationResults.some((r) => r.result === 'FAIL')
-        ).length;
-
-        if (resultsResponse.status.toLowerCase() === 'completed') {
+        const statusResponse = await this.status(jobId);
+        // eslint-disable-next-line no-console
+        console.log('*'.repeat(process.stdout.columns));
+        // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-var-requires
+        const util = require('node:util');
+        // eslint-disable-next-line no-console, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-call
+        console.log(util.inspect(statusResponse, { depth: 6 }));
+        // eslint-disable-next-line no-console
+        console.log('*'.repeat(process.stdout.columns));
+        if (statusResponse.status.toLowerCase() !== 'new') {
+          const resultsResponse = await this.results(jobId);
+          const totalTestCases = resultsResponse.testSet.testCases.length;
+          const passingTestCases = resultsResponse.testSet.testCases.filter(
+            (tc) => tc.status.toLowerCase() === 'completed' && tc.expectationResults.every((r) => r.result === 'PASS')
+          ).length;
+          const failingTestCases = resultsResponse.testSet.testCases.filter(
+            (tc) =>
+              ['error', 'completed'].includes(tc.status.toLowerCase()) &&
+              tc.expectationResults.some((r) => r.result === 'FAILURE')
+          ).length;
+
+          if (resultsResponse.status.toLowerCase() === 'completed') {
+            await lifecycle.emit('AGENT_TEST_POLLING_EVENT', {
+              jobId,
+              status: resultsResponse.status,
+              totalTestCases,
+              failingTestCases,
+              passingTestCases,
+            });
+            return { payload: resultsResponse, completed: true };
+          }
+
           await lifecycle.emit('AGENT_TEST_POLLING_EVENT', {
             jobId,
             status: resultsResponse.status,
@@ -138,16 +160,8 @@ export class AgentTester {
             failingTestCases,
             passingTestCases,
           });
-          return { payload: resultsResponse, completed: true };
         }
 
-        await lifecycle.emit('AGENT_TEST_POLLING_EVENT', {
-          jobId,
-          status: resultsResponse.status,
-          totalTestCases,
-          failingTestCases,
-          passingTestCases,
-        });
         return { completed: false };
       },
       frequency: Duration.milliseconds(frequency),
@@ -315,12 +329,12 @@ async function humanFormat(details: AgentTestResultsResponse): Promise<string> {
 
   const resultsTable = makeSimpleTable(results, ansis.bold.blue('Test Results'));
 
-  const failedTestCases = details.testSet.testCases.filter((tc) => tc.status === 'ERROR');
+  const failedTestCases = details.testSet.testCases.filter((tc) => tc.status.toLowerCase() === 'error');
   const failedTestCasesObj = Object.fromEntries(
     Object.entries(failedTestCases).map(([, tc]) => [
       `Test Case #${failedTestCases.indexOf(tc) + 1}`,
       tc.expectationResults
-        .filter((r) => r.result === 'FAIL')
+        .filter((r) => r.result === 'FAILURE')
         .map((r) => humanFriendlyName(r.name))
         .join(', '),
     ])
@@ -345,7 +359,9 @@ async function junitFormat(results: AgentTestResultsResponse): Promise<string> {
 
   const testCount = results.testSet.testCases.length;
   const failureCount = results.testSet.testCases.filter(
-    (tc) => ['ERROR', 'COMPLETED'].includes(tc.status) && tc.expectationResults.some((r) => r.result === 'FAIL')
+    (tc) =>
+      ['error', 'completed'].includes(tc.status.toLowerCase()) &&
+      tc.expectationResults.some((r) => r.result === 'FAILURE')
   ).length;
   const time = results.testSet.testCases.reduce((acc, tc) => {
     if (tc.endTime && tc.startTime) {
@@ -376,7 +392,7 @@ async function junitFormat(results: AgentTestResultsResponse): Promise<string> {
           $assertions: testCase.expectationResults.length,
           failure: testCase.expectationResults
             .map((r) => {
-              if (r.result === 'FAIL') {
+              if (r.result === 'FAILURE') {
                 return { $message: r.errorMessage ?? 'Unknown error', $name: r.name };
               }
             })
diff --git a/src/maybe-mock.ts b/src/maybe-mock.ts
index 1f8d053..602d515 100644
--- a/src/maybe-mock.ts
+++ b/src/maybe-mock.ts
@@ -164,7 +164,7 @@ export class MaybeMock {
     this.logger.debug(`Making ${method} request to ${url}`);
     switch (method) {
       case 'GET':
-        return this.connection.requestGet<T>(url, { retry: { maxRetries: 3 } });
+        return this.connection.requestGet<T>(url, { retry: { maxRetries: 10 } });
       case 'POST':
         if (!body) {
           throw SfError.create({
diff --git a/test/agentTester.test.ts b/test/agentTester.test.ts
index c8ed309..1a8ac4b 100644
--- a/test/agentTester.test.ts
+++ b/test/agentTester.test.ts
@@ -45,7 +45,7 @@ describe('AgentTester', () => {
       const output = await tester.status('4KBSM000000003F4AQ');
       expect(output).to.be.ok;
       expect(output).to.deep.equal({
-        status: 'IN_PROGRESS',
+        status: 'InProgress',
         startTime: '2024-11-13T15:00:00.000Z',
       });
     });
@@ -98,7 +98,7 @@ describe('junit formatter', () => {
     const output = await convertTestResultsToFormat(input, 'junit');
     expect(output).to.deep.equal(`<?xml version="1.0" encoding="UTF-8"?>
 <testsuites name="Copilot_for_Salesforce" tests="2" failures="1" time="20000">
-  <property name="status" value="COMPLETED"></property>
+  <property name="status" value="Completed"></property>
   <property name="start-time" value="2024-11-28T12:00:00Z"></property>
   <property name="end-time" value="2024-11-28T12:00:48.56Z"></property>
   <testsuite name="CRM_Sanity_v1.1" time="10000" assertions="3"></testsuite>
diff --git a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ/1.json b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ/1.json
index daf2bbc..58716da 100644
--- a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ/1.json
+++ b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ/1.json
@@ -1,4 +1,4 @@
 {
-  "status": "IN_PROGRESS",
+  "status": "InProgress",
   "startTime": "2024-11-13T15:00:00.000Z"
 }
diff --git a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ/2.json b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ/2.json
index daf2bbc..58716da 100644
--- a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ/2.json
+++ b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ/2.json
@@ -1,4 +1,4 @@
 {
-  "status": "IN_PROGRESS",
+  "status": "InProgress",
   "startTime": "2024-11-13T15:00:00.000Z"
 }
diff --git a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ/3.json b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ/3.json
index d4f6503..88bd062 100644
--- a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ/3.json
+++ b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ/3.json
@@ -1,4 +1,4 @@
 {
-  "status": "COMPLETED",
+  "status": "Completed",
   "startTime": "2024-11-13T15:00:00.000Z"
 }
diff --git a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json
index 4e00471..704b480 100644
--- a/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json
+++ b/test/mocks/einstein_ai-evaluations_runs_4KBSM000000003F4AQ_results.json
@@ -1,5 +1,5 @@
 {
-  "status": "COMPLETED",
+  "status": "Completed",
   "startTime": "2024-11-28T12:00:00Z",
   "endTime": "2024-11-28T12:00:48.56Z",
   "errorMessage": null,
@@ -98,7 +98,7 @@
             "actualValue": "[\"IdentifyRecordByName\",\"QueryRecords\"]",
             "expectedValue": "[\"IdentifyRecordByName\",\"QueryRecords\",\"GetActivitiesTimeline\"]",
             "score": 0.5,
-            "result": "FAIL",
+            "result": "FAILURE",
             "metricLabel": "Precision",
             "metricExplainability": "Measures the precision of the result.",
             "status": "Completed",
@@ -112,7 +112,7 @@
             "actualValue": "It looks like I am unable to find the information you are looking for due to access restrictions. How else can I assist you?",
             "expectedValue": "Summary of open cases and activities associated with timeline",
             "score": 0.1,
-            "result": "FAIL",
+            "result": "FAILURE",
             "metricLabel": "Precision",
             "metricExplainability": "Measures the precision of the result.",
             "status": "Completed",

From 470fa525cba2a32c78f857b652edc3b1feabc47b Mon Sep 17 00:00:00 2001
From: Mike Donnalley <mdonnalley@salesforce.com>
Date: Fri, 17 Jan 2025 13:23:51 -0700
Subject: [PATCH 6/6] chore: remove console logs

---
 src/agentTester.ts | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/agentTester.ts b/src/agentTester.ts
index da8fefe..dcfdc79 100644
--- a/src/agentTester.ts
+++ b/src/agentTester.ts
@@ -122,14 +122,6 @@ export class AgentTester {
     const client = await PollingClient.create({
       poll: async (): Promise<StatusResult> => {
         const statusResponse = await this.status(jobId);
-        // eslint-disable-next-line no-console
-        console.log('*'.repeat(process.stdout.columns));
-        // eslint-disable-next-line @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-var-requires
-        const util = require('node:util');
-        // eslint-disable-next-line no-console, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-call
-        console.log(util.inspect(statusResponse, { depth: 6 }));
-        // eslint-disable-next-line no-console
-        console.log('*'.repeat(process.stdout.columns));
         if (statusResponse.status.toLowerCase() !== 'new') {
           const resultsResponse = await this.results(jobId);
           const totalTestCases = resultsResponse.testSet.testCases.length;