salesforcecli · cristiand391 · Jul 10, 2025
diff --git a/package.json b/package.json
@@ -54,16 +54,21 @@
     "zod": "^3.25.67"
   },
   "devDependencies": {
+    "@ai-sdk/google": "^1.2.22",
+    "@ai-sdk/openai": "^1.3.23",
     "@modelcontextprotocol/inspector": "^0.15.0",
     "@salesforce/cli-plugins-testkit": "^5.3.39",
     "@salesforce/dev-scripts": "11.0.2",
     "@types/node": "^22.15.32",
+    "ai": "^4.3.17",
     "eslint-config-salesforce-license": "^1.0.1",
     "eslint-plugin-sf-plugin": "^1.20.26",
     "oclif": "^4.20.1",
     "ts-node": "^10.9.2",
     "ts-patch": "^3.3.0",
-    "typescript": "^5.8.3"
+    "typescript": "^5.8.3",
+    "vitest": "^3.2.4",
+    "vitest-evals": "^0.3.0"
   },
   "publishConfig": {
     "access": "public"

diff --git a/test/evals/sf-query-org.eval.ts b/test/evals/sf-query-org.eval.ts
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import { describeEval } from 'vitest-evals';
+import { Factuality, TaskRunner } from './utils.js';
+
+// Eval: asks the agent for the names of the Property__c records, sorted ascending by name, and grades the
+// reply with the Factuality scorer against the expected list below.
+// NOTE(review): assumes the target org already contains these Property__c records — confirm the org setup.
+describeEval('SOQL queries', {
+  data: async () => [
+    {
+      input: 'List the name of the Property__c records in my org, ordered in ascending order by their name.',
+      expected: `The response should include these records:
+Architectural Details
+City Living
+Contemporary City Living
+Contemporary Luxury
+Heart of Harvard Square
+Modern City Living
+Quiet Retreat
+Seaport District Retreat
+Stunning Colonial
+Stunning Victorian
+Ultimate Sophistication
+Waterfront in the City
+`,
+      // NOTE(review): the commented-out list below looks like a leftover alternative expectation (possibly a
+      // deliberate mismatch used to check the scorer) — confirm whether it should be kept or removed.
+      //         expected: `The response should include these records:
+      // Sophisticated Urban Escape
+      // Metropolitan Elegance
+      // Vibrant City Sanctuary
+      // Downtown Dreamscape
+      // Sleek Urban Oasis
+      // Modern Metropole
+      // Luxe in the Loop
+      // `,
+    },
+  ],
+  task: TaskRunner(),
+  scorers: [Factuality()],
+  // Factuality scores a consistent superset answer (B) at 0.6; presumably this is the minimum passing
+  // score — confirm against the vitest-evals docs.
+  threshold: 0.6,
+  // Presumably milliseconds (30s) — confirm. The task spawns the MCP server and calls a model.
+  timeout: 30_000,
+});
diff --git a/test/evals/utils.ts b/test/evals/utils.ts
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import * as path from 'node:path';
+import { google } from '@ai-sdk/google';
+import { experimental_createMCPClient, generateObject, streamText, type LanguageModel } from 'ai';
+import { Experimental_StdioMCPTransport } from 'ai/mcp-stdio';
+import { z } from 'zod';
+
+// System prompt sent with every eval task run. It is meant to approximate the context an IDE-integrated
+// agent would have. For now the only IDE detail included is the currently open project directory, read
+// from the `SF_EVAL_PROMPT_PROJECT_DIR` environment variable (the current open file is not included).
+//
+// NOTE: if `SF_EVAL_PROMPT_PROJECT_DIR` is unset, the prompt will contain the literal text "undefined".
+const SYSTEM_PROMPT = `You are an assistant responsible for evaluating the results of calling various tools. 
+
+You are a general purpose LLM-based Agent. Your purpose is to answer the user's query using the tools provided.
+
+- You should ONLY use the tools available to answer the user's query.
+- Use as few tool calls as possible to get to the answer.
+- Using multiple tool calls to get to the answer is allowed when needed.
+
+The current open project dir is "${process.env.SF_EVAL_PROMPT_PROJECT_DIR}"
+`;
+
+// Model used by both `TaskRunner` and `Factuality` when the caller does not pass one.
+// Supported models: https://ai.google.dev/gemini-api/docs/models
+const defaultModel = google('gemini-2.5-flash');
+
+/**
+ * Creates an eval task that answers a prompt with `model`, using the tools exposed by this repo's MCP server.
+ *
+ * Each run starts `bin/run.js` through a stdio MCP transport (`-o DEFAULT_TARGET_ORG --no-telemetry`), hands
+ * the server's tools to the model (at most 10 steps, 1 retry), drains the stream and resolves with the
+ * generated text. The MCP client is always closed when the run ends, whether it succeeded or failed.
+ *
+ * @param model Language model that runs the task; defaults to `defaultModel`.
+ * @returns An async function that takes the user prompt and resolves with the model's text output.
+ *          Errors are logged to the console and rethrown.
+ */
+export function TaskRunner(model: LanguageModel = defaultModel) {
+  return async function TaskRun(input: string) {
+    const mcpClient = await experimental_createMCPClient({
+      transport: new Experimental_StdioMCPTransport({
+        command: 'node',
+        args: [path.join(import.meta.dirname, '../../bin/run.js'), '-o', 'DEFAULT_TARGET_ORG', '--no-telemetry'],
+      }),
+    });
+
+    try {
+      // Listing tools happens inside the `try` so the client (and the server it started) is still closed
+      // by the `finally` below if this call fails.
+      const tools = await mcpClient.tools();
+
+      const result = streamText({
+        model,
+        tools,
+        system: SYSTEM_PROMPT,
+        prompt: input,
+        maxRetries: 1,
+        maxSteps: 10,
+        experimental_telemetry: {
+          isEnabled: false,
+        },
+        onError: (error) => {
+          // eslint-disable-next-line no-console
+          console.error(error);
+        },
+      });
+
+      // The stream has to be consumed before `result.text` resolves; the individual chunks are not needed.
+      // TODO: we don't need text streaming here, maybe switch to `generateText`?
+      // eslint-disable-next-line
+      for await (const _ of result.fullStream) {
+      }
+
+      return await result.text;
+    } catch (error) {
+      // eslint-disable-next-line no-console
+      console.error(error);
+      throw error;
+    } finally {
+      await mcpClient.close();
+    }
+  };
+}
+
+/**
+ * Builds an LLM-graded factuality scorer on top of the `ai` SDK, following the implementation in `autoevals`.
+ *
+ * The grader model classifies the submission relative to the expected answer as one of A-E and that letter
+ * is mapped to a score: A (consistent subset) 0.4, B (consistent superset) 0.6, C (same details) 1,
+ * D (disagreement) 0, E (differences that don't matter for factuality) 1.
+ *
+ * ```
+ * import { openai } from "@ai-sdk/openai";
+ *
+ * scorers: [Factuality(openai("gpt-4o"))]
+ * ```
+ *
+ * @param model Model used as the grader; defaults to `defaultModel`.
+ * @returns An async scorer resolving to `{ score, metadata: { rationale } }`.
+ */
+export function Factuality(model: LanguageModel = defaultModel) {
+  // Numeric score for each grade letter the grader can select.
+  const scoreByGrade = { A: 0.4, B: 0.6, C: 1, D: 0, E: 1 };
+
+  // Structured output requested from the grader model.
+  const verdictSchema = z.object({
+    answer: z.enum(['A', 'B', 'C', 'D', 'E']).describe('Your selection.'),
+    rationale: z.string().describe('Why you chose this answer. Be very detailed.'),
+  });
+
+  // TODO: remove function wrapper
+  // eslint-disable-next-line @typescript-eslint/no-shadow
+  return async function Factuality(opts: { input: string; output: string; expected?: string }) {
+    /**
+     * Prompt implementation from `autoevals`:
+     *
+     * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
+     */
+    const prompt = `
+        You are comparing a submitted answer to an expert answer on a given question. Here is the data:
+
+        [BEGIN DATA]
+        ************
+        [Question]: ${opts.input}
+        ************
+        [Expert]: ${opts.expected}
+        ************
+        [Submission]: ${opts.output}
+        ************
+        [END DATA]
+
+        Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation, or overall structure.
+
+        The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:
+
+        (A) The submitted answer is a subset of the expert answer and is fully consistent with it.
+        (B) The submitted answer is a superset of the expert answer and is fully consistent with it.
+        (C) The submitted answer contains all the same details as the expert answer.
+        (D) There is a disagreement between the submitted answer and the expert answer.
+        (E) The answers differ, but these differences don't matter from the perspective of factuality.
+      `;
+
+    const { object: verdict } = await generateObject({ model, prompt, schema: verdictSchema });
+    const { answer, rationale } = verdict;
+
+    return { score: scoreByGrade[answer], metadata: { rationale } };
+  };
+}
diff --git a/test/vitest.config.ts b/test/vitest.config.ts
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2025, Salesforce, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import { defineConfig } from 'vitest/config';
+
+// Vitest configuration for the evals: collects only `*.eval.*` files and reports through `vitest-evals`.
+export default defineConfig({
+  test: {
+    // Only files with an `.eval.<ext>` suffix are picked up.
+    // NOTE(review): presumably this keeps regular unit tests out of eval runs — confirm.
+    include: ['**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}'],
+    // Reporter module provided by the `vitest-evals` package.
+    reporters: ['vitest-evals/reporter'],
+  },
+});