Skip to content

[WIP] test: eval testing setup W-18964528 #102

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,21 @@
"zod": "^3.25.67"
},
"devDependencies": {
"@ai-sdk/google": "^1.2.22",
"@ai-sdk/openai": "^1.3.23",
"@modelcontextprotocol/inspector": "^0.15.0",
"@salesforce/cli-plugins-testkit": "^5.3.39",
"@salesforce/dev-scripts": "11.0.2",
"@types/node": "^22.15.32",
"ai": "^4.3.17",
"eslint-config-salesforce-license": "^1.0.1",
"eslint-plugin-sf-plugin": "^1.20.26",
"oclif": "^4.20.1",
"ts-node": "^10.9.2",
"ts-patch": "^3.3.0",
"typescript": "^5.8.3"
"typescript": "^5.8.3",
"vitest": "^3.2.4",
"vitest-evals": "^0.3.0"
},
"publishConfig": {
"access": "public"
Expand Down
52 changes: 52 additions & 0 deletions test/evals/sf-query-org.eval.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Copyright 2025, Salesforce, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { describeEval } from 'vitest-evals';
import { Factuality, TaskRunner } from './utils.js';

// Natural-language request handed to the task under evaluation.
const PROPERTY_NAMES_REQUEST =
  'List the name of the Property__c records in my org, ordered in ascending order by their name.';

// Reference answer handed to the scorer as `expected` for the request above.
const PROPERTY_NAMES_EXPECTED = `The response should include these records:
Architectural Details
City Living
Contemporary City Living
Contemporary Luxury
Heart of Harvard Square
Modern City Living
Quiet Retreat
Seaport District Retreat
Stunning Colonial
Stunning Victorian
Ultimate Sophistication
Waterfront in the City
`;

// Alternative reference answer, currently disabled:
// expected: `The response should include these records:
// Sophisticated Urban Escape
// Metropolitan Elegance
// Vibrant City Sanctuary
// Downtown Dreamscape
// Sleek Urban Oasis
// Modern Metropole
// Luxe in the Loop
// `,

// Eval suite with a single SOQL case: the task comes from `TaskRunner()` and the
// answer is graded by the `Factuality()` scorer, both imported from './utils.js'.
// NOTE(review): `threshold` is presumably the minimum passing score and `timeout`
// is presumably in milliseconds — confirm against the vitest-evals documentation.
describeEval('SOQL queries', {
  data: async () => [
    {
      input: PROPERTY_NAMES_REQUEST,
      expected: PROPERTY_NAMES_EXPECTED,
    },
  ],
  task: TaskRunner(),
  scorers: [Factuality()],
  threshold: 0.6,
  timeout: 30_000,
});
148 changes: 148 additions & 0 deletions test/evals/utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
/*
* Copyright 2025, Salesforce, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import * as path from 'node:path';
import { google } from '@ai-sdk/google';
import { experimental_createMCPClient, generateObject, streamText, type LanguageModel } from 'ai';
import { Experimental_StdioMCPTransport } from 'ai/mcp-stdio';
import { z } from 'zod';

// System prompt sent with every eval task. It is meant to approximate the context an
// IDE-hosted agent would have. Currently the only piece of IDE context included is:
//
// * Current open project directory (read from the SF_EVAL_PROMPT_PROJECT_DIR env var)
//
// NOTE: the env var is interpolated once, when this module is loaded. If it is not set,
// the prompt will contain the literal text "undefined" as the project dir.
const SYSTEM_PROMPT = `You are an assistant responsible for evaluating the results of calling various tools.

You are a general purpose LLM-based Agent. Your purpose is to answer the user's query using the tools provided.

- You should ONLY use the tools available to answer the user's query.
- Use as few tool calls as possible to get to the answer.
- Using multiple tool calls to get to the answer is allowed when needed.

The current open project dir is "${process.env.SF_EVAL_PROMPT_PROJECT_DIR}"
`;

// Supported models: https://ai.google.dev/gemini-api/docs/models
const DEFAULT_MODEL_ID = 'gemini-2.5-flash';

// Model used by `TaskRunner` and `Factuality` when the caller does not supply one.
const defaultModel = google(DEFAULT_MODEL_ID);

/**
 * Creates the eval task function.
 *
 * Each invocation of the returned function creates an MCP client over a stdio transport
 * configured to run `node ../../bin/run.js -o DEFAULT_TARGET_ORG --no-telemetry` (path
 * resolved relative to this file), exposes that client's tools to the model, sends the
 * input as the prompt together with `SYSTEM_PROMPT`, and resolves with the final text.
 *
 * @param model Language model that answers the prompt; defaults to `defaultModel`.
 * @returns An async function taking the user prompt and resolving to the model's text output.
 * @throws Re-throws (after logging) any error raised while listing tools or consuming the
 *   model response. The MCP client is closed in every case once it has been created.
 */
export function TaskRunner(model: LanguageModel = defaultModel) {
  return async function TaskRun(input: string) {
    const mcpClient = await experimental_createMCPClient({
      transport: new Experimental_StdioMCPTransport({
        command: 'node',
        args: [path.join(import.meta.dirname, '../../bin/run.js'), '-o', 'DEFAULT_TARGET_ORG', '--no-telemetry'],
      }),
    });

    try {
      // Listed inside the `try` so that a failure here still reaches `finally` and the
      // client is closed instead of being leaked.
      const tools = await mcpClient.tools();

      const result = streamText({
        model,
        tools,
        system: SYSTEM_PROMPT,
        prompt: input,
        maxRetries: 1,
        maxSteps: 10,
        experimental_telemetry: {
          isEnabled: false,
        },
        onError: (error) => {
          // eslint-disable-next-line no-console
          console.error(error);
        },
      });

      // Drain the stream; only the final text is used.
      // TODO: we don't need text streaming here, maybe switch to `generateText`?
      // eslint-disable-next-line
      for await (const _ of result.fullStream) {
      }

      return await result.text;
    } catch (error) {
      // eslint-disable-next-line no-console
      console.error(error);
      throw error;
    } finally {
      await mcpClient.close();
    }
  };
}

/**
 * LLM-graded factuality scorer built on the `ai` SDK, following the implementation in `autoevals`.
 *
 * The grading model is asked to classify the submission against the expected answer as one of
 * the options A-E; that choice is mapped to a numeric score and the model's rationale is
 * returned as metadata.
 *
 * ```
 * import { openai } from "@ai-sdk/openai";
 *
 * scorers: [Factuality(openai("gpt-4o"))]
 * ```
 *
 * @param model Language model used as the grader; defaults to `defaultModel`.
 * @returns An async scorer taking `{ input, output, expected? }` and resolving to
 *   `{ score, metadata: { rationale } }`.
 */
export function Factuality(model: LanguageModel = defaultModel) {
  // TODO: remove function wrapper
  // eslint-disable-next-line @typescript-eslint/no-shadow
  return async function Factuality(opts: { input: string; output: string; expected?: string }) {
    const { input, output, expected } = opts;

    // Structured verdict the grading model must return.
    const verdictSchema = z.object({
      answer: z.enum(['A', 'B', 'C', 'D', 'E']).describe('Your selection.'),
      rationale: z.string().describe('Why you chose this answer. Be very detailed.'),
    });

    /**
     * Prompt implementation from `autoevals`:
     *
     * {@link https://github.com/braintrustdata/autoevals/blob/5aa20a0a9eb8fc9e07e9e5722ebf71c68d082f32/templates/factuality.yaml}
     */
    const gradingPrompt = `
You are comparing a submitted answer to an expert answer on a given question. Here is the data:

[BEGIN DATA]
************
[Question]: ${input}
************
[Expert]: ${expected}
************
[Submission]: ${output}
************
[END DATA]

Compare the factual content of the submitted answer with the expert answer. Ignore any differences in style, grammar, or punctuation, or overall structure.

The submitted answer may either be a subset or superset of the expert answer, or it may conflict with it. Determine which case applies. Answer the question by selecting one of the following options:

(A) The submitted answer is a subset of the expert answer and is fully consistent with it.
(B) The submitted answer is a superset of the expert answer and is fully consistent with it.
(C) The submitted answer contains all the same details as the expert answer.
(D) There is a disagreement between the submitted answer and the expert answer.
(E) The answers differ, but these differences don't matter from the perspective of factuality.
`;

    const { object: verdict } = await generateObject({
      model,
      prompt: gradingPrompt,
      schema: verdictSchema,
    });

    // Numeric score assigned to each option the grader can choose.
    const scoreByAnswer = {
      A: 0.4,
      B: 0.6,
      C: 1,
      D: 0,
      E: 1,
    };

    return {
      score: scoreByAnswer[verdict.answer],
      metadata: {
        rationale: verdict.rationale,
      },
    };
  };
}
23 changes: 23 additions & 0 deletions test/vitest.config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* Copyright 2025, Salesforce, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { defineConfig } from 'vitest/config';

// Test-file globs: any file whose name ends in `.eval.` followed by one of the listed extensions.
const evalFileGlobs = ['**/*.eval.{js,mjs,cjs,ts,mts,cts,jsx,tsx}'];

// Reporter module provided by the `vitest-evals` package.
const evalReporters = ['vitest-evals/reporter'];

// Vitest configuration for the eval suites.
export default defineConfig({
  test: {
    include: evalFileGlobs,
    reporters: evalReporters,
  },
});
Loading
Loading