Skip to content

Commit 64f8892

Browse files
authored
Merge pull request #90 from asamal4/optional-tool
Ability to set alternate tool calls for eval
2 parents f8ba383 + ec5f63d commit 64f8892

File tree

8 files changed

+800
-24
lines changed

8 files changed

+800
-24
lines changed

README.md

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ embedding:
291291
| `attachments` | list[string] | ❌ | Attachments | ❌ |
292292
| `expected_response` | string | 📋 | Expected response for comparison | ❌ |
293293
| `expected_intent` | string | 📋 | Expected intent for intent evaluation| ❌ |
294-
| `expected_tool_calls` | list[list[dict]] | 📋 | Expected tool call sequences | ❌ |
294+
| `expected_tool_calls` | list[list[list[dict]]] | 📋 | Expected tool call sequences (multiple alternative sets) | ❌ |
295295
| `tool_calls` | list[list[dict]] | ❌ | Actual tool calls from API | ✅ (if API enabled) |
296296
| `verify_script` | string | 📋 | Path to verification script | ❌ |
297297
| `turn_metrics` | list[string] | ❌ | Turn-specific metrics to evaluate | ❌ |
@@ -302,7 +302,7 @@ embedding:
302302
Examples
303303
> - `expected_response`: Required for `custom:answer_correctness`
304304
> - `expected_intent`: Required for `custom:intent_eval`
305-
> - `expected_tool_calls`: Required for `custom:tool_eval`
305+
> - `expected_tool_calls`: Required for `custom:tool_eval` (multiple alternative sets format)
306306
> - `verify_script`: Required for `script:action_eval` (used when API is enabled)
307307
> - `response`: Required for most metrics (auto-populated if API enabled)
308308

@@ -314,15 +314,37 @@ Examples
314314
| `[]` (empty list) | Skip evaluation for this turn |
315315
| `["metric1", ...]` | Use specified metrics only |
316316

317+
#### Tool Evaluation
318+
319+
The `custom:tool_eval` metric supports flexible matching with multiple alternative patterns:
320+
321+
- **Format**: `[[[tool_calls, ...]], [[tool_calls]], ...]` (list of list of list)
322+
- **Matching**: Tries each alternative until one matches
323+
- **Use Cases**: Optional tools, multiple approaches, default arguments, skip scenarios
324+
- **Empty Sets**: `[]` represents "no tools" and must come after primary alternatives
325+
317326
#### Tool Call Structure
318327

319328
```yaml
329+
# Multiple alternative sets format: [[[tool_calls, ...]], [[tool_calls]], ...]
320330
expected_tool_calls:
321-
-
322-
- tool_name: oc_get # Tool name
323-
arguments: # Tool arguments
324-
kind: pod
325-
name: openshift-light* # Regex patterns supported for flexible matching
331+
- # Alternative 1: Primary approach
332+
- # Sequence 1
333+
- tool_name: oc_get
334+
arguments:
335+
kind: pod
336+
name: openshift-light* # Regex patterns supported
337+
- # Sequence 2 (if multiple parallel tool calls needed)
338+
- tool_name: oc_describe
339+
arguments:
340+
kind: pod
341+
- # Alternative 2: Different approach
342+
- # Sequence 1
343+
- tool_name: kubectl_get
344+
arguments:
345+
resource: pods
346+
- # Alternative 3: Skip scenario (optional)
347+
[] # When model has information from previous conversation
326348
```
327349

328350
#### Script-Based Evaluations

src/lightspeed_evaluation/core/metrics/custom/tool_eval.py

Lines changed: 44 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,26 +8,26 @@
88

99

1010
def evaluate_tool_calls(
11-
expected: list[list[dict[str, Any]]], actual: list[list[dict[str, Any]]]
11+
expected: list[list[list[dict[str, Any]]]],
12+
actual: list[list[dict[str, Any]]],
1213
) -> tuple[bool, str]:
1314
"""Evaluate tool calls using the custom:tool_eval metric.
1415
1516
Args:
16-
expected: Expected tool calls structure (list[list[dict[str, Any]]])
17-
actual: Actual tool calls from API response (list[list[dict[str, Any]]])
17+
expected: Expected tool calls structure (with alternatives)
18+
actual: Actual tool calls from API response
1819
1920
Returns:
2021
tuple: (success: bool, details: str)
2122
"""
2223
try:
23-
success = compare_tool_calls(expected, actual)
24-
25-
if success:
26-
details = "Tool calls match expected structure and arguments"
27-
else:
28-
details = "Tool calls do not match expected structure or arguments"
24+
# Try each set until one matches
25+
for i, expected_set in enumerate(expected):
26+
if compare_tool_calls(expected_set, actual):
27+
return _create_success_message(i, expected_set)
2928

30-
return success, details
29+
# If all sets fail, return failure status & message
30+
return _create_failure_message(expected, actual)
3131

3232
except (AttributeError, TypeError, ValueError) as e:
3333
logger.error("Error during tool evaluation: %s", e)
@@ -143,6 +143,40 @@ def _compare_tool_arguments(expected: dict[str, Any], actual: dict[str, Any]) ->
143143
return True
144144

145145

146+
def _create_success_message(
147+
index: int, expected_set: list[list[dict[str, Any]]]
148+
) -> tuple[bool, str]:
149+
"""Create success message based on match type."""
150+
pattern_type = "Primary pattern" if index == 0 else f"Alternative {index + 1}"
151+
152+
# Determine message based on what matched
153+
if len(expected_set) == 0:
154+
# Empty alternative matched - index 0 can never be empty due to constraints
155+
message = "No tool calls made (valid alternate skip scenario)"
156+
else:
157+
message = "Tool calls match expected structure and arguments"
158+
159+
return True, f"{pattern_type} matched: {message}"
160+
161+
162+
def _create_failure_message(
163+
expected: list[list[list[dict[str, Any]]]], actual: list[list[dict[str, Any]]]
164+
) -> tuple[bool, str]:
165+
"""Create failure message with helpful context."""
166+
# If we reach here, none of the alternatives matched
167+
168+
if len(actual) == 0:
169+
return (
170+
False,
171+
"No actual tool calls made and this is not set as an expected alternative",
172+
)
173+
174+
return (
175+
False,
176+
f"Tool calls made but didn't match any of the {len(expected)} expected pattern(s)",
177+
)
178+
179+
146180
def format_tool_calls_for_logging(tool_calls: list[list[dict[str, Any]]]) -> str:
147181
"""Format tool calls for logging purposes."""
148182
if not tool_calls:

src/lightspeed_evaluation/core/models/data.py

Lines changed: 133 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,8 @@ class TurnData(BaseModel):
5656
expected_response: Optional[str] = Field(
5757
default=None, min_length=1, description="Expected response for comparison"
5858
)
59-
expected_tool_calls: Optional[list[list[dict[str, Any]]]] = Field(
60-
default=None, description="Expected tool call sequences"
59+
expected_tool_calls: Optional[list[list[list[dict[str, Any]]]]] = Field(
60+
default=None, description="Expected tool call sequences (with alternatives)"
6161
)
6262
expected_intent: Optional[str] = Field(
6363
default=None, min_length=1, description="Expected intent for intent evaluation"
@@ -93,19 +93,145 @@ def validate_turn_metrics(cls, v: Optional[list[str]]) -> Optional[list[str]]:
9393
@classmethod
def validate_expected_tool_calls(
    cls, v: Optional[Any]
) -> Optional[list[list[list[dict[str, Any]]]]]:
    """Normalize and validate ``expected_tool_calls``.

    Accepts either the legacy single-set layout
    ``[[{tool_name, arguments}, ...], ...]``, which is wrapped into the
    multi-set layout for backward compatibility, or the multi-set layout
    ``[[[{tool_name, arguments}, ...], ...], [...], ...]``, which is used
    as-is. Always returns data in the multi-set layout, or ``None`` when
    the field is unset.

    Raises:
        ValueError: If the value is not a list or fails structural checks.
    """
    if v is None:
        return None
    if not isinstance(v, list):
        raise ValueError("Expected tool calls must be a list")

    # Wrap legacy single-set input so downstream code only ever sees the
    # list-of-alternatives layout.
    normalized = cls._ensure_multiple_sets_format(v)
    return cls._validate_multiple_sets(normalized)

# Future backward compatibility removal (minimal changes):
# 1. Delete: _ensure_multiple_sets_format() and _is_single_set_format()
# 2. Replace above with: return cls._validate_multiple_sets(v)
123+
124+
@classmethod
def _ensure_multiple_sets_format(cls, v: list) -> list[list[list[dict[str, Any]]]]:
    """Wrap legacy single-set data into the multi-set layout (backward compat).

    Single set ``[[tool1, tool2], [tool3]]`` becomes
    ``[[[tool1, tool2], [tool3]]]``; data already in the multi-set layout
    is returned unchanged.
    """
    return [v] if cls._is_single_set_format(v) else v
133+
134+
@classmethod
def _validate_multiple_sets(
    cls, data: list[list[list[dict[str, Any]]]]
) -> list[list[list[dict[str, Any]]]]:
    """Validate data already in the multi-set layout and return a cleaned copy.

    Raises:
        ValueError: On empty inner sequences, malformed tool calls, or
            misuse of empty (skip) alternatives.
    """
    # Empty inner sequences are never legal, in any alternative.
    cls._reject_empty_sequences(data)

    # Normalize each alternative's tool-call sequences.
    validated = [cls._validate_tool_call_sequences(alt) for alt in data]

    # Enforce ordering/uniqueness rules for empty (skip) alternatives.
    cls._validate_empty_set_constraints(validated)
    return validated
151+
152+
@classmethod
def _is_single_set_format(cls, v: list) -> bool:
    """Return True when *v* uses the legacy single-set layout (backward compat)."""
    # An empty list is treated as the (empty) single-set form.
    if not v:
        return True

    head = v[0]

    # [tool1, tool2, ...] — dicts at the top level can only be single-set.
    if isinstance(head, dict):
        return True

    if not isinstance(head, list):
        return False

    if head:
        # Non-empty inner list: a dict inside means sequences of tool
        # calls (single set); a nested list means a list of alternatives.
        return isinstance(head[0], dict)

    # First element is []: several all-empty lists indicate multiple empty
    # alternatives (multi-set layout); otherwise assume single-set.
    all_empty = len(v) > 1 and all(isinstance(el, list) and not el for el in v)
    return not all_empty
178+
179+
@classmethod
def _reject_empty_sequences(cls, data: list[list[list[dict[str, Any]]]]) -> None:
    """Raise ValueError if any alternative contains an empty sequence.

    The supported way to express "no tools" is an empty alternative, not an
    empty sequence inside an alternative.
    """
    for alt_idx, alternative in enumerate(data):
        for seq_idx, sequence in enumerate(alternative):
            if isinstance(sequence, list) and not sequence:
                raise ValueError(
                    f"Empty sequence at position {seq_idx} in alternative "
                    f"{alt_idx} is invalid. "
                    "Use [] for no tools instead."
                )
189+
190+
@classmethod
def _validate_empty_set_constraints(
    cls, result: list[list[list[dict[str, Any]]]]
) -> None:
    """Ensure empty (skip) alternatives appear only as a single trailing fallback.

    Raises:
        ValueError: If an empty alternative is first/only, or appears more
            than once.
    """
    if not result:
        return

    # The first alternative is the primary pattern and may never be empty.
    if not result[0]:
        if len(result) == 1:
            raise ValueError(
                "Empty set cannot be the only alternative. "
                "Empty alternatives should represent fallback scenarios, not primary options."
            )
        raise ValueError(
            "Empty set cannot be the first alternative. "
            "Empty alternatives should come after primary options."
        )

    # One empty fallback is enough; several are redundant.
    empty_count = sum(1 for alt in result if not alt)
    if empty_count > 1:
        raise ValueError(
            f"Found {empty_count} empty alternatives. "
            "Multiple empty alternatives are redundant - use only one as fallback."
        )
217+
218+
@classmethod
def _is_sequence_of_sequences(cls, seq: list) -> bool:
    """Return True when *seq* is non-empty and its first item is a list (not a dict)."""
    if not seq:
        return False
    return isinstance(seq[0], list)
222+
223+
@classmethod
224+
def _validate_tool_call_sequences(cls, v: Any) -> list[list[dict[str, Any]]]:
225+
"""Validate tool call sequences structure."""
101226
if not isinstance(v, list):
102227
raise ValueError("Expected tool calls must be a list of sequences")
103228

104-
result = []
229+
validated_sequences = []
105230
for i, sequence in enumerate(v):
106231
if not isinstance(sequence, list):
107232
raise ValueError(f"Sequence {i} must be a list")
108233

234+
# Empty sequences are already rejected by _reject_empty_sequences
109235
tool_calls = []
110236
for j, tool_call in enumerate(sequence):
111237
if not isinstance(tool_call, dict):
@@ -131,8 +257,8 @@ def validate_expected_tool_calls(
131257
}
132258
tool_calls.append(validated_tool_call)
133259

134-
result.append(tool_calls)
135-
return result
260+
validated_sequences.append(tool_calls)
261+
return validated_sequences
136262

137263

138264
class EvaluationData(BaseModel):
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Tests for metrics module."""
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Tests for custom metrics module."""

0 commit comments

Comments
 (0)