36 changes: 29 additions & 7 deletions README.md
@@ -291,7 +291,7 @@ embedding:
| `attachments` | list[string] | ❌ | Attachments | ❌ |
| `expected_response` | string | 📋 | Expected response for comparison | ❌ |
| `expected_intent` | string | 📋 | Expected intent for intent evaluation| ❌ |
| `expected_tool_calls` | list[list[dict]] | 📋 | Expected tool call sequences | ❌ |
| `expected_tool_calls` | list[list[list[dict]]] | 📋 | Expected tool call sequences (multiple alternative sets) | ❌ |
| `tool_calls` | list[list[dict]] | ❌ | Actual tool calls from API | ✅ (if API enabled) |
| `verify_script` | string | 📋 | Path to verification script | ❌ |
| `turn_metrics` | list[string] | ❌ | Turn-specific metrics to evaluate | ❌ |
@@ -302,7 +302,7 @@ embedding:
Examples
> - `expected_response`: Required for `custom:answer_correctness`
> - `expected_intent`: Required for `custom:intent_eval`
> - `expected_tool_calls`: Required for `custom:tool_eval`
> - `expected_tool_calls`: Required for `custom:tool_eval` (multiple alternative sets format)
> - `verify_script`: Required for `script:action_eval` (used when API is enabled)
> - `response`: Required for most metrics (auto-populated if API enabled)

@@ -314,15 +314,37 @@ Examples
| `[]` (empty list) | Skip evaluation for this turn |
| `["metric1", ...]` | Use specified metrics only |

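For example, a hedged YAML fragment: the surrounding turn fields are omitted, and the placement of `turn_metrics` inside a turn entry is assumed from the table above rather than copied from the repo.

```yaml
# Hypothetical turn entries; only turn_metrics is shown
- turn_metrics: []                    # skip evaluation for this turn
- turn_metrics: ["custom:tool_eval"]  # run only the listed metric
```
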
#### Tool Evaluation

The `custom:tool_eval` metric supports flexible matching against multiple alternative patterns:

- **Format**: `[[[tool_calls, ...]], [[tool_calls]], ...]` - a list of alternatives, each holding sequences of tool calls
- **Matching**: Alternatives are tried in order until the first one matches
- **Use Cases**: Optional tools, multiple approaches, default arguments, skip scenarios
- **Empty Sets**: `[]` represents "no tools" and must come after the primary alternatives

#### Tool Call Structure

```yaml
# Multiple alternative sets format: [[[tool_calls, ...]], [[tool_calls]], ...]
expected_tool_calls:
  -
    - tool_name: oc_get # Tool name
      arguments: # Tool arguments
        kind: pod
        name: openshift-light* # Regex patterns supported for flexible matching
  - # Alternative 1: Primary approach
    - # Sequence 1
      - tool_name: oc_get
        arguments:
          kind: pod
          name: openshift-light* # Regex patterns supported
    - # Sequence 2 (if multiple parallel tool calls needed)
      - tool_name: oc_describe
        arguments:
          kind: pod
  - # Alternative 2: Different approach
    - # Sequence 1
      - tool_name: kubectl_get
        arguments:
          resource: pods
  - # Alternative 3: Skip scenario (optional)
    [] # When model has information from previous conversation
```
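To make the nesting concrete, the structure can be inspected after YAML parsing. A minimal sketch, assuming PyYAML is available; the shortened snippet is illustrative, not copied from the repo:

```python
import yaml  # PyYAML, assumed available

snippet = """
expected_tool_calls:
  - # Alternative 1
    - - tool_name: oc_get
        arguments:
          kind: pod
          name: openshift-light*
  - # Alternative 2: skip scenario
    []
"""

data = yaml.safe_load(snippet)["expected_tool_calls"]
assert isinstance(data[0][0][0], dict)         # alternative -> sequence -> tool call
assert data[0][0][0]["tool_name"] == "oc_get"
assert data[1] == []                           # empty alternative: the "no tools" fallback
```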

#### Script-Based Evaluations
54 changes: 44 additions & 10 deletions src/lightspeed_evaluation/core/metrics/custom/tool_eval.py
@@ -8,26 +8,26 @@


def evaluate_tool_calls(
    expected: list[list[dict[str, Any]]], actual: list[list[dict[str, Any]]]
    expected: list[list[list[dict[str, Any]]]],
    actual: list[list[dict[str, Any]]],
) -> tuple[bool, str]:
    """Evaluate tool calls using the custom:tool_eval metric.

    Args:
        expected: Expected tool calls structure (list[list[dict[str, Any]]])
        actual: Actual tool calls from API response (list[list[dict[str, Any]]])
        expected: Expected tool calls structure (with alternatives)
        actual: Actual tool calls from API response

    Returns:
        tuple: (success: bool, details: str)
    """
    try:
        success = compare_tool_calls(expected, actual)

        if success:
            details = "Tool calls match expected structure and arguments"
        else:
            details = "Tool calls do not match expected structure or arguments"
        # Try each set until one matches
        for i, expected_set in enumerate(expected):
            if compare_tool_calls(expected_set, actual):
                return _create_success_message(i, expected_set)

        return success, details
        # If all sets fail, return failure status & message
        return _create_failure_message(expected, actual)

    except (AttributeError, TypeError, ValueError) as e:
        logger.error("Error during tool evaluation: %s", e)
@@ -143,6 +143,40 @@ def _compare_tool_arguments(expected: dict[str, Any], actual: dict[str, Any]) ->
    return True


def _create_success_message(
    index: int, expected_set: list[list[dict[str, Any]]]
) -> tuple[bool, str]:
    """Create success message based on match type."""
    pattern_type = "Primary pattern" if index == 0 else f"Alternative {index + 1}"

    # Determine message based on what matched
    if len(expected_set) == 0:
        # Empty alternative matched - index 0 can never be empty due to constraints
        message = "No tool calls made (valid alternate skip scenario)"
    else:
        message = "Tool calls match expected structure and arguments"

    return True, f"{pattern_type} matched: {message}"


def _create_failure_message(
    expected: list[list[list[dict[str, Any]]]], actual: list[list[dict[str, Any]]]
) -> tuple[bool, str]:
    """Create failure message with helpful context."""
    # If we reach here, none of the alternatives matched

    if len(actual) == 0:
        return (
            False,
            "No actual tool calls made and this is not set as an expected alternative",
        )

    return (
        False,
        f"Tool calls made but didn't match any of the {len(expected)} expected pattern(s)",
    )


def format_tool_calls_for_logging(tool_calls: list[list[dict[str, Any]]]) -> str:
    """Format tool calls for logging purposes."""
    if not tool_calls:
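Taken together, a hedged usage sketch of the new evaluation flow. The data is made up; the module path follows the file path above, and the skip case assumes `compare_tool_calls` treats an empty expected set as matching an empty actual list, as `_create_success_message` implies.

```python
from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls

expected = [
    [[{"tool_name": "oc_get", "arguments": {"kind": "pod"}}]],            # Alternative 1
    [[{"tool_name": "kubectl_get", "arguments": {"resource": "pods"}}]],  # Alternative 2
    [],                                                                   # Alternative 3: skip scenario
]

# One sequence containing one actual tool call, as the API would report it
actual = [[{"tool_name": "kubectl_get", "arguments": {"resource": "pods"}}]]

success, details = evaluate_tool_calls(expected, actual)
# success is True; details reads "Alternative 2 matched: ..."

success, details = evaluate_tool_calls(expected, [])
# The empty third alternative covers the no-tool-calls case
```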
140 changes: 133 additions & 7 deletions src/lightspeed_evaluation/core/models/data.py
@@ -56,8 +56,8 @@ class TurnData(BaseModel):
    expected_response: Optional[str] = Field(
        default=None, min_length=1, description="Expected response for comparison"
    )
    expected_tool_calls: Optional[list[list[dict[str, Any]]]] = Field(
        default=None, description="Expected tool call sequences"
    expected_tool_calls: Optional[list[list[list[dict[str, Any]]]]] = Field(
        default=None, description="Expected tool call sequences (with alternatives)"
    )
    expected_intent: Optional[str] = Field(
        default=None, min_length=1, description="Expected intent for intent evaluation"
@@ -93,19 +93,145 @@ def validate_turn_metrics(cls, v: Optional[list[str]]) -> Optional[list[str]]:
    @classmethod
    def validate_expected_tool_calls(
        cls, v: Optional[Any]
    ) -> Optional[list[list[dict[str, Any]]]]:
        """Validate expected tool calls when provided."""
    ) -> Optional[list[list[list[dict[str, Any]]]]]:
        """Validate expected tool calls when provided.

        Converts single set format to multiple sets format automatically for backward compatibility.

        Input formats:
        1. Single set: [[{tool_name, arguments}, ...], ...] -> Converted to multiple sets
        2. Multiple sets: [[[{tool_name, arguments}, ...], ...], [[...], ...], ...] -> Used as-is

        Output format:
        Multiple sets: [[[{tool_name, arguments}, ...], ...], [[...], ...], ...]
        """
        if v is None:
            return None

        if not isinstance(v, list):
            raise ValueError("Expected tool calls must be a list")

        # Ensure multiple sets format (backward compatibility)
        data = cls._ensure_multiple_sets_format(v)

        # Validate multiple sets format
        return cls._validate_multiple_sets(data)

    # Future backward compatibility removal (minimal changes):
    # 1. Delete: _ensure_multiple_sets_format() and _is_single_set_format()
    # 2. Replace above with: return cls._validate_multiple_sets(v)

    @classmethod
    def _ensure_multiple_sets_format(cls, v: list) -> list[list[list[dict[str, Any]]]]:
        """Ensure data is in multiple sets format (backward compatibility)."""
        # Convert single set format to multiple sets format if needed
        if cls._is_single_set_format(v):
            # Single set: [[tool1, tool2], [tool3]] -> Multiple sets: [[[tool1, tool2], [tool3]]]
            return [v]
        # Already multiple sets: [[[tool1]], [[tool2]]] -> Keep as-is
        return v

    @classmethod
    def _validate_multiple_sets(
        cls, data: list[list[list[dict[str, Any]]]]
    ) -> list[list[list[dict[str, Any]]]]:
        """Validate multiple sets format data."""
        # Reject empty sequences anywhere
        cls._reject_empty_sequences(data)

        # Validate each alternative set
        validated_alternatives = []
        for alternative in data:
            validated_alternative = cls._validate_tool_call_sequences(alternative)
            validated_alternatives.append(validated_alternative)

        # Apply constraints
        cls._validate_empty_set_constraints(validated_alternatives)
        return validated_alternatives

    @classmethod
    def _is_single_set_format(cls, v: list) -> bool:
        """Detect if input is single set format (backward compatibility)."""
        if not v:
            return True  # Empty list is single set format

        # Check first element: if it's a dict, it's single set format
        # If it's a list, it could be multiple sets or single set with sequences
        first_element = v[0]
        if isinstance(first_element, dict):
            return True  # Single set: [tool1, tool2, ...]

        if isinstance(first_element, list):
            if not first_element:
                # Empty list [] - could be single set (empty sequence) or multiple sets (empty alt)
                # Check if there are multiple empty lists (indicates multiple sets format)
                return not (
                    len(v) > 1
                    and all(isinstance(el, list) and len(el) == 0 for el in v)
                )
            # Non-empty list - check what's inside
            return isinstance(
                first_element[0], dict
            )  # dict = single set, list = multiple sets

        return False

    @classmethod
    def _reject_empty_sequences(cls, data: list[list[list[dict[str, Any]]]]) -> None:
        """Reject empty sequences in data."""
        for i, alternative in enumerate(data):
            for j, sequence in enumerate(alternative):
                if isinstance(sequence, list) and len(sequence) == 0:
                    raise ValueError(
                        f"Empty sequence at position {j} in alternative {i} is invalid. "
                        "Use [] for no tools instead."
                    )

    @classmethod
    def _validate_empty_set_constraints(
        cls, result: list[list[list[dict[str, Any]]]]
    ) -> None:
        """Validate that empty alternatives come after primary options (not first or only)."""
        if not result:
            return

        if len(result) == 1 and len(result[0]) == 0:
            raise ValueError(
                "Empty set cannot be the only alternative. "
                "Empty alternatives should represent fallback scenarios, not primary options."
            )

        if len(result) > 1 and len(result[0]) == 0:
            raise ValueError(
                "Empty set cannot be the first alternative. "
                "Empty alternatives should come after primary options."
            )

        # Prevent multiple redundant empty alternatives
        empty_count = sum(1 for alt in result if len(alt) == 0)
        if empty_count > 1:
            raise ValueError(
                f"Found {empty_count} empty alternatives. "
                "Multiple empty alternatives are redundant - use only one as fallback."
            )

    @classmethod
    def _is_sequence_of_sequences(cls, seq: list) -> bool:
        """Check if a sequence contains sequences (not dicts)."""
        return bool(seq) and isinstance(seq[0], list)

    @classmethod
    def _validate_tool_call_sequences(cls, v: Any) -> list[list[dict[str, Any]]]:
        """Validate tool call sequences structure."""
        if not isinstance(v, list):
            raise ValueError("Expected tool calls must be a list of sequences")

        result = []
        validated_sequences = []
        for i, sequence in enumerate(v):
            if not isinstance(sequence, list):
                raise ValueError(f"Sequence {i} must be a list")

            # Empty sequences are already rejected by _reject_empty_sequences
            tool_calls = []
            for j, tool_call in enumerate(sequence):
                if not isinstance(tool_call, dict):
@@ -131,8 +257,8 @@ def validate_expected_tool_calls(
                }
                tool_calls.append(validated_tool_call)

            result.append(tool_calls)
        return result
            validated_sequences.append(tool_calls)
        return validated_sequences


class EvaluationData(BaseModel):
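A quick sketch of the backward-compatibility conversion, poking the private helper directly for illustration; real data goes through pydantic validation, and the values here are hypothetical.

```python
from lightspeed_evaluation.core.models.data import TurnData

# Old single-set format: sequences -> tool calls, no alternatives level
single_set = [[{"tool_name": "oc_get", "arguments": {"kind": "pod"}}]]

# The validator wraps it once, so downstream code always sees
# alternatives -> sequences -> tool calls
assert TurnData._ensure_multiple_sets_format(single_set) == [single_set]

# Already multiple-sets input is passed through unchanged
multi = [[[{"tool_name": "oc_get", "arguments": {"kind": "pod"}}]], []]
assert TurnData._ensure_multiple_sets_format(multi) == multi
```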
1 change: 1 addition & 0 deletions tests/unit/core/metrics/__init__.py
@@ -0,0 +1 @@
"""Tests for metrics module."""
1 change: 1 addition & 0 deletions tests/unit/core/metrics/custom/__init__.py
@@ -0,0 +1 @@
"""Tests for custom metrics module."""