36 changes: 29 additions & 7 deletions README.md
@@ -291,7 +291,7 @@ embedding:
| `attachments` | list[string] | ❌ | Attachments | ❌ |
| `expected_response` | string | 📋 | Expected response for comparison | ❌ |
| `expected_intent` | string | 📋 | Expected intent for intent evaluation| ❌ |
| `expected_tool_calls` | list[list[dict]] | 📋 | Expected tool call sequences | ❌ |
| `expected_tool_calls` | list[list[list[dict]]] | 📋 | Expected tool call sequences (multiple alternative sets) | ❌ |
| `tool_calls` | list[list[dict]] | ❌ | Actual tool calls from API | ✅ (if API enabled) |
| `verify_script` | string | 📋 | Path to verification script | ❌ |
| `turn_metrics` | list[string] | ❌ | Turn-specific metrics to evaluate | ❌ |
@@ -302,7 +302,7 @@ embedding:
Examples
> - `expected_response`: Required for `custom:answer_correctness`
> - `expected_intent`: Required for `custom:intent_eval`
> - `expected_tool_calls`: Required for `custom:tool_eval`
> - `expected_tool_calls`: Required for `custom:tool_eval` (multiple alternative sets format)
> - `verify_script`: Required for `script:action_eval` (used when API is enabled)
> - `response`: Required for most metrics (auto-populated if API enabled)

@@ -314,15 +314,37 @@ Examples
| `[]` (empty list) | Skip evaluation for this turn |
| `["metric1", ...]` | Use specified metrics only |

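For example, a hedged YAML fragment: the surrounding turn fields are omitted, and the placement of `turn_metrics` inside a turn entry is assumed from the table above rather than copied from the repo.

```yaml
# Hypothetical turn entries; only turn_metrics is shown
- turn_metrics: []                    # skip evaluation for this turn
- turn_metrics: ["custom:tool_eval"]  # run only the listed metric
```
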
#### Tool Evaluation

The `custom:tool_eval` metric supports flexible matching against multiple alternative patterns:

- **Format**: `[[[tool_calls, ...]], [[tool_calls]], ...]` - a list of alternatives, each holding sequences of tool calls
- **Matching**: Alternatives are tried in order until the first one matches
- **Use Cases**: Optional tools, multiple approaches, default arguments, skip scenarios
- **Empty Sets**: `[]` represents "no tools" and must come after the primary alternatives

#### Tool Call Structure

```yaml
# Multiple alternative sets format: [[[tool_calls, ...]], [[tool_calls]], ...]
expected_tool_calls:
  -
    - tool_name: oc_get # Tool name
      arguments: # Tool arguments
        kind: pod
        name: openshift-light* # Regex patterns supported for flexible matching
  - # Alternative 1: Primary approach
    - # Sequence 1
      - tool_name: oc_get
        arguments:
          kind: pod
          name: openshift-light* # Regex patterns supported
    - # Sequence 2 (if multiple parallel tool calls needed)
      - tool_name: oc_describe
        arguments:
          kind: pod
  - # Alternative 2: Different approach
    - # Sequence 1
      - tool_name: kubectl_get
        arguments:
          resource: pods
  - # Alternative 3: Skip scenario (optional)
    [] # When model has information from previous conversation
```
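To make the nesting concrete, the structure can be inspected after YAML parsing. A minimal sketch, assuming PyYAML is available; the shortened snippet is illustrative, not copied from the repo:

```python
import yaml  # PyYAML, assumed available

snippet = """
expected_tool_calls:
  - # Alternative 1
    - - tool_name: oc_get
        arguments:
          kind: pod
          name: openshift-light*
  - # Alternative 2: skip scenario
    []
"""

data = yaml.safe_load(snippet)["expected_tool_calls"]
assert isinstance(data[0][0][0], dict)         # alternative -> sequence -> tool call
assert data[0][0][0]["tool_name"] == "oc_get"
assert data[1] == []                           # empty alternative: the "no tools" fallback
```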

#### Script-Based Evaluations
54 changes: 44 additions & 10 deletions src/lightspeed_evaluation/core/metrics/custom/tool_eval.py
@@ -8,26 +8,26 @@


def evaluate_tool_calls(
    expected: list[list[dict[str, Any]]], actual: list[list[dict[str, Any]]]
    expected: list[list[list[dict[str, Any]]]],
    actual: list[list[dict[str, Any]]],
) -> tuple[bool, str]:
    """Evaluate tool calls using the custom:tool_eval metric.

    Args:
        expected: Expected tool calls structure (list[list[dict[str, Any]]])
        actual: Actual tool calls from API response (list[list[dict[str, Any]]])
        expected: Expected tool calls structure (with alternatives)
        actual: Actual tool calls from API response

    Returns:
        tuple: (success: bool, details: str)
    """
    try:
        success = compare_tool_calls(expected, actual)

        if success:
            details = "Tool calls match expected structure and arguments"
        else:
            details = "Tool calls do not match expected structure or arguments"
        # Try each set until one matches
        for i, expected_set in enumerate(expected):
            if compare_tool_calls(expected_set, actual):
                return _create_success_message(i, expected_set)

        return success, details
        # If all sets fail, return failure status & message
        return _create_failure_message(expected, actual)

    except (AttributeError, TypeError, ValueError) as e:
        logger.error("Error during tool evaluation: %s", e)
@@ -143,6 +143,40 @@ def _compare_tool_arguments(expected: dict[str, Any], actual: dict[str, Any]) ->
    return True


def _create_success_message(
    index: int, expected_set: list[list[dict[str, Any]]]
) -> tuple[bool, str]:
    """Create success message based on match type."""
    pattern_type = "Primary pattern" if index == 0 else f"Alternative {index + 1}"

    # Determine message based on what matched
    if len(expected_set) == 0:
        # Empty alternative matched - index 0 can never be empty due to constraints
        message = "No tool calls made (valid alternate skip scenario)"
    else:
        message = "Tool calls match expected structure and arguments"

    return True, f"{pattern_type} matched: {message}"


def _create_failure_message(
    expected: list[list[list[dict[str, Any]]]], actual: list[list[dict[str, Any]]]
) -> tuple[bool, str]:
    """Create failure message with helpful context."""
    # If we reach here, none of the alternatives matched

    if len(actual) == 0:
        return (
            False,
            "No actual tool calls made and this is not set as an expected alternative",
        )

    return (
        False,
        f"Tool calls made but didn't match any of the {len(expected)} expected pattern(s)",
    )


def format_tool_calls_for_logging(tool_calls: list[list[dict[str, Any]]]) -> str:
    """Format tool calls for logging purposes."""
    if not tool_calls:
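Taken together, a hedged usage sketch of the new evaluation flow. The data is made up; the module path follows the file path above, and the skip case assumes `compare_tool_calls` treats an empty expected set as matching an empty actual list, as `_create_success_message` implies.

```python
from lightspeed_evaluation.core.metrics.custom.tool_eval import evaluate_tool_calls

expected = [
    [[{"tool_name": "oc_get", "arguments": {"kind": "pod"}}]],            # Alternative 1
    [[{"tool_name": "kubectl_get", "arguments": {"resource": "pods"}}]],  # Alternative 2
    [],                                                                   # Alternative 3: skip scenario
]

# One sequence containing one actual tool call, as the API would report it
actual = [[{"tool_name": "kubectl_get", "arguments": {"resource": "pods"}}]]

success, details = evaluate_tool_calls(expected, actual)
# success is True; details reads "Alternative 2 matched: ..."

success, details = evaluate_tool_calls(expected, [])
# The empty third alternative covers the no-tool-calls case
```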
140 changes: 133 additions & 7 deletions src/lightspeed_evaluation/core/models/data.py
@@ -56,8 +56,8 @@ class TurnData(BaseModel):
    expected_response: Optional[str] = Field(
        default=None, min_length=1, description="Expected response for comparison"
    )
    expected_tool_calls: Optional[list[list[dict[str, Any]]]] = Field(
        default=None, description="Expected tool call sequences"
    expected_tool_calls: Optional[list[list[list[dict[str, Any]]]]] = Field(
        default=None, description="Expected tool call sequences (with alternatives)"
    )
    expected_intent: Optional[str] = Field(
        default=None, min_length=1, description="Expected intent for intent evaluation"
@@ -93,19 +93,145 @@ def validate_turn_metrics(cls, v: Optional[list[str]]) -> Optional[list[str]]:
    @classmethod
    def validate_expected_tool_calls(
        cls, v: Optional[Any]
    ) -> Optional[list[list[dict[str, Any]]]]:
        """Validate expected tool calls when provided."""
    ) -> Optional[list[list[list[dict[str, Any]]]]]:
        """Validate expected tool calls when provided.

        Converts single set format to multiple sets format automatically for backward compatibility.

        Input formats:
        1. Single set: [[{tool_name, arguments}, ...], ...] -> Converted to multiple sets
        2. Multiple sets: [[[{tool_name, arguments}, ...], ...], [[...], ...], ...] -> Used as-is

        Output format:
        Multiple sets: [[[{tool_name, arguments}, ...], ...], [[...], ...], ...]
        """
        if v is None:
            return None

        if not isinstance(v, list):
            raise ValueError("Expected tool calls must be a list")

        # Ensure multiple sets format (backward compatibility)
        data = cls._ensure_multiple_sets_format(v)

        # Validate multiple sets format
        return cls._validate_multiple_sets(data)

    # Future backward compatibility removal (minimal changes):
    # 1. Delete: _ensure_multiple_sets_format() and _is_single_set_format()
    # 2. Replace above with: return cls._validate_multiple_sets(v)

    @classmethod
    def _ensure_multiple_sets_format(cls, v: list) -> list[list[list[dict[str, Any]]]]:
        """Ensure data is in multiple sets format (backward compatibility)."""
        # Convert single set format to multiple sets format if needed
        if cls._is_single_set_format(v):
            # Single set: [[tool1, tool2], [tool3]] -> Multiple sets: [[[tool1, tool2], [tool3]]]
            return [v]
        # Already multiple sets: [[[tool1]], [[tool2]]] -> Keep as-is
        return v

    @classmethod
    def _validate_multiple_sets(
        cls, data: list[list[list[dict[str, Any]]]]
    ) -> list[list[list[dict[str, Any]]]]:
        """Validate multiple sets format data."""
        # Reject empty sequences anywhere
        cls._reject_empty_sequences(data)

        # Validate each alternative set
        validated_alternatives = []
        for alternative in data:
            validated_alternative = cls._validate_tool_call_sequences(alternative)
            validated_alternatives.append(validated_alternative)

        # Apply constraints
        cls._validate_empty_set_constraints(validated_alternatives)
        return validated_alternatives

    @classmethod
    def _is_single_set_format(cls, v: list) -> bool:
        """Detect if input is single set format (backward compatibility)."""
        if not v:
            return True  # Empty list is single set format

        # Check first element: if it's a dict, it's single set format
        # If it's a list, it could be multiple sets or single set with sequences
        first_element = v[0]
        if isinstance(first_element, dict):
            return True  # Single set: [tool1, tool2, ...]

        if isinstance(first_element, list):
            if not first_element:
                # Empty list [] - could be single set (empty sequence) or multiple sets (empty alt)
                # Check if there are multiple empty lists (indicates multiple sets format)
                return not (
                    len(v) > 1
                    and all(isinstance(el, list) and len(el) == 0 for el in v)
                )
            # Non-empty list - check what's inside
            return isinstance(
                first_element[0], dict
            )  # dict = single set, list = multiple sets

        return False

    @classmethod
    def _reject_empty_sequences(cls, data: list[list[list[dict[str, Any]]]]) -> None:
        """Reject empty sequences in data."""
        for i, alternative in enumerate(data):
            for j, sequence in enumerate(alternative):
                if isinstance(sequence, list) and len(sequence) == 0:
                    raise ValueError(
                        f"Empty sequence at position {j} in alternative {i} is invalid. "
                        "Use [] for no tools instead."
                    )

    @classmethod
    def _validate_empty_set_constraints(
        cls, result: list[list[list[dict[str, Any]]]]
    ) -> None:
        """Validate that empty alternatives come after primary options (not first or only)."""
        if not result:
            return

        if len(result) == 1 and len(result[0]) == 0:
            raise ValueError(
                "Empty set cannot be the only alternative. "
                "Empty alternatives should represent fallback scenarios, not primary options."
            )

        if len(result) > 1 and len(result[0]) == 0:
            raise ValueError(
                "Empty set cannot be the first alternative. "
                "Empty alternatives should come after primary options."
            )

        # Prevent multiple redundant empty alternatives
        empty_count = sum(1 for alt in result if len(alt) == 0)
        if empty_count > 1:
            raise ValueError(
                f"Found {empty_count} empty alternatives. "
                "Multiple empty alternatives are redundant - use only one as fallback."
            )

    @classmethod
    def _is_sequence_of_sequences(cls, seq: list) -> bool:
        """Check if a sequence contains sequences (not dicts)."""
        return bool(seq) and isinstance(seq[0], list)

    @classmethod
    def _validate_tool_call_sequences(cls, v: Any) -> list[list[dict[str, Any]]]:
        """Validate tool call sequences structure."""
        if not isinstance(v, list):
            raise ValueError("Expected tool calls must be a list of sequences")

        result = []
        validated_sequences = []
        for i, sequence in enumerate(v):
            if not isinstance(sequence, list):
                raise ValueError(f"Sequence {i} must be a list")

            # Empty sequences are already rejected by _reject_empty_sequences
            tool_calls = []
            for j, tool_call in enumerate(sequence):
                if not isinstance(tool_call, dict):
@@ -131,8 +257,8 @@ def validate_expected_tool_calls(
                }
                tool_calls.append(validated_tool_call)

            result.append(tool_calls)
        return result
            validated_sequences.append(tool_calls)
        return validated_sequences


class EvaluationData(BaseModel):
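A quick sketch of the backward-compatibility conversion, poking the private helper directly for illustration; real data goes through pydantic validation, and the values here are hypothetical.

```python
from lightspeed_evaluation.core.models.data import TurnData

# Old single-set format: sequences -> tool calls, no alternatives level
single_set = [[{"tool_name": "oc_get", "arguments": {"kind": "pod"}}]]

# The validator wraps it once, so downstream code always sees
# alternatives -> sequences -> tool calls
assert TurnData._ensure_multiple_sets_format(single_set) == [single_set]

# Already multiple-sets input is passed through unchanged
multi = [[[{"tool_name": "oc_get", "arguments": {"kind": "pod"}}]], []]
assert TurnData._ensure_multiple_sets_format(multi) == multi
```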
1 change: 1 addition & 0 deletions tests/unit/core/metrics/__init__.py
@@ -0,0 +1 @@
"""Tests for metrics module."""
1 change: 1 addition & 0 deletions tests/unit/core/metrics/custom/__init__.py
@@ -0,0 +1 @@
"""Tests for custom metrics module."""