48 changes: 48 additions & 0 deletions config/mmlu_example.yaml
@@ -0,0 +1,48 @@
- conversation_group_id: rh199_9.0_ch01
  description: Vim editor and file management
  turns:
    - turn_id: 849c1899-e729-4387-b5e4-7b87f4bfadc9
      query: 'What is the difference between the `vim-minimal` and `vim-enhanced` packages in Red Hat Enterprise Linux, and what features do they provide for editing text-based files?


        A) `vim-enhanced` provides a more comprehensive set of features, an online help system, and a tutorial program, while `vim-minimal` offers a lightweight installation with core features and the basic `vi` command.

        B) `vim-enhanced` and `vim-minimal` provide the same set of features with slight variations in the online help system and tutorial program.

        C) `vim-minimal` provides a more comprehensive set of features, an online help system, and a tutorial program, while `vim-enhanced` offers a lightweight installation with core features and the basic `vi` command.

        D) `vim-enhanced` provides a lightweight installation with core features and the basic `vi` command, while `vim-minimal` offers a more comprehensive set of features with additional plugins.'
      expected_response: A
      response: null
      turn_metrics:
        - custom:multiple_choice_exact
    - turn_id: 4320ff94-bf75-4dfc-b7d9-38f704ccf47d
      query: 'How can you open a file for editing using the `vi` and `vim` commands in Red Hat Enterprise Linux?


        A) vi --help or vim --help

        B) vi -r filename or vim -r filename

        C) vi filename or vim filename

        D) vi -w filename or vim -w filename'
      expected_response: C
      response: null
      turn_metrics:
        - custom:multiple_choice_exact
    - turn_id: 68ddb5b7-c16c-4de2-afb6-a736c548fd52
      query: 'What are the different modes of operation in the Vim editor, and how can you move between them?


        A) Vim has command mode, insert mode, visual mode, and extended command mode. You can move between them using ''i'' to enter insert mode from command mode, ''Esc'' to return to command mode from insert mode or visual mode, and '':'' to enter extended command mode from command mode.

        B) Vim has command mode, insert mode, select mode, and extended command mode. You can move between them using ''i'' to enter select mode from command mode, ''Esc'' to return to command mode from insert mode or visual mode, and '':'' to enter extended command mode from command mode.

        C) Vim has edit mode, insert mode, visual mode, and extended command mode. You can move between them using ''e'' to enter edit mode from command mode, ''Esc'' to return to command mode from insert mode or visual mode, and '':'' to enter extended command mode from command mode.

        D) Vim has command mode, insert mode, visual mode, and search mode. You can move between them using ''i'' to enter insert mode from command mode, ''Esc'' to return to command mode from insert mode or visual mode, and ''/'' to enter search mode from command mode.'
      expected_response: A
      response: null
      turn_metrics:
        - custom:multiple_choice_exact
8 changes: 8 additions & 0 deletions config/system.yaml
@@ -83,6 +83,14 @@ metrics_metadata:
  "custom:tool_eval":
    description: "Tool call evaluation comparing expected vs actual tool calls"

  "custom:multiple_choice_exact":
    threshold: 1.0
    description: "MMLU-style multiple choice exact match with flexible letter extraction"

  "custom:multiple_choice_strict":
    threshold: 1.0
    description: "MMLU-style multiple choice strict match (single letter only)"

  # Script-based metrics
  "script:action_eval":
    description: "Script-based evaluation for infrastructure/environment validation"
51 changes: 51 additions & 0 deletions src/lightspeed_evaluation/core/metrics/custom/custom.py
@@ -5,6 +5,7 @@

from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
from lightspeed_evaluation.core.llm.manager import LLMManager
from lightspeed_evaluation.core.metrics.custom.mmlu_style_eval import MMLUMetrics
from lightspeed_evaluation.core.metrics.custom.prompts import (
    ANSWER_CORRECTNESS_PROMPT,
    INTENT_EVALUATION_PROMPT,
@@ -26,11 +27,14 @@ def __init__(self, llm_manager: LLMManager):
        self.llm = BaseCustomLLM(
            llm_manager.get_model_name(), llm_manager.get_llm_params()
        )
        self.mmlu_metrics = MMLUMetrics()

        self.supported_metrics = {
            "answer_correctness": self._evaluate_answer_correctness,
            "intent_eval": self._evaluate_intent,
            "tool_eval": self._evaluate_tool_calls,
            "multiple_choice_exact": self._evaluate_mmlu_exact,
            "multiple_choice_strict": self._evaluate_mmlu_strict,
        }

        print(f"✅ Custom Metrics initialized: {self.llm.model_name}")
@@ -241,3 +245,50 @@ def _evaluate_intent(
            return score, reason
        except LLMError as e:
            return None, f"Intent evaluation failed: {str(e)}"

    def _evaluate_mmlu_exact(
        self,
        conv_data: Any,
        turn_idx: Optional[int],
        turn_data: Optional[TurnData],
        is_conversation: bool,
    ) -> tuple[Optional[float], str]:
        """Evaluate using MMLU exact match metric.

        Args:
            conv_data: Conversation data.
            turn_idx: Turn index.
            turn_data: Turn data containing response and expected response.
            is_conversation: Whether this is conversation-level evaluation.

        Returns:
            Tuple of (score, reason).
        """
        scope = EvaluationScope(
            turn_idx=turn_idx, turn_data=turn_data, is_conversation=is_conversation
        )
        return self.mmlu_metrics.evaluate("multiple_choice_exact", conv_data, scope)

    def _evaluate_mmlu_strict(
        self,
        conv_data: Any,
        turn_idx: Optional[int],
        turn_data: Optional[TurnData],
        is_conversation: bool,
    ) -> tuple[Optional[float], str]:
        """Evaluate using MMLU strict match metric.

        Args:
            conv_data: Conversation data.
            turn_idx: Turn index.
            turn_data: Turn data containing response and expected response.
            is_conversation: Whether this is conversation-level evaluation.

        Returns:
            Tuple of (score, reason).
        """
        scope = EvaluationScope(
            turn_idx=turn_idx, turn_data=turn_data, is_conversation=is_conversation
        )
        return self.mmlu_metrics.evaluate("multiple_choice_strict", conv_data, scope)

215 changes: 215 additions & 0 deletions src/lightspeed_evaluation/core/metrics/custom/mmlu_style_eval.py
@@ -0,0 +1,215 @@
"""MMLU-style multiple choice evaluation metrics."""

import re
from typing import Any, Optional

from lightspeed_evaluation.core.models import EvaluationScope, TurnData


class MultipleChoiceExactMatch: # pylint: disable=too-few-public-methods
"""Exact match metric for multiple choice questions (MMLU-style scoring).

Returns 1.0 for correct answer, 0.0 for incorrect.
"""

def __init__(self, threshold: float = 1.0) -> None:
"""Initialize metric.

Args:
threshold: Score threshold for passing (default: 1.0, meaning must be exact).
"""
self.threshold = threshold

def evaluate( # pylint: disable=unused-argument
self, response: str, expected_response: str, **kwargs: Any
) -> dict[str, Any]:
"""Evaluate if the AI response matches the expected answer.

Args:
response: The AI's generated response.
expected_response: The correct answer (e.g., "A", "B", "C", or "D").
**kwargs: Additional arguments (ignored).

Returns:
Dict with 'score' (1.0 or 0.0) and 'reason' (explanation).
"""
# Clean inputs
response_clean = response.strip().upper()
expected_clean = expected_response.strip().upper()

# Extract letter from response using regex
# Handles cases like:
# - "B"
# - "The answer is B"
# - "B) Code can survive..."
# - "I think B is correct"
letter_match = re.search(r"\b([ABCD])\b", response_clean)

if letter_match:
response_letter = letter_match.group(1)
else:
# No clear letter found, try first character
response_letter = response_clean[0] if response_clean else ""

# Compare
is_correct = response_letter == expected_clean
score = 1.0 if is_correct else 0.0

# Build explanation
reason = (
f"Expected: {expected_clean} | "
f"Extracted: {response_letter} | "
f"Result: {'✓ CORRECT' if is_correct else '✗ INCORRECT'} | "
f"Full response: '{response[:100]}...'"
if len(response) > 100
else f"Full response: '{response}'"
Collaborator: The conditional applies to the whole concatenated f-string, so when the response length is 100 or less the reason won't have the expected response or the extracted letter.
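A possible fix (a sketch only, reusing the names already defined in this method) that keeps every field and truncates just the response preview:

        # Build explanation: truncate only the response preview, keep all fields
        response_preview = f"{response[:100]}..." if len(response) > 100 else response
        reason = (
            f"Expected: {expected_clean} | "
            f"Extracted: {response_letter} | "
            f"Result: {'✓ CORRECT' if is_correct else '✗ INCORRECT'} | "
            f"Full response: '{response_preview}'"
        )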

        )

        return {"score": score, "reason": reason}


class MultipleChoiceStrictMatch:  # pylint: disable=too-few-public-methods
Collaborator: Optional: perhaps we can simply convert this to a plain Python function.
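Roughly what that suggestion could look like (a sketch, not part of this diff; the function name is made up here and it reuses the module's existing `Any` import):

def multiple_choice_strict_match(response: str, expected_response: str) -> dict[str, Any]:
    """Score 1.0 only when the response is exactly the expected letter."""
    # Normalize both sides before comparing
    response_clean = response.strip().upper()
    expected_clean = expected_response.strip().upper()
    is_correct = response_clean == expected_clean and len(response_clean) == 1
    return {
        "score": 1.0 if is_correct else 0.0,
        "reason": f"Expected exactly '{expected_clean}', got '{response_clean}'",
    }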

"""Stricter version requiring response to be exactly A, B, C, or D."""

def __init__(self, threshold: float = 1.0) -> None:
"""Initialize metric.

Args:
threshold: Score threshold for passing (default: 1.0).
"""
self.threshold = threshold
Collaborator: We can remove threshold, as this is a binary metric.


    def evaluate(  # pylint: disable=unused-argument
        self, response: str, expected_response: str, **kwargs: Any
    ) -> dict[str, Any]:
        """Evaluate if response exactly matches expected answer.

        Args:
            response: The AI's generated response.
            expected_response: The correct answer (single letter).
            **kwargs: Additional arguments (ignored).

        Returns:
            Dict with 'score' (1.0 or 0.0) and 'reason' (explanation).
        """
        response_clean = response.strip().upper()
        expected_clean = expected_response.strip().upper()

        # Must be exactly one letter
        is_correct = response_clean == expected_clean and len(response_clean) == 1
        score = 1.0 if is_correct else 0.0

        return {
            "score": score,
            "reason": f"Expected exactly '{expected_clean}', got '{response_clean}'",
        }


class MMLUMetrics:  # pylint: disable=too-few-public-methods
    """Custom MMLU-style metrics integrated with the evaluation framework."""

    def __init__(self) -> None:
        """Initialize MMLU metrics."""
        self.exact_match = MultipleChoiceExactMatch()
        self.strict_match = MultipleChoiceStrictMatch()

        self.supported_metrics = {
            "multiple_choice_exact": self._evaluate_exact_match,
            "multiple_choice_strict": self._evaluate_strict_match,
        }

    def evaluate(
        self,
        metric_name: str,
        conv_data: Any,
        scope: EvaluationScope,
    ) -> tuple[Optional[float], str]:
        """Evaluate an MMLU-style metric.

        Args:
            metric_name: Name of the metric to evaluate.
            conv_data: Conversation data (unused for MMLU metrics).
            scope: Evaluation scope containing turn data.

        Returns:
            Tuple of (score, reason) where score is between 0.0 and 1.0.
        """
        if metric_name not in self.supported_metrics:
            return None, f"Unsupported MMLU metric: {metric_name}"

        try:
            return self.supported_metrics[metric_name](
                conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation
            )
        except (ValueError, AttributeError, KeyError) as e:
            return None, f"MMLU {metric_name} evaluation failed: {str(e)}"

    def _evaluate_exact_match(
        self,
        _conv_data: Any,
        _turn_idx: Optional[int],
        turn_data: Optional[TurnData],
        is_conversation: bool,
    ) -> tuple[Optional[float], str]:
        """Evaluate using exact match with flexible letter extraction.

        Args:
            _conv_data: Conversation data (unused).
            _turn_idx: Turn index (unused).
            turn_data: Turn data containing response and expected response.
            is_conversation: Whether this is conversation-level evaluation.

        Returns:
            Tuple of (score, reason).
        """
        if is_conversation:
            return None, "MMLU exact match is a turn-level metric"

        if turn_data is None:
            return None, "TurnData is required for MMLU evaluation"

        if not turn_data.response:
            return None, "Response is required for MMLU evaluation"

        if not turn_data.expected_response:
            return None, "Expected response is required for MMLU evaluation"

        result = self.exact_match.evaluate(
            turn_data.response, turn_data.expected_response
        )
        return result["score"], result["reason"]

    def _evaluate_strict_match(
        self,
        _conv_data: Any,
        _turn_idx: Optional[int],
        turn_data: Optional[TurnData],
        is_conversation: bool,
    ) -> tuple[Optional[float], str]:
        """Evaluate using strict exact match (single letter only).

        Args:
            _conv_data: Conversation data (unused).
            _turn_idx: Turn index (unused).
            turn_data: Turn data containing response and expected response.
            is_conversation: Whether this is conversation-level evaluation.

        Returns:
            Tuple of (score, reason).
        """
        if is_conversation:
            return None, "MMLU strict match is a turn-level metric"

        if turn_data is None:
            return None, "TurnData is required for MMLU evaluation"

        if not turn_data.response:
            return None, "Response is required for MMLU evaluation"

        if not turn_data.expected_response:
            return None, "Expected response is required for MMLU evaluation"

        result = self.strict_match.evaluate(
            turn_data.response, turn_data.expected_response
        )
        return result["score"], result["reason"]