diff --git a/config/mmlu_example.yaml b/config/mmlu_example.yaml
new file mode 100644
index 0000000..751f366
--- /dev/null
+++ b/config/mmlu_example.yaml
@@ -0,0 +1,48 @@
+- conversation_group_id: rh199_9.0_ch01
+  description: Vim editor and file management
+  turns:
+  - turn_id: 849c1899-e729-4387-b5e4-7b87f4bfadc9
+    query: 'What is the difference between the `vim-minimal` and `vim-enhanced` packages in Red Hat Enterprise Linux, and what features do they provide for editing text-based files?
+
+
+      A) `vim-enhanced` provides a more comprehensive set of features, an online help system, and a tutorial program, while `vim-minimal` offers a lightweight installation with core features and the basic `vi` command.
+
+      B) `vim-enhanced` and `vim-minimal` provide the same set of features with slight variations in the online help system and tutorial program.
+
+      C) `vim-minimal` provides a more comprehensive set of features, an online help system, and a tutorial program, while `vim-enhanced` offers a lightweight installation with core features and the basic `vi` command.
+
+      D) `vim-enhanced` provides a lightweight installation with core features and the basic `vi` command, while `vim-minimal` offers a more comprehensive set of features with additional plugins.'
+    expected_response: A
+    response: null
+    turn_metrics:
+    - custom:multiple_choice_exact
+  - turn_id: 4320ff94-bf75-4dfc-b7d9-38f704ccf47d
+    query: 'How can you open a file for editing using the `vi` and `vim` commands in Red Hat Enterprise Linux?
+
+
+      A) vi --help or vim --help
+
+      B) vi -r filename or vim -r filename
+
+      C) vi filename or vim filename
+
+      D) vi -w filename or vim -w filename'
+    expected_response: C
+    response: null
+    turn_metrics:
+    - custom:multiple_choice_exact
+  - turn_id: 68ddb5b7-c16c-4de2-afb6-a736c548fd52
+    query: 'What are the different modes of operation in the Vim editor, and how can you move between them?
+
+
+      A) Vim has command mode, insert mode, visual mode, and extended command mode. You can move between them using ''i'' to enter insert mode from command mode, ''Esc'' to return to command mode from insert mode or visual mode, and '':'' to enter extended command mode from command mode.
+
+      B) Vim has command mode, insert mode, select mode, and extended command mode. You can move between them using ''i'' to enter select mode from command mode, ''Esc'' to return to command mode from insert mode or visual mode, and '':'' to enter extended command mode from command mode.
+
+      C) Vim has edit mode, insert mode, visual mode, and extended command mode. You can move between them using ''e'' to enter edit mode from command mode, ''Esc'' to return to command mode from insert mode or visual mode, and '':'' to enter extended command mode from command mode.
+
+      D) Vim has command mode, insert mode, visual mode, and search mode. You can move between them using ''i'' to enter insert mode from command mode, ''Esc'' to return to command mode from insert mode or visual mode, and ''/'' to enter search mode from command mode.'
+    expected_response: A
+    response: null
+    turn_metrics:
+    - custom:multiple_choice_exact
diff --git a/config/system.yaml b/config/system.yaml
index bc45a07..1c7045f 100644
--- a/config/system.yaml
+++ b/config/system.yaml
@@ -83,6 +83,14 @@ metrics_metadata:
   "custom:tool_eval":
     description: "Tool call evaluation comparing expected vs actual tool calls"
 
+  "custom:multiple_choice_exact":
+    threshold: 1.0
+    description: "MMLU-style multiple choice exact match with flexible letter extraction"
+
+  "custom:multiple_choice_strict":
+    threshold: 1.0
+    description: "MMLU-style multiple choice strict match (single letter only)"
+
   # Script-based metrics
   "script:action_eval":
     description: "Script-based evaluation for infrastructure/environment validation"
diff --git a/src/lightspeed_evaluation/core/metrics/custom/custom.py b/src/lightspeed_evaluation/core/metrics/custom/custom.py
index 292ec33..4073bb2 100644
--- a/src/lightspeed_evaluation/core/metrics/custom/custom.py
+++ b/src/lightspeed_evaluation/core/metrics/custom/custom.py
@@ -5,6 +5,7 @@
 
 from lightspeed_evaluation.core.llm.custom import BaseCustomLLM
 from lightspeed_evaluation.core.llm.manager import LLMManager
+from lightspeed_evaluation.core.metrics.custom.mmlu_style_eval import MMLUMetrics
 from lightspeed_evaluation.core.metrics.custom.prompts import (
     ANSWER_CORRECTNESS_PROMPT,
     INTENT_EVALUATION_PROMPT,
@@ -26,11 +27,14 @@ def __init__(self, llm_manager: LLMManager):
         self.llm = BaseCustomLLM(
             llm_manager.get_model_name(), llm_manager.get_llm_params()
         )
+        self.mmlu_metrics = MMLUMetrics()
 
         self.supported_metrics = {
             "answer_correctness": self._evaluate_answer_correctness,
             "intent_eval": self._evaluate_intent,
             "tool_eval": self._evaluate_tool_calls,
+            "multiple_choice_exact": self._evaluate_mmlu_exact,
+            "multiple_choice_strict": self._evaluate_mmlu_strict,
         }
 
         print(f"✅ Custom Metrics initialized: {self.llm.model_name}")
@@ -241,3 +245,50 @@
             return score, reason
         except LLMError as e:
             return None, f"Intent evaluation failed: {str(e)}"
+
+    def _evaluate_mmlu_exact(
+        self,
+        conv_data: Any,
+        turn_idx: Optional[int],
+        turn_data: Optional[TurnData],
+        is_conversation: bool,
+    ) -> tuple[Optional[float], str]:
+        """Evaluate using MMLU exact match metric.
+
+        Args:
+            conv_data: Conversation data.
+            turn_idx: Turn index.
+            turn_data: Turn data containing response and expected response.
+            is_conversation: Whether this is conversation-level evaluation.
+
+        Returns:
+            Tuple of (score, reason).
+        """
+        scope = EvaluationScope(
+            turn_idx=turn_idx, turn_data=turn_data, is_conversation=is_conversation
+        )
+        return self.mmlu_metrics.evaluate("multiple_choice_exact", conv_data, scope)
+
+    def _evaluate_mmlu_strict(
+        self,
+        conv_data: Any,
+        turn_idx: Optional[int],
+        turn_data: Optional[TurnData],
+        is_conversation: bool,
+    ) -> tuple[Optional[float], str]:
+        """Evaluate using MMLU strict match metric.
+
+        Args:
+            conv_data: Conversation data.
+            turn_idx: Turn index.
+            turn_data: Turn data containing response and expected response.
+            is_conversation: Whether this is conversation-level evaluation.
+
+        Returns:
+            Tuple of (score, reason).
+ """ + scope = EvaluationScope( + turn_idx=turn_idx, turn_data=turn_data, is_conversation=is_conversation + ) + return self.mmlu_metrics.evaluate("multiple_choice_strict", conv_data, scope) + diff --git a/src/lightspeed_evaluation/core/metrics/custom/mmlu_style_eval.py b/src/lightspeed_evaluation/core/metrics/custom/mmlu_style_eval.py new file mode 100644 index 0000000..4cd1506 --- /dev/null +++ b/src/lightspeed_evaluation/core/metrics/custom/mmlu_style_eval.py @@ -0,0 +1,215 @@ +"""MMLU-style multiple choice evaluation metrics.""" + +import re +from typing import Any, Optional + +from lightspeed_evaluation.core.models import EvaluationScope, TurnData + + +class MultipleChoiceExactMatch: # pylint: disable=too-few-public-methods + """Exact match metric for multiple choice questions (MMLU-style scoring). + + Returns 1.0 for correct answer, 0.0 for incorrect. + """ + + def __init__(self, threshold: float = 1.0) -> None: + """Initialize metric. + + Args: + threshold: Score threshold for passing (default: 1.0, meaning must be exact). + """ + self.threshold = threshold + + def evaluate( # pylint: disable=unused-argument + self, response: str, expected_response: str, **kwargs: Any + ) -> dict[str, Any]: + """Evaluate if the AI response matches the expected answer. + + Args: + response: The AI's generated response. + expected_response: The correct answer (e.g., "A", "B", "C", or "D"). + **kwargs: Additional arguments (ignored). + + Returns: + Dict with 'score' (1.0 or 0.0) and 'reason' (explanation). + """ + # Clean inputs + response_clean = response.strip().upper() + expected_clean = expected_response.strip().upper() + + # Extract letter from response using regex + # Handles cases like: + # - "B" + # - "The answer is B" + # - "B) Code can survive..." + # - "I think B is correct" + letter_match = re.search(r"\b([ABCD])\b", response_clean) + + if letter_match: + response_letter = letter_match.group(1) + else: + # No clear letter found, try first character + response_letter = response_clean[0] if response_clean else "" + + # Compare + is_correct = response_letter == expected_clean + score = 1.0 if is_correct else 0.0 + + # Build explanation + reason = ( + f"Expected: {expected_clean} | " + f"Extracted: {response_letter} | " + f"Result: {'✓ CORRECT' if is_correct else '✗ INCORRECT'} | " + f"Full response: '{response[:100]}...'" + if len(response) > 100 + else f"Full response: '{response}'" + ) + + return {"score": score, "reason": reason} + + +class MultipleChoiceStrictMatch: # pylint: disable=too-few-public-methods + """Stricter version requiring response to be exactly A, B, C, or D.""" + + def __init__(self, threshold: float = 1.0) -> None: + """Initialize metric. + + Args: + threshold: Score threshold for passing (default: 1.0). + """ + self.threshold = threshold + + def evaluate( # pylint: disable=unused-argument + self, response: str, expected_response: str, **kwargs: Any + ) -> dict[str, Any]: + """Evaluate if response exactly matches expected answer. + + Args: + response: The AI's generated response. + expected_response: The correct answer (single letter). + **kwargs: Additional arguments (ignored). + + Returns: + Dict with 'score' (1.0 or 0.0) and 'reason' (explanation). 
+ """ + response_clean = response.strip().upper() + expected_clean = expected_response.strip().upper() + + # Must be exactly one letter + is_correct = response_clean == expected_clean and len(response_clean) == 1 + score = 1.0 if is_correct else 0.0 + + return { + "score": score, + "reason": f"Expected exactly '{expected_clean}', got '{response_clean}'", + } + + +class MMLUMetrics: # pylint: disable=too-few-public-methods + """Custom MMLU-style metrics integrated with the evaluation framework.""" + + def __init__(self) -> None: + """Initialize MMLU metrics.""" + self.exact_match = MultipleChoiceExactMatch() + self.strict_match = MultipleChoiceStrictMatch() + + self.supported_metrics = { + "multiple_choice_exact": self._evaluate_exact_match, + "multiple_choice_strict": self._evaluate_strict_match, + } + + def evaluate( + self, + metric_name: str, + conv_data: Any, + scope: EvaluationScope, + ) -> tuple[Optional[float], str]: + """Evaluate an MMLU-style metric. + + Args: + metric_name: Name of the metric to evaluate. + conv_data: Conversation data (unused for MMLU metrics). + scope: Evaluation scope containing turn data. + + Returns: + Tuple of (score, reason) where score is between 0.0 and 1.0. + """ + if metric_name not in self.supported_metrics: + return None, f"Unsupported MMLU metric: {metric_name}" + + try: + return self.supported_metrics[metric_name]( + conv_data, scope.turn_idx, scope.turn_data, scope.is_conversation + ) + except (ValueError, AttributeError, KeyError) as e: + return None, f"MMLU {metric_name} evaluation failed: {str(e)}" + + def _evaluate_exact_match( + self, + _conv_data: Any, + _turn_idx: Optional[int], + turn_data: Optional[TurnData], + is_conversation: bool, + ) -> tuple[Optional[float], str]: + """Evaluate using exact match with flexible letter extraction. + + Args: + _conv_data: Conversation data (unused). + _turn_idx: Turn index (unused). + turn_data: Turn data containing response and expected response. + is_conversation: Whether this is conversation-level evaluation. + + Returns: + Tuple of (score, reason). + """ + if is_conversation: + return None, "MMLU exact match is a turn-level metric" + + if turn_data is None: + return None, "TurnData is required for MMLU evaluation" + + if not turn_data.response: + return None, "Response is required for MMLU evaluation" + + if not turn_data.expected_response: + return None, "Expected response is required for MMLU evaluation" + + result = self.exact_match.evaluate( + turn_data.response, turn_data.expected_response + ) + return result["score"], result["reason"] + + def _evaluate_strict_match( + self, + _conv_data: Any, + _turn_idx: Optional[int], + turn_data: Optional[TurnData], + is_conversation: bool, + ) -> tuple[Optional[float], str]: + """Evaluate using strict exact match (single letter only). + + Args: + _conv_data: Conversation data (unused). + _turn_idx: Turn index (unused). + turn_data: Turn data containing response and expected response. + is_conversation: Whether this is conversation-level evaluation. + + Returns: + Tuple of (score, reason). 
+ """ + if is_conversation: + return None, "MMLU strict match is a turn-level metric" + + if turn_data is None: + return None, "TurnData is required for MMLU evaluation" + + if not turn_data.response: + return None, "Response is required for MMLU evaluation" + + if not turn_data.expected_response: + return None, "Expected response is required for MMLU evaluation" + + result = self.strict_match.evaluate( + turn_data.response, turn_data.expected_response + ) + return result["score"], result["reason"]