diff --git a/src/ragas/exceptions.py b/src/ragas/exceptions.py index 75688760a..28ee067eb 100644 --- a/src/ragas/exceptions.py +++ b/src/ragas/exceptions.py @@ -26,8 +26,21 @@ class RagasOutputParserException(RagasException): Exception raised when the output parser fails to parse the output. """ - def __init__(self): - msg = "The output parser failed to parse the output including retries." + def __init__(self, details: str = None): + base_msg = "The output parser failed to parse the output including retries." + if details: + msg = f"{base_msg} Details: {details}" + else: + msg = base_msg + + # Add suggestions for local LLMs + msg += "\nFor local LLMs, consider the following:\n" \ + "1. Increase the timeout in RunConfig (default is now 300 seconds)\n" \ + "2. Use a more capable local model that can better follow JSON formatting instructions\n" \ + "3. Reduce batch size to process fewer examples at once\n" \ + "4. For metrics that require structured output (context_recall, faithfulness, context_precision), " \ + "consider using simpler metrics like answer_correctness and answer_similarity if the issues persist" + super().__init__(msg) diff --git a/src/ragas/prompt/pydantic_prompt.py b/src/ragas/prompt/pydantic_prompt.py index 7d424c8c2..f19669ee3 100644 --- a/src/ragas/prompt/pydantic_prompt.py +++ b/src/ragas/prompt/pydantic_prompt.py @@ -395,19 +395,33 @@ async def parse_output_string( prompt_value: PromptValue, llm: BaseRagasLLM, callbacks: Callbacks, - retries_left: int = 1, + retries_left: int = 3, # Increased default retries from 1 to 3 ) -> OutputModel: + import json + import logging + + logger = logging.getLogger(__name__) callbacks = callbacks or [] + try: + # First attempt: Extract and parse JSON directly jsonstr = extract_json(output_string) - result = super().parse(jsonstr) - except OutputParserException: - if retries_left != 0: + try: + result = super().parse(jsonstr) + return result + except (OutputParserException, json.JSONDecodeError) as e: + logger.debug(f"Initial parsing failed: {str(e)}") + # Continue to retry logic + + # If we're here, the first attempt failed + if retries_left > 0: retry_rm, retry_cb = new_group( name="fix_output_format", inputs={"output_string": output_string}, callbacks=callbacks, ) + + # Use the fix_output_format prompt to try to fix the output fixed_output_string = await fix_output_format_prompt.generate( llm=llm, data=OutputStringAndPrompt( @@ -418,10 +432,72 @@ async def parse_output_string( retries_left=retries_left - 1, ) retry_rm.on_chain_end({"fixed_output_string": fixed_output_string}) - result = super().parse(fixed_output_string.text) - else: - raise RagasOutputParserException() - return result + + # Try to parse the fixed output + try: + fixed_jsonstr = extract_json(fixed_output_string.text) + result = super().parse(fixed_jsonstr) + return result + except (OutputParserException, json.JSONDecodeError) as e: + logger.debug(f"Parsing fixed output failed: {str(e)}") + + # Last resort: Try to manually construct a valid JSON + # This is especially helpful for local LLMs that might not follow the exact format + try: + # Get the expected schema + schema = self.pydantic_object.model_json_schema() + required_fields = schema.get("required", []) + properties = schema.get("properties", {}) + + # Create a minimal valid JSON with default values + minimal_json = {} + for field in required_fields: + field_type = properties.get(field, {}).get("type") + if field_type == "string": + minimal_json[field] = "Unable to parse" + elif field_type == "integer": + minimal_json[field] = 0 + elif field_type == "number": + minimal_json[field] = 0.0 + elif field_type == "boolean": + minimal_json[field] = False + elif field_type == "array": + minimal_json[field] = [] + elif field_type == "object": + minimal_json[field] = {} + + # Try to parse with this minimal JSON + if minimal_json: + logger.warning(f"Using fallback minimal JSON: {minimal_json}") + result = self.pydantic_object.model_validate(minimal_json) + return result + except Exception as e: + logger.debug(f"Minimal JSON fallback failed: {str(e)}") + + # If we've exhausted all retries and approaches + if retries_left > 1: + # Recursive call with one less retry + return await self.parse_output_string( + output_string=fixed_output_string.text, + prompt_value=prompt_value, + llm=llm, + callbacks=callbacks, + retries_left=retries_left - 1, + ) + + # If all attempts fail + raise RagasOutputParserException(details=f"Failed after {3-retries_left+1} attempts with output: {output_string[:100]}...") + except Exception as e: + if retries_left > 0: + logger.warning(f"Unexpected error during parsing: {str(e)}. Retrying...") + return await self.parse_output_string( + output_string=output_string, + prompt_value=prompt_value, + llm=llm, + callbacks=callbacks, + retries_left=retries_left - 1, + ) + raise RagasOutputParserException(details=f"Unexpected error: {str(e)}") # Ragas Adaptation diff --git a/src/ragas/prompt/utils.py b/src/ragas/prompt/utils.py index 4019940ab..2954a8457 100644 --- a/src/ragas/prompt/utils.py +++ b/src/ragas/prompt/utils.py @@ -68,13 +68,22 @@ def replace_string(s: str) -> str: def extract_json(text: str) -> str: """Identify json from a text blob by matching '[]' or '{}'. + Enhanced to handle various LLM output formats, including markdown code blocks, + single quotes, and other common formatting issues. Warning: This will identify the first json structure!""" - - # check for markdown indicator; if present, start there - md_json_idx = text.find("```json") - if md_json_idx != -1: - text = text[md_json_idx:] + import re + + # Check for any markdown code block (not just json-specific) + code_block_pattern = r"```(?:json)?\s*([\s\S]*?)```" + code_blocks = re.findall(code_block_pattern, text) + + if code_blocks: + # Use the first code block that contains valid JSON markers + for block in code_blocks: + if ("{" in block and "}" in block) or ("[" in block and "]" in block): + text = block + break # search for json delimiter pairs left_bracket_idx = text.find("[") @@ -101,6 +110,17 @@ def extract_json(text: str) -> str: # When count returns to zero, we've found a complete structure if count == 0: - return text[start_idx : i + 1] + json_str = text[start_idx : i + 1] + + # Clean up common issues with JSON formatting from LLMs + # Replace single quotes with double quotes if needed + if "'" in json_str and '"' not in json_str: + json_str = json_str.replace("'", '"') + + # Fix trailing commas in lists or objects which are invalid in JSON + json_str = re.sub(r',\s*}', '}', json_str) + json_str = re.sub(r',\s*]', ']', json_str) + + return json_str return text # In case of unbalanced JSON, return the original text diff --git a/src/ragas/run_config.py b/src/ragas/run_config.py index 124e78e78..23eec20bb 100644 --- a/src/ragas/run_config.py +++ b/src/ragas/run_config.py @@ -23,7 +23,8 @@ class RunConfig: Parameters ---------- timeout : int, optional - Maximum time (in seconds) to wait for a single operation, by default 180. + Maximum time (in seconds) to wait for a single operation, by default 300. + For local LLMs, a higher timeout may be needed. max_retries : int, optional Maximum number of retry attempts, by default 10. max_wait : int, optional @@ -48,7 +49,7 @@ class RunConfig: number generator using the specified seed. """ - timeout: int = 180 + timeout: int = 300 # Increased from 180 to 300 to accommodate slower local LLMs max_retries: int = 10 max_wait: int = 60 max_workers: int = 16 diff --git a/tests/unit/prompt/test_output_parsing.py b/tests/unit/prompt/test_output_parsing.py new file mode 100644 index 000000000..f5fc7965a --- /dev/null +++ b/tests/unit/prompt/test_output_parsing.py @@ -0,0 +1,106 @@ +import pytest +from pydantic import BaseModel, Field +import json + +from ragas.prompt.utils import extract_json +from ragas.prompt.pydantic_prompt import RagasOutputParser +from langchain_core.output_parsers import PydanticOutputParser +from langchain_core.prompt_values import StringPromptValue + + +class TestModel(BaseModel): + """Test model for output parsing tests.""" + field1: str = Field(description="A string field") + field2: int = Field(description="An integer field") + + +class TestExtractJson: + """Test the extract_json function with various input formats.""" + + def test_standard_json(self): + """Test with standard JSON format.""" + text = '{"field1": "value1", "field2": 42}' + result = extract_json(text) + assert result == text + # Verify it's valid JSON + parsed = json.loads(result) + assert parsed["field1"] == "value1" + assert parsed["field2"] == 42 + + def test_json_in_markdown(self): + """Test with JSON in markdown code block.""" + text = """ + Here's the JSON: + ```json + {"field1": "value1", "field2": 42} + ``` + """ + result = extract_json(text) + # Verify it's valid JSON + parsed = json.loads(result) + assert parsed["field1"] == "value1" + assert parsed["field2"] == 42 + + def test_json_with_single_quotes(self): + """Test with JSON using single quotes instead of double quotes.""" + text = "{'field1': 'value1', 'field2': 42}" + result = extract_json(text) + # Verify it's valid JSON after conversion + parsed = json.loads(result) + assert parsed["field1"] == "value1" + assert parsed["field2"] == 42 + + def test_json_with_trailing_comma(self): + """Test with JSON containing trailing commas (invalid JSON but common in LLM outputs).""" + text = '{"field1": "value1", "field2": 42,}' + result = extract_json(text) + # Verify it's valid JSON after fixing + parsed = json.loads(result) + assert parsed["field1"] == "value1" + assert parsed["field2"] == 42 + + def test_json_in_text(self): + """Test with JSON embedded in text.""" + text = """ + The answer is as follows: + {"field1": "value1", "field2": 42} + Hope this helps! + """ + result = extract_json(text) + # Verify it's valid JSON + parsed = json.loads(result) + assert parsed["field1"] == "value1" + assert parsed["field2"] == 42 + + +@pytest.mark.asyncio +async def test_ragas_output_parser_fallback(mocker): + """Test that RagasOutputParser can handle malformed JSON with fallback mechanism.""" + # Create a parser + parser = RagasOutputParser(pydantic_object=TestModel) + + # Mock the LLM to avoid actual calls + mock_llm = mocker.MagicMock() + mock_llm.generate = mocker.AsyncMock() + + # Test with malformed JSON that should trigger the fallback + malformed_json = "This is not JSON at all" + + # Mock the fix_output_format_prompt.generate to return something that's still not valid + mocker.patch( + "ragas.prompt.pydantic_prompt.fix_output_format_prompt.generate", + return_value=mocker.MagicMock(text="Still not valid JSON") + ) + + # The parser should use the fallback mechanism and return a valid model + result = await parser.parse_output_string( + output_string=malformed_json, + prompt_value=StringPromptValue(text="test prompt"), + llm=mock_llm, + callbacks=None, + ) + + # Verify we got a valid model with default values + assert isinstance(result, TestModel) + assert result.field1 == "Unable to parse" + assert result.field2 == 0 \ No newline at end of file diff --git a/tests/unit/test_timeout_config.py b/tests/unit/test_timeout_config.py new file mode 100644 index 000000000..d59fd028d --- /dev/null +++ b/tests/unit/test_timeout_config.py @@ -0,0 +1,61 @@ +import pytest +import asyncio +from ragas.run_config import RunConfig +from ragas.metrics.base import Metric, SingleTurnMetric +from ragas.dataset_schema import SingleTurnSample +from typing import Optional, List + + +class TestTimeoutConfig: + """Test the timeout configuration in RunConfig.""" + + def test_default_timeout(self): + """Test that the default timeout is set to 300 seconds.""" + config = RunConfig() + assert config.timeout == 300, "Default timeout should be 300 seconds" + + def test_custom_timeout(self): + """Test that a custom timeout can be set.""" + config = RunConfig(timeout=500) + assert config.timeout == 500, "Custom timeout should be respected" + + +class SlowMetric(SingleTurnMetric): + """A test metric that simulates slow processing.""" + + name = "slow_metric" + + def __init__(self, sleep_time: float = 0.1): + super().__init__() + self.sleep_time = sleep_time + + def init(self): + """Initialize the metric.""" + pass + + async def _single_turn_ascore(self, sample: SingleTurnSample, callbacks=None) -> float: + """Simulate slow processing by sleeping.""" + await asyncio.sleep(self.sleep_time) + return 1.0 + + +@pytest.mark.asyncio +async def test_metric_timeout(): + """Test that the timeout is applied to metric scoring.""" + # Create a sample + sample = SingleTurnSample( + question="Test question", + answer="Test answer", + contexts=["Test context"] + ) + + # Create a slow metric + slow_metric = SlowMetric(sleep_time=0.2) + + # Test with sufficient timeout + score = await slow_metric.single_turn_ascore(sample, timeout=0.5) + assert score == 1.0, "Metric should complete with sufficient timeout" + + # Test with insufficient timeout + with pytest.raises(asyncio.TimeoutError): + await slow_metric.single_turn_ascore(sample, timeout=0.1) \ No newline at end of file