Move dsp/metrics into evaluate/metrics #8402

Merged: 4 commits, Jun 18, 2025
1 change: 0 additions & 1 deletion dspy/dsp/utils/__init__.py
@@ -1,4 +1,3 @@
from dspy.dsp.utils.dpr import *
from dspy.dsp.utils.metrics import *
from dspy.dsp.utils.settings import *
from dspy.dsp.utils.utils import *
113 changes: 0 additions & 113 deletions dspy/dsp/utils/metrics.py

This file was deleted.

3 changes: 1 addition & 2 deletions dspy/evaluate/__init__.py
@@ -1,7 +1,6 @@
from dspy.dsp.utils import EM, normalize_text
from dspy.evaluate.auto_evaluation import CompleteAndGrounded, SemanticF1
from dspy.evaluate.evaluate import Evaluate
from dspy.evaluate.metrics import answer_exact_match, answer_passage_match
from dspy.evaluate.metrics import EM, answer_exact_match, answer_passage_match, normalize_text

__all__ = [
"EM",
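For downstream code, the practical effect of this move is the import path. A minimal before/after sketch; the behavior noted in the comments is an assumption based on how these metrics are commonly used, not taken from this diff:

```python
# Old location (removed by this PR):
#   from dspy.dsp.utils import EM, normalize_text

# New location: the helpers live in dspy.evaluate.metrics and are re-exported
# from dspy.evaluate.
from dspy.evaluate.metrics import EM, answer_exact_match, answer_passage_match, normalize_text

# Assumed behavior, for illustration only:
normalize_text("  The Quick,  Brown Fox! ")  # lowercase, strip punctuation/articles, collapse whitespace
EM("Paris", ["paris", "Paris, France"])      # True if the prediction exactly matches any gold answer
```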
76 changes: 39 additions & 37 deletions dspy/evaluate/auto_evaluation.py
@@ -1,48 +1,50 @@
import dspy
from dspy.predict.chain_of_thought import ChainOfThought
Collaborator (author) commented: The change here is for avoiding circular imports.

from dspy.primitives import Module
from dspy.signatures import InputField, OutputField, Signature


class SemanticRecallPrecision(dspy.Signature):
class SemanticRecallPrecision(Signature):
"""
Compare a system's response to the ground truth to compute its recall and precision.
If asked to reason, enumerate key ideas in each response, and whether they are present in the other response.
"""

question: str = dspy.InputField()
ground_truth: str = dspy.InputField()
system_response: str = dspy.InputField()
recall: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")
question: str = InputField()
ground_truth: str = InputField()
system_response: str = InputField()
recall: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
precision: float = OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")


class DecompositionalSemanticRecallPrecision(dspy.Signature):
class DecompositionalSemanticRecallPrecision(Signature):
"""
Compare a system's response to the ground truth to compute recall and precision of key ideas.
You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision.
"""

question: str = dspy.InputField()
ground_truth: str = dspy.InputField()
system_response: str = dspy.InputField()
ground_truth_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the ground truth")
system_response_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the system response")
discussion: str = dspy.OutputField(desc="discussion of the overlap between ground truth and system response")
recall: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")
question: str = InputField()
ground_truth: str = InputField()
system_response: str = InputField()
ground_truth_key_ideas: str = OutputField(desc="enumeration of key ideas in the ground truth")
system_response_key_ideas: str = OutputField(desc="enumeration of key ideas in the system response")
discussion: str = OutputField(desc="discussion of the overlap between ground truth and system response")
recall: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
precision: float = OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")


def f1_score(precision, recall):
precision, recall = max(0.0, min(1.0, precision)), max(0.0, min(1.0, recall))
return 0.0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall)


class SemanticF1(dspy.Module):
class SemanticF1(Module):
def __init__(self, threshold=0.66, decompositional=False):
self.threshold = threshold

if decompositional:
self.module = dspy.ChainOfThought(DecompositionalSemanticRecallPrecision)
self.module = ChainOfThought(DecompositionalSemanticRecallPrecision)
else:
self.module = dspy.ChainOfThought(SemanticRecallPrecision)
self.module = ChainOfThought(SemanticRecallPrecision)

def forward(self, example, pred, trace=None):
scores = self.module(question=example.question, ground_truth=example.response, system_response=pred.response)
@@ -55,42 +57,42 @@ def forward(self, example, pred, trace=None):
###########


class AnswerCompleteness(dspy.Signature):
class AnswerCompleteness(Signature):
"""
Estimate the completeness of a system's responses, against the ground truth.
You will first enumerate key ideas in each response, discuss their overlap, and then report completeness.
"""

question: str = dspy.InputField()
ground_truth: str = dspy.InputField()
system_response: str = dspy.InputField()
ground_truth_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the ground truth")
system_response_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the system response")
discussion: str = dspy.OutputField(desc="discussion of the overlap between ground truth and system response")
completeness: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
question: str = InputField()
ground_truth: str = InputField()
system_response: str = InputField()
ground_truth_key_ideas: str = OutputField(desc="enumeration of key ideas in the ground truth")
system_response_key_ideas: str = OutputField(desc="enumeration of key ideas in the system response")
discussion: str = OutputField(desc="discussion of the overlap between ground truth and system response")
completeness: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")



class AnswerGroundedness(dspy.Signature):
class AnswerGroundedness(Signature):
"""
Estimate the groundedness of a system's responses, against real retrieved documents written by people.
You will first enumerate whatever non-trivial or check-worthy claims are made in the system response, and then
discuss the extent to which some or all of them can be deduced from the retrieved context and basic commonsense.
"""

question: str = dspy.InputField()
retrieved_context: str = dspy.InputField()
system_response: str = dspy.InputField()
system_response_claims: str = dspy.OutputField(desc="enumeration of non-trivial or check-worthy claims in the system response")
discussion: str = dspy.OutputField(desc="discussion of how supported the claims are by the retrieved context")
groundedness: float = dspy.OutputField(desc="fraction (out of 1.0) of system response supported by the retrieved context")
question: str = InputField()
retrieved_context: str = InputField()
system_response: str = InputField()
system_response_claims: str = OutputField(desc="enumeration of non-trivial or check-worthy claims in the system response")
discussion: str = OutputField(desc="discussion of how supported the claims are by the retrieved context")
groundedness: float = OutputField(desc="fraction (out of 1.0) of system response supported by the retrieved context")


class CompleteAndGrounded(dspy.Module):
class CompleteAndGrounded(Module):
def __init__(self, threshold=0.66):
self.threshold = threshold
self.completeness_module = dspy.ChainOfThought(AnswerCompleteness)
self.groundedness_module = dspy.ChainOfThought(AnswerGroundedness)
self.completeness_module = ChainOfThought(AnswerCompleteness)
self.groundedness_module = ChainOfThought(AnswerGroundedness)

def forward(self, example, pred, trace=None):
completeness = self.completeness_module(question=example.question, ground_truth=example.response, system_response=pred.response)
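The author's comment above ("avoiding circular imports") refers to the pattern applied throughout this file: Signature, InputField, OutputField, Module, and ChainOfThought are now imported from their defining submodules rather than through the top-level dspy package, so importing dspy.evaluate no longer requires dspy/__init__.py to have finished executing. A hypothetical two-module sketch of the failure mode being avoided (the package and names are made up, not DSPy's actual layout):

```python
# pkg/__init__.py
from pkg.evaluate import Grader   # 1) __init__ starts by importing pkg.evaluate ...
from pkg.core import Base         # 2) ... and only afterwards binds Base

# pkg/evaluate.py
import pkg                        # returns the *partially initialized* pkg module

class Grader(pkg.Base):           # AttributeError: 'Base' is not bound on pkg yet,
    ...                           # because step 2 above has not run

# The style adopted in this diff sidesteps the cycle by depending on the
# defining submodule directly:
#   from pkg.core import Base     # resolved without waiting on pkg/__init__.py
```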
2 changes: 1 addition & 1 deletion dspy/evaluate/evaluate.py
@@ -200,7 +200,7 @@ def process_item(example):
return round(100 * ncorrect / ntotal, 2)

def _construct_result_table(
self, results: list[Tuple[dspy.Example, dspy.Example, Any]], metric_name: str
self, results: list[Tuple["dspy.Example", "dspy.Example", Any]], metric_name: str
) -> "pd.DataFrame":
"""
Construct a pandas DataFrame from the specified result list.
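The one-line change to evaluate.py quotes the dspy.Example annotations, turning them into string forward references (the pd.DataFrame return annotation was already one). Type checkers still resolve them, but the interpreter never evaluates them, so the annotation no longer requires the dspy package to already expose Example when this module is imported, presumably the same circular-import concern noted above. A generic sketch with a hypothetical function name:

```python
from typing import Any, Tuple

# Because the annotations are strings, neither dspy nor pandas has to be
# importable (or fully initialized) for this definition to execute; static
# type checkers resolve the names later.
def summarize(results: list[Tuple["dspy.Example", "dspy.Example", Any]]) -> "pd.DataFrame":
    ...
```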