Move dsp/metrics into evaluate/metrics #8402

Merged: 4 commits, Jun 18, 2025
1 change: 0 additions & 1 deletion dspy/dsp/utils/__init__.py
@@ -1,4 +1,3 @@
from dspy.dsp.utils.dpr import *
from dspy.dsp.utils.metrics import *
from dspy.dsp.utils.settings import *
from dspy.dsp.utils.utils import *
113 changes: 0 additions & 113 deletions dspy/dsp/utils/metrics.py

This file was deleted.

3 changes: 1 addition & 2 deletions dspy/evaluate/__init__.py
@@ -1,7 +1,6 @@
from dspy.dsp.utils import EM, normalize_text
from dspy.evaluate.auto_evaluation import CompleteAndGrounded, SemanticF1
from dspy.evaluate.evaluate import Evaluate
from dspy.evaluate.metrics import answer_exact_match, answer_passage_match
from dspy.evaluate.metrics import EM, answer_exact_match, answer_passage_match, normalize_text

__all__ = [
"EM",
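For downstream code, the practical effect of this move is the import path. A minimal before/after sketch; the behavior noted in the comments is an assumption based on how these metrics are commonly used, not taken from this diff:

```python
# Old location (removed by this PR):
#   from dspy.dsp.utils import EM, normalize_text

# New location: the helpers live in dspy.evaluate.metrics and are re-exported
# from dspy.evaluate.
from dspy.evaluate.metrics import EM, answer_exact_match, answer_passage_match, normalize_text

# Assumed behavior, for illustration only:
normalize_text("  The Quick,  Brown Fox! ")  # lowercase, strip punctuation/articles, collapse whitespace
EM("Paris", ["paris", "Paris, France"])      # True if the prediction exactly matches any gold answer
```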
76 changes: 39 additions & 37 deletions dspy/evaluate/auto_evaluation.py
@@ -1,48 +1,50 @@
import dspy
from dspy.predict.chain_of_thought import ChainOfThought
Collaborator (author) commented: The change here is for avoiding circular imports.

from dspy.primitives import Module
from dspy.signatures import InputField, OutputField, Signature


class SemanticRecallPrecision(dspy.Signature):
class SemanticRecallPrecision(Signature):
"""
Compare a system's response to the ground truth to compute its recall and precision.
If asked to reason, enumerate key ideas in each response, and whether they are present in the other response.
"""

question: str = dspy.InputField()
ground_truth: str = dspy.InputField()
system_response: str = dspy.InputField()
recall: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")
question: str = InputField()
ground_truth: str = InputField()
system_response: str = InputField()
recall: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
precision: float = OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")


class DecompositionalSemanticRecallPrecision(dspy.Signature):
class DecompositionalSemanticRecallPrecision(Signature):
"""
Compare a system's response to the ground truth to compute recall and precision of key ideas.
You will first enumerate key ideas in each response, discuss their overlap, and then report recall and precision.
"""

question: str = dspy.InputField()
ground_truth: str = dspy.InputField()
system_response: str = dspy.InputField()
ground_truth_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the ground truth")
system_response_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the system response")
discussion: str = dspy.OutputField(desc="discussion of the overlap between ground truth and system response")
recall: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
precision: float = dspy.OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")
question: str = InputField()
ground_truth: str = InputField()
system_response: str = InputField()
ground_truth_key_ideas: str = OutputField(desc="enumeration of key ideas in the ground truth")
system_response_key_ideas: str = OutputField(desc="enumeration of key ideas in the system response")
discussion: str = OutputField(desc="discussion of the overlap between ground truth and system response")
recall: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
precision: float = OutputField(desc="fraction (out of 1.0) of system response covered by the ground truth")


def f1_score(precision, recall):
precision, recall = max(0.0, min(1.0, precision)), max(0.0, min(1.0, recall))
return 0.0 if precision + recall == 0 else 2 * (precision * recall) / (precision + recall)


class SemanticF1(dspy.Module):
class SemanticF1(Module):
def __init__(self, threshold=0.66, decompositional=False):
self.threshold = threshold

if decompositional:
self.module = dspy.ChainOfThought(DecompositionalSemanticRecallPrecision)
self.module = ChainOfThought(DecompositionalSemanticRecallPrecision)
else:
self.module = dspy.ChainOfThought(SemanticRecallPrecision)
self.module = ChainOfThought(SemanticRecallPrecision)

def forward(self, example, pred, trace=None):
scores = self.module(question=example.question, ground_truth=example.response, system_response=pred.response)
@@ -55,42 +57,42 @@ def forward(self, example, pred, trace=None):
###########


class AnswerCompleteness(dspy.Signature):
class AnswerCompleteness(Signature):
"""
Estimate the completeness of a system's responses, against the ground truth.
You will first enumerate key ideas in each response, discuss their overlap, and then report completeness.
"""

question: str = dspy.InputField()
ground_truth: str = dspy.InputField()
system_response: str = dspy.InputField()
ground_truth_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the ground truth")
system_response_key_ideas: str = dspy.OutputField(desc="enumeration of key ideas in the system response")
discussion: str = dspy.OutputField(desc="discussion of the overlap between ground truth and system response")
completeness: float = dspy.OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")
question: str = InputField()
ground_truth: str = InputField()
system_response: str = InputField()
ground_truth_key_ideas: str = OutputField(desc="enumeration of key ideas in the ground truth")
system_response_key_ideas: str = OutputField(desc="enumeration of key ideas in the system response")
discussion: str = OutputField(desc="discussion of the overlap between ground truth and system response")
completeness: float = OutputField(desc="fraction (out of 1.0) of ground truth covered by the system response")



class AnswerGroundedness(dspy.Signature):
class AnswerGroundedness(Signature):
"""
Estimate the groundedness of a system's responses, against real retrieved documents written by people.
You will first enumerate whatever non-trivial or check-worthy claims are made in the system response, and then
discuss the extent to which some or all of them can be deduced from the retrieved context and basic commonsense.
"""

question: str = dspy.InputField()
retrieved_context: str = dspy.InputField()
system_response: str = dspy.InputField()
system_response_claims: str = dspy.OutputField(desc="enumeration of non-trivial or check-worthy claims in the system response")
discussion: str = dspy.OutputField(desc="discussion of how supported the claims are by the retrieved context")
groundedness: float = dspy.OutputField(desc="fraction (out of 1.0) of system response supported by the retrieved context")
question: str = InputField()
retrieved_context: str = InputField()
system_response: str = InputField()
system_response_claims: str = OutputField(desc="enumeration of non-trivial or check-worthy claims in the system response")
discussion: str = OutputField(desc="discussion of how supported the claims are by the retrieved context")
groundedness: float = OutputField(desc="fraction (out of 1.0) of system response supported by the retrieved context")


class CompleteAndGrounded(dspy.Module):
class CompleteAndGrounded(Module):
def __init__(self, threshold=0.66):
self.threshold = threshold
self.completeness_module = dspy.ChainOfThought(AnswerCompleteness)
self.groundedness_module = dspy.ChainOfThought(AnswerGroundedness)
self.completeness_module = ChainOfThought(AnswerCompleteness)
self.groundedness_module = ChainOfThought(AnswerGroundedness)

def forward(self, example, pred, trace=None):
completeness = self.completeness_module(question=example.question, ground_truth=example.response, system_response=pred.response)
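The author's comment above ("avoiding circular imports") refers to the pattern applied throughout this file: Signature, InputField, OutputField, Module, and ChainOfThought are now imported from their defining submodules rather than through the top-level dspy package, so importing dspy.evaluate no longer requires dspy/__init__.py to have finished executing. A hypothetical two-module sketch of the failure mode being avoided (the package and names are made up, not DSPy's actual layout):

```python
# pkg/__init__.py
from pkg.evaluate import Grader   # 1) __init__ starts by importing pkg.evaluate ...
from pkg.core import Base         # 2) ... and only afterwards binds Base

# pkg/evaluate.py
import pkg                        # returns the *partially initialized* pkg module

class Grader(pkg.Base):           # AttributeError: 'Base' is not bound on pkg yet,
    ...                           # because step 2 above has not run

# The style adopted in this diff sidesteps the cycle by depending on the
# defining submodule directly:
#   from pkg.core import Base     # resolved without waiting on pkg/__init__.py
```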
2 changes: 1 addition & 1 deletion dspy/evaluate/evaluate.py
@@ -200,7 +200,7 @@ def process_item(example):
return round(100 * ncorrect / ntotal, 2)

def _construct_result_table(
self, results: list[Tuple[dspy.Example, dspy.Example, Any]], metric_name: str
self, results: list[Tuple["dspy.Example", "dspy.Example", Any]], metric_name: str
) -> "pd.DataFrame":
"""
Construct a pandas DataFrame from the specified result list.
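The one-line change to evaluate.py quotes the dspy.Example annotations, turning them into string forward references (the pd.DataFrame return annotation was already one). Type checkers still resolve them, but the interpreter never evaluates them, so the annotation no longer requires the dspy package to already expose Example when this module is imported, presumably the same circular-import concern noted above. A generic sketch with a hypothetical function name:

```python
from typing import Any, Tuple

# Because the annotations are strings, neither dspy nor pandas has to be
# importable (or fully initialized) for this definition to execute; static
# type checkers resolve the names later.
def summarize(results: list[Tuple["dspy.Example", "dspy.Example", Any]]) -> "pd.DataFrame":
    ...
```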