-
Notifications
You must be signed in to change notification settings - Fork 42
/
Copy pathscoring.py
168 lines (132 loc) · 5.55 KB
/
scoring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# coding=utf-8
# Copyright 2020 The Google Research Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python2, python3
"""Library for scoring and evaluation of text samples.
Aggregation functions use bootstrap resampling to compute confidence intervals
as per the original ROUGE perl implementation.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import abc
import collections
import numpy as np
import six
from six.moves import range
class Score(
collections.namedtuple("Score", ["precision", "recall", "fmeasure"])):
"""Tuple containing precision, recall, and f-measure values."""
class BaseScorer(object):
"""Base class for Scorer objects."""
@abc.abstractmethod
def score(self, target, prediction):
"""Calculates score between the target and prediction.
Args:
target: Text containing the target (ground truth) text.
prediction: Text containing the predicted text.
Returns:
A dict mapping each score_type (string) to Score object.
"""
class AggregateScore(
collections.namedtuple("AggregateScore", ["low", "mid", "high"])):
"""Tuple containing confidence intervals for scores."""
class BootstrapAggregator(object):
"""Aggregates scores to provide confidence intervals.
Sample usage:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'])
aggregator = Aggregator()
aggregator.add_scores(scorer.score("one two three", "one two"))
aggregator.add_scores(scorer.score("one two five six", "seven eight"))
result = aggregator.aggregate()
print result
{'rougeL': AggregateScore(
low=Score(precision=0.0, recall=0.0, fmeasure=0.0),
mid=Score(precision=0.5, recall=0.33, fmeasure=0.40),
high=Score(precision=1.0, recall=0.66, fmeasure=0.80)),
'rouge1': AggregateScore(
low=Score(precision=0.0, recall=0.0, fmeasure=0.0),
mid=Score(precision=0.5, recall=0.33, fmeasure=0.40),
high=Score(precision=1.0, recall=0.66, fmeasure=0.80))}
"""
def __init__(self, confidence_interval=0.95, n_samples=1000):
"""Initializes a BootstrapAggregator object.
Args:
confidence_interval: Confidence interval to compute on the mean as a
decimal.
n_samples: Number of samples to use for bootstrap resampling.
Raises:
ValueError: If invalid argument is given.
"""
if confidence_interval < 0 or confidence_interval > 1:
raise ValueError("confidence_interval must be in range [0, 1]")
if n_samples <= 0:
raise ValueError("n_samples must be positive")
self._n_samples = n_samples
self._confidence_interval = confidence_interval
self._scores = collections.defaultdict(list)
def add_scores(self, scores):
"""Adds a sample for future aggregation.
Args:
scores: Dict mapping score_type strings to a namedtuple object/class
representing a score.
"""
for score_type, score in six.iteritems(scores):
self._scores[score_type].append(score)
def aggregate(self):
"""Aggregates scores previously added using add_scores.
Returns:
A dict mapping score_type to AggregateScore objects.
"""
result = {}
for score_type, scores in six.iteritems(self._scores):
# Stack scores into a 2-d matrix of (sample, measure).
score_matrix = np.vstack(tuple(scores))
# Percentiles are returned as (interval, measure).
percentiles = self._bootstrap_resample(score_matrix)
# Extract the three intervals (low, mid, high).
intervals = tuple(
(scores[0].__class__(*percentiles[j, :]) for j in range(3)))
result[score_type] = AggregateScore(
low=intervals[0], mid=intervals[1], high=intervals[2])
return result
def _bootstrap_resample(self, matrix):
"""Performs bootstrap resampling on a matrix of scores.
Args:
matrix: A 2-d matrix of (sample, measure).
Returns:
A 2-d matrix of (bounds, measure). There are three bounds: low (row 0),
mid (row 1) and high (row 2). Mid is always the mean, while low and high
bounds are specified by self._confidence_interval (which defaults to 0.95
meaning it will return the 2.5th and 97.5th percentiles for a 95%
confidence interval on the mean).
"""
# Matrix of (bootstrap sample, measure).
sample_mean = np.zeros((self._n_samples, matrix.shape[1]))
for i in range(self._n_samples):
sample_idx = np.random.choice(
np.arange(matrix.shape[0]), size=matrix.shape[0])
sample = matrix[sample_idx, :]
sample_mean[i, :] = np.mean(sample, axis=0)
# Take percentiles on the estimate of the mean using bootstrap samples.
# Final result is a (bounds, measure) matrix.
percentile_delta = (1 - self._confidence_interval) / 2
q = 100 * np.array([percentile_delta, 0.5, 1 - percentile_delta])
return np.percentile(sample_mean, q, axis=0)
def fmeasure(precision, recall):
"""Computes f-measure given precision and recall values."""
if precision + recall > 0:
return 2 * precision * recall / (precision + recall)
else:
return 0.0