euroeval.scores
90 | """Aggregation of raw scores into the mean and a confidence interval."""
import logging
import typing as t
import warnings
import numpy as np
if t.TYPE_CHECKING:
from .data_models import MetricConfig
from .types import ScoreDict
logger = logging.getLogger("euroeval")


def log_scores(
    dataset_name: str,
    metric_configs: list["MetricConfig"],
    scores: list[dict[str, float]],
    model_id: str,
) -> "ScoreDict":
    """Log the scores.

    Args:
        dataset_name:
            Name of the dataset.
        metric_configs:
            List of metrics to log.
        scores:
            The scores that are to be logged. This is a list of dictionaries full of
            scores.
        model_id:
            The full Hugging Face Hub path to the pretrained transformer model.

    Returns:
        A dictionary with keys 'raw' and 'total', with 'raw' being identical to
        `scores` and 'total' being a dictionary with the aggregated scores (means
        and standard errors).
    """
logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
total_dict: dict[str, float] = dict()
for metric_cfg in metric_configs:
test_score, test_se = aggregate_scores(scores=scores, metric_config=metric_cfg)
test_score, test_score_str = metric_cfg.postprocessing_fn(test_score)
test_se, test_se_str = metric_cfg.postprocessing_fn(test_se)
total_dict[f"test_{metric_cfg.name}"] = test_score
total_dict[f"test_{metric_cfg.name}_se"] = test_se
logger.info(f"{metric_cfg.pretty_name}: {test_score_str} ± {test_se_str}")
return dict(raw=scores, total=total_dict)


def aggregate_scores(
    scores: list[dict[str, float]], metric_config: "MetricConfig"
) -> tuple[float, float]:
    """Helper function to compute the mean with confidence intervals.

    Args:
        scores:
            A list of dictionaries with the names of the metrics as keys, of the
            form "<split>_<metric_name>", such as "val_f1", and the metric values
            as values.
        metric_config:
            The configuration of the metric, which is used to collect the correct
            metric from `scores`.

    Returns:
        A pair of floats, containing the score and the radius of its 95% confidence
        interval.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        test_scores = [
            (
                dct[metric_config.name]
                if metric_config.name in dct
                else dct[f"test_{metric_config.name}"]
            )
            for dct in scores
        ]
        test_score = np.mean(test_scores).item()
        if len(test_scores) > 1:
            sample_std = np.std(test_scores, ddof=1)
            test_se = sample_std / np.sqrt(len(test_scores))
        else:
            test_se = np.nan

    return (test_score, 1.96 * test_se)
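
To make the aggregation concrete, here is a small sketch that reproduces the computation in `aggregate_scores` by hand with numpy, using made-up scores from three evaluation iterations: the mean of the scores, the sample standard deviation (ddof=1), the standard error, and the 1.96 · SE radius of the 95% confidence interval.

import numpy as np

# Made-up metric values from three evaluation iterations.
test_scores = [0.61, 0.65, 0.63]

mean = np.mean(test_scores).item()                       # 0.63
sample_std = np.std(test_scores, ddof=1)                 # unbiased sample standard deviation
standard_error = sample_std / np.sqrt(len(test_scores))
radius = 1.96 * standard_error                           # half-width of the 95% confidence interval

print(f"{mean:.4f} ± {radius:.4f}")                      # roughly 0.6300 ± 0.0226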
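
For `log_scores`, a minimal usage sketch follows. The real `MetricConfig` is defined in `euroeval.data_models` and has more fields than shown here; the `DummyMetricConfig` stand-in is hypothetical and only mimics the three attributes that `log_scores` actually reads (`name`, `pretty_name` and `postprocessing_fn`). The dataset name, model id and score values are likewise made up for illustration.

from dataclasses import dataclass
from typing import Callable

from euroeval.scores import log_scores


@dataclass
class DummyMetricConfig:
    """Hypothetical stand-in for euroeval.data_models.MetricConfig."""

    name: str
    pretty_name: str
    postprocessing_fn: Callable[[float], tuple[float, str]]


# Report the F1-score as a percentage with two decimals.
f1_config = DummyMetricConfig(
    name="f1",
    pretty_name="F1-score",
    postprocessing_fn=lambda x: (100 * x, f"{100 * x:.2f}%"),
)

# One dictionary of raw scores per evaluation iteration (made-up numbers).
raw_scores = [{"test_f1": 0.61}, {"test_f1": 0.65}, {"test_f1": 0.63}]

result = log_scores(
    dataset_name="example-dataset",
    metric_configs=[f1_config],
    scores=raw_scores,
    model_id="example-org/example-model",
)

# result["raw"] is `raw_scores` unchanged, while result["total"] holds the
# aggregated values under "test_f1" (postprocessed mean) and "test_f1_se"
# (postprocessed radius of the 95% confidence interval).
print(result["total"])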