euroeval.metrics.base
88 | """The abstract base class for all metrics."""
import abc
import collections.abc as c
import logging
import typing as t
if t.TYPE_CHECKING:
from datasets.arrow_dataset import Dataset
from ..data_models import BenchmarkConfig, DatasetConfig
logger: logging.Logger = logging.getLogger("euroeval")
class Metric(abc.ABC):
"""Abstract base class for all metrics."""
def __init__(
self,
name: str,
pretty_name: str,
postprocessing_fn: t.Callable[[float], tuple[float, str]] | None = None,
) -> None:
"""Initialise the metric.
Args:
name:
The name of the metric in snake_case.
pretty_name:
The pretty name of the metric, used for display purposes.
postprocessing_fn:
A function to apply to the metric scores after they are computed,
taking the score to the postprocessed score along with its string
representation. Defaults to x -> (100 * x, f"{x:.2%}").
"""
self.name = name
self.pretty_name = pretty_name
self.postprocessing_fn = (
postprocessing_fn
if postprocessing_fn is not None
else lambda x: (100 * x, f"{x:.2%}")
)
def download(self, cache_dir: str) -> "Metric":
"""Initiates the download of the metric if needed.
Args:
cache_dir:
The directory where the metric will be downloaded to.
Returns:
The metric object itself.
"""
return self
@abc.abstractmethod
def __call__(
self,
predictions: c.Sequence,
references: c.Sequence,
dataset: "Dataset",
dataset_config: "DatasetConfig",
benchmark_config: "BenchmarkConfig",
) -> float | None:
"""Calculate the metric score.
Args:
predictions:
The model predictions.
references:
The ground truth references.
dataset:
The dataset used for evaluation. This is only used in case any
additional metadata is used to compute the metrics.
dataset_config:
The dataset configuration.
benchmark_config:
The benchmark configuration.
Returns:
The calculated metric score, or None if the score should be ignored.
"""
...
def __hash__(self) -> int:
"""Return a hash of the metric configuration."""
return hash(self.name)
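Usage sketch (not part of the module above): a minimal, hypothetical subclass that implements the abstract __call__ method as an exact-match score. The class name ExactMatchMetric and the toy inputs are illustrative only; a real metric would typically also make use of the dataset, dataset_config and benchmark_config arguments.

import collections.abc as c
import typing as t

from euroeval.metrics.base import Metric

if t.TYPE_CHECKING:
    from datasets.arrow_dataset import Dataset

    from euroeval.data_models import BenchmarkConfig, DatasetConfig


class ExactMatchMetric(Metric):
    """Hypothetical metric: fraction of predictions equal to their references."""

    def __call__(
        self,
        predictions: c.Sequence,
        references: c.Sequence,
        dataset: "Dataset",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> float | None:
        # Return None when there is nothing to score, signalling that the
        # score should be ignored (as documented in the base class).
        if not references:
            return None
        matches = sum(pred == ref for pred, ref in zip(predictions, references))
        return matches / len(references)


# The default postprocessing_fn maps a raw score x to (100 * x, f"{x:.2%}"),
# e.g. 0.85 -> (85.0, "85.00%").
metric = ExactMatchMetric(name="exact_match", pretty_name="Exact match")
score = metric(
    predictions=["a", "b", "c"],
    references=["a", "b", "x"],
    dataset=None,  # type: ignore[arg-type]  # unused by this toy metric
    dataset_config=None,  # type: ignore[arg-type]
    benchmark_config=None,  # type: ignore[arg-type]
)
print(metric.postprocessing_fn(score))  # roughly (66.67, "66.67%")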