
euroeval.metrics.base


"""The abstract base class for all metrics."""

import abc
import collections.abc as c
import logging
import typing as t

if t.TYPE_CHECKING:
    from datasets.arrow_dataset import Dataset

    from ..data_models import BenchmarkConfig, DatasetConfig

logger: logging.Logger = logging.getLogger("euroeval")


class Metric(abc.ABC):
    """Abstract base class for all metrics."""

    def __init__(
        self,
        name: str,
        pretty_name: str,
        postprocessing_fn: t.Callable[[float], tuple[float, str]] | None = None,
    ) -> None:
        """Initialise the metric.

        Args:
            name:
                The name of the metric in snake_case.
            pretty_name:
                The pretty name of the metric, used for display purposes.
            postprocessing_fn:
                A function to apply to the metric scores after they are computed,
                taking the score to the postprocessed score along with its string
                representation. Defaults to x -> (100 * x, f"{x:.2%}").
        """
        self.name = name
        self.pretty_name = pretty_name
        self.postprocessing_fn = (
            postprocessing_fn
            if postprocessing_fn is not None
            else lambda x: (100 * x, f"{x:.2%}")
        )

    def download(self, cache_dir: str) -> "Metric":
        """Initiates the download of the metric if needed.

        Args:
            cache_dir:
                The directory where the metric will be downloaded to.

        Returns:
            The metric object itself.
        """
        return self

    @abc.abstractmethod
    def __call__(
        self,
        predictions: c.Sequence,
        references: c.Sequence,
        dataset: "Dataset",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> float | None:
        """Calculate the metric score.

        Args:
            predictions:
                The model predictions.
            references:
                The ground truth references.
            dataset:
                The dataset used for evaluation. This is only used in case any
                additional metadata is used to compute the metrics.
            dataset_config:
                The dataset configuration.
            benchmark_config:
                The benchmark configuration.

        Returns:
            The calculated metric score, or None if the score should be ignored.
        """
        ...

    def __hash__(self) -> int:
        """Return a hash of the metric configuration."""
        return hash(self.name)
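

A minimal sketch of a concrete subclass, shown for illustration only. The class name, the exact-match scoring logic, and the trailing usage lines are assumptions and are not part of the euroeval.metrics.base module itself; only the Metric base class above is.

class ExactMatchMetric(Metric):
    """Illustrative metric computing the fraction of exact matches."""

    def __call__(
        self,
        predictions: c.Sequence,
        references: c.Sequence,
        dataset: "Dataset",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> float | None:
        """Calculate the proportion of predictions matching their references."""
        if not predictions:
            # Returning None signals that the score should be ignored.
            return None
        matches = sum(
            prediction == reference
            for prediction, reference in zip(predictions, references)
        )
        return matches / len(predictions)


# Example usage, relying on the default postprocessing function:
metric = ExactMatchMetric(name="exact_match", pretty_name="Exact match")
score, score_str = metric.postprocessing_fn(0.5)  # (50.0, "50.00%")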