euroeval.metrics.ifeval.metric
"""IFEval instruction-following metric."""

import collections.abc as c
import logging
import typing as t

import nltk

from ...logging_utils import log_once
from ..base import Metric
from .constraints import ALL_CONSTRAINTS

if t.TYPE_CHECKING:
    from datasets.arrow_dataset import Dataset

    from ..data_models import BenchmarkConfig, DatasetConfig

logger = logging.getLogger(__name__)


class IFEvalInstructionAccuracy(Metric):
    """Metric for instruction-level accuracy using IFEval methodology."""

    def __init__(self) -> None:
        """Initialise the metric."""
        # NLTK data is downloaded lazily on first call, not at import time.
        self.downloaded_nltk = False
        super().__init__(
            name="instruction_accuracy",
            pretty_name="Instruction Accuracy",
            postprocessing_fn=None,
        )

    def __call__(
        self,
        predictions: c.Sequence,
        references: c.Sequence,
        dataset: "Dataset",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> float | None:
        """Calculate instruction-level accuracy.

        Args:
            predictions:
                The model's predictions.
            references:
                The reference data.
            dataset:
                The dataset.
            dataset_config:
                The dataset configuration.
            benchmark_config:
                The benchmark configuration.

        Returns:
            The instruction-level accuracy, or 0.0 if no supported
            instructions were found.
        """
        if not self.downloaded_nltk:
            # Constraint checks may rely on NLTK sentence tokenisation.
            nltk.download("punkt_tab", quiet=True)
            self.downloaded_nltk = True

        all_results: list[bool] = []
        for pred, ref in zip(predictions, references):
            response = str(pred)

            # Filter down to the constraints we can actually evaluate,
            # warning (once per instruction ID) about unsupported ones.
            # Doing this up front avoids duplicating the filter in the
            # empty-response branch below.
            supported: list[tuple[str, dict]] = [
                (instruction_id, kwargs)
                for instruction_id, kwargs in zip(
                    ref["instruction_id_list"], ref["kwargs"]
                )
                if instruction_id in ALL_CONSTRAINTS
                or log_once(
                    f"Skipping unsupported instruction: {instruction_id}",
                    level=logging.WARNING,
                )
            ]
            supported = [
                pair for pair in supported if pair[0] in ALL_CONSTRAINTS
            ]

            if not response.strip():
                # An empty response cannot satisfy any instruction.
                all_results.extend([False] * len(supported))
                continue

            for instruction_id, kwargs in supported:
                constraint_function = ALL_CONSTRAINTS[instruction_id]
                is_following = constraint_function(response, **kwargs)
                all_results.append(is_following)

        return sum(all_results) / len(all_results) if all_results else 0.0


# Module-level singleton instance of the IFEval instruction-accuracy metric.
instruction_accuracy = IFEvalInstructionAccuracy()