euroeval.metrics.ifeval.metric
[docs]
module
euroeval.metrics.ifeval.metric
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""IFEval instruction-following metric."""
import collections.abc as c
import logging
import typing as t
import nltk
from ...logging_utils import log_once
from ..base import Metric
from .constraints import ALL_CONSTRAINTS
if t.TYPE_CHECKING:
from datasets.arrow_dataset import Dataset
from ..data_models import BenchmarkConfig, DatasetConfig
logger = logging.getLogger(__name__)
class IFEvalInstructionAccuracy(Metric):
    """Instruction-level accuracy metric following the IFEval methodology.

    Each supported instruction attached to a sample is checked on its own
    against the model's response. The score is the share of all checked
    instructions, pooled across every sample, that the responses followed.
    """

    def __init__(self) -> None:
        """Initialise the metric."""
        # The NLTK resource is fetched lazily, on the first call to the metric.
        self.downloaded_nltk = False
        super().__init__(
            name="instruction_accuracy",
            pretty_name="Instruction Accuracy",
            postprocessing_fn=None,
        )

    def __call__(
        self,
        predictions: c.Sequence,
        references: c.Sequence,
        dataset: "Dataset",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> float | None:
        """Compute the instruction-level accuracy over all samples.

        Args:
            predictions:
                The model's predictions. Each one is converted to a string
                before being checked.
            references:
                The reference data. Each entry is indexed with the keys
                "instruction_id_list" and "kwargs", which are iterated in
                parallel.
            dataset:
                The dataset. Not used by this metric.
            dataset_config:
                The dataset configuration. Not used by this metric.
            benchmark_config:
                The benchmark configuration. Not used by this metric.

        Returns:
            The fraction of checked instructions that were followed, or 0.0 if
            no instruction was checked at all.
        """
        # One-time download of the "punkt_tab" NLTK resource (presumably needed
        # by the constraint functions - they are defined outside this module).
        if not self.downloaded_nltk:
            nltk.download("punkt_tab", quiet=True)
            self.downloaded_nltk = True

        outcomes: list[bool] = []
        for prediction, reference in zip(predictions, references):
            response = str(prediction)

            # A blank response cannot follow any instruction, so every supported
            # instruction of the sample is recorded as failed without running
            # its constraint function.
            if not response.strip():
                outcomes.extend(
                    False
                    for instruction_id in reference["instruction_id_list"]
                    if instruction_id in ALL_CONSTRAINTS
                )
                continue

            instruction_pairs = zip(
                reference["instruction_id_list"], reference["kwargs"]
            )
            for instruction_id, constraint_kwargs in instruction_pairs:
                if instruction_id in ALL_CONSTRAINTS:
                    check = ALL_CONSTRAINTS[instruction_id]
                    outcomes.append(check(response, **constraint_kwargs))
                else:
                    # Unsupported instructions do not count towards the score.
                    log_once(
                        f"Skipping unsupported instruction: {instruction_id}",
                        level=logging.WARNING,
                    )

        if not outcomes:
            return 0.0
        return sum(outcomes) / len(outcomes)
# Module-level instance of the IFEval instruction accuracy metric, created once
# when this module is imported.
instruction_accuracy = IFEvalInstructionAccuracy()
|