257 | """Metrics based on LLM-as-a-judge."""
import collections.abc as c
import logging
import typing as t
from pathlib import Path
from pydantic import BaseModel, Field
from ..exceptions import InvalidBenchmark
from ..model_cache import ModelCache
from ..utils import extract_json_dict_from_string
from .base import Metric
if t.TYPE_CHECKING:
from datasets.arrow_dataset import Dataset
from ..data_models import BenchmarkConfig, DatasetConfig
logger: logging.Logger = logging.getLogger("euroeval")
class LLMAsAJudgeMetric(Metric):
    """Use an LLM to judge the quality of the predictions."""

    def __init__(
        self,
        name: str,
        pretty_name: str,
        judge_id: str,
        judge_kwargs: dict[str, t.Any],
        user_prompt: str,
        response_format: t.Type[BaseModel],
        scoring_fn: t.Callable[[BaseModel | None], float],
        condition_formatting_fn: t.Callable[[str], str] = lambda x: x,
        system_prompt: str | None = None,
    ) -> None:
        """Initialise the LLM as a judge metric.

        Args:
            name:
                The name of the metric in snake_case.
            pretty_name:
                The pretty name of the metric, used for display purposes.
            judge_id:
                The model ID of the LLM to use as a judge.
            judge_kwargs:
                Generation parameters for the judge model, such as temperature.
            user_prompt:
                The user prompt to use for the judge model. The prompt should be
                formatted with the variables `prediction` and `condition`, to
                include the model predictions and a description of what the prediction
                should be judged on, respectively. If the condition is not needed,
                it can be omitted from the prompt, but the `prediction` variable must
                still be present.
            response_format:
                The response format to use for the judge model. This should be a
                Pydantic model that defines the expected structure of the judge's
                response.
            scoring_fn:
                A function that takes the judge's response and returns a score.
            condition_formatting_fn (optional):
                A function to format the condition string before it is included in the
                user prompt. Defaults to a no-op function that returns the input
                unchanged.
            system_prompt (optional):
                The system prompt to use for the judge model. If not provided, no system
                prompt will be used.
        """
        super().__init__(name=name, pretty_name=pretty_name)
        self.judge_id = judge_id
        self.judge_kwargs = judge_kwargs
        self.user_prompt = user_prompt
        self.response_format = response_format
        self.scoring_fn = scoring_fn
        self.condition_formatting_fn = condition_formatting_fn
        self.system_prompt = system_prompt

        # Add response format to the generation kwargs
        self.judge_kwargs["response_format"] = self.response_format
    def __call__(
        self,
        predictions: c.Sequence,
        references: c.Sequence,
        dataset: "Dataset",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> float | None:
        """Calculate the metric score using the judge model.

        Args:
            predictions:
                The model predictions.
            references:
                The ground truth references.
            dataset:
                The dataset used for evaluation. This is only used in case any
                additional metadata is used to compute the metrics.
            dataset_config:
                The dataset configuration.
            benchmark_config:
                The benchmark configuration.

        Returns:
            The calculated metric score, or None if the score should be ignored.

        Raises:
            InvalidBenchmark:
                If the number of predictions does not match the number of references,
                or if the user prompt requires a condition but none is provided.
        """
        # Importing here to avoid circular imports
        from ..benchmark_modules import LiteLLMModel

        if not predictions or not references:
            return None
        elif len(predictions) != len(references):
            raise InvalidBenchmark(
                f"The number of predictions ({len(predictions):,}) does not match the "
                f"number of references ({len(references):,})."
            )

        # Load the judge model
        judge_model_config = LiteLLMModel.get_model_config(
            model_id=self.judge_id, benchmark_config=benchmark_config
        )
        self.judge = LiteLLMModel(
            model_config=judge_model_config,
            dataset_config=dataset_config,
            benchmark_config=benchmark_config,
            log_metadata=False,
            **self.judge_kwargs,
        )

        # Create a cache for the judge model
        judge_cache = ModelCache(
            model_cache_dir=Path(judge_model_config.model_cache_dir),
            cache_name=f"{dataset_config.name}-model-outputs.json",
            max_generated_tokens=dataset_config.max_generated_tokens,
        )
        judge_cache.load()

        # Prepare the messages for the LLM
        conversations = [
            [
                dict(
                    role="user",
                    content=self._apply_user_prompt(
                        prediction=prediction, condition=condition
                    ),
                )
            ]
            for prediction, condition in zip(predictions, references)
        ]
        if self.system_prompt:
            conversations = [
                [dict(role="system", content=self.system_prompt), *conversation]
                for conversation in conversations
            ]

        # Get the non-cached conversations and generate the completions for them
        non_cached_conversations = [
            (idx, conversation)
            for idx, conversation in enumerate(conversations)
            if conversation not in judge_cache
        ]
        if non_cached_conversations:
            model_inputs = dict(messages=[c for _, c in non_cached_conversations])
            non_cached_outputs = self.judge.generate(inputs=model_inputs)

            # Store the non-cached outputs in the cache
            judge_cache.add_to_cache(
                model_inputs=model_inputs, model_output=non_cached_outputs
            )
            judge_cache.save()

        # Load all the outputs from the cache, in the original order, and parse them
        raw_outputs = [judge_cache[conversation] for conversation in conversations]
        json_dicts = [
            extract_json_dict_from_string(s=output.sequence) for output in raw_outputs
        ]
        outputs = [
            self.response_format.model_validate(obj=json_dict)
            if json_dict is not None
            else None
            for json_dict in json_dicts
        ]

        # Calculate the scores using the scoring function
        scores = [self.scoring_fn(output) for output in outputs]
        if not scores:
            logger.warning(f"No scores were calculated for {self.pretty_name}.")
            return None

        return sum(scores) / len(scores)
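    # NOTE: A sketch of how a single judge reply is turned into a score above,
    # assuming `extract_json_dict_from_string` finds the JSON object embedded in
    # the reply and using the `Fluency` response format defined further down; the
    # concrete values are illustrative:
    #
    #     raw = 'Sure! Here is my rating: {"fluency": 4}'
    #     json_dict = extract_json_dict_from_string(s=raw)  # -> {"fluency": 4}
    #     output = Fluency.model_validate(obj=json_dict)    # -> Fluency(fluency=4)
    #     score = (output.fluency - 1) / 4.0                # -> 0.75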
    def _apply_user_prompt(self, prediction: str, condition: str | None = None) -> str:
        """Apply the user prompt to the prediction and condition.

        Args:
            prediction:
                The model prediction.
            condition (optional):
                A description of what the prediction should be judged on. If not
                provided, it will be omitted from the prompt.

        Returns:
            The formatted user prompt with the prediction and condition.

        Raises:
            InvalidBenchmark:
                If the user prompt requires a condition but none is provided.
        """
        condition_required = "{condition}" in self.user_prompt
        if condition_required and condition is None:
            raise InvalidBenchmark(
                f"The user prompt for the {self.pretty_name!r} metric requires a "
                "condition, but none was provided."
            )
        if condition is not None:
            return self.user_prompt.format(
                prediction=prediction, condition=self.condition_formatting_fn(condition)
            )
        return self.user_prompt.format(prediction=prediction)
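    # NOTE: Illustration of `_apply_user_prompt` with a hypothetical prompt and
    # values: given `user_prompt="Judge {prediction!r} against this rubric:\n"
    # "{condition}"`, calling `_apply_user_prompt(prediction="Hi there",
    # condition="Be polite")` returns "Judge 'Hi there' against this rubric:\n"
    # "Be polite", while calling it without a condition raises `InvalidBenchmark`,
    # since the prompt contains the `{condition}` placeholder.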


### Fluency metric ###
class Fluency(BaseModel):
    """Response format for the fluency metric.

    Attributes:
        fluency:
            The fluency rating, an integer between 1 and 5.
    """

    fluency: t.Annotated[int, Field(ge=1, le=5)]


fluency_metric = LLMAsAJudgeMetric(
    name="fluency",
    pretty_name="Fluency",
    judge_id="gpt-5-2025-08-07",
    judge_kwargs=dict(temperature=1.0),
    user_prompt="Please rate the fluency of the following text on a scale from 1 to 5, "
    "with the following definitions:\n"
    "- 1: Very poor fluency, many grammatical errors\n"
    "- 2: Poor fluency, several grammatical errors\n"
    "- 3: Average fluency, a few grammatical errors\n"
    "- 4: Good fluency, no grammatical errors but sounds a bit off\n"
    "- 5: Excellent fluency, no grammatical errors and sounds natural\n\n"
    "Text: {prediction!r}\n\n"
    "Output your rating as a JSON object with a single key 'fluency'.",
    response_format=Fluency,
    scoring_fn=lambda output: (output.fluency - 1) / 4.0 if output is not None else 0.0,
)
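# NOTE: The scoring function above maps the judge's 1-5 fluency rating linearly
# onto [0, 1]: a rating of 1 gives (1 - 1) / 4 = 0.0, a rating of 3 gives 0.5 and
# a rating of 5 gives 1.0. If the judge's reply could not be parsed into the
# `Fluency` response format, the score defaults to 0.0.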
|