euroeval.metrics.llm_as_a_judge

"""Metrics based on LLM-as-a-judge."""

import collections.abc as c
import logging
import typing as t
from pathlib import Path

from pydantic import BaseModel, Field

from ..exceptions import InvalidBenchmark
from ..model_cache import ModelCache
from ..utils import extract_json_dict_from_string
from .base import Metric

if t.TYPE_CHECKING:
    from datasets.arrow_dataset import Dataset

    from ..data_models import BenchmarkConfig, DatasetConfig

logger: logging.Logger = logging.getLogger("euroeval")


class LLMAsAJudgeMetric(Metric):
    """Use an LLM to judge the quality of the predictions."""

    def __init__(
        self,
        name: str,
        pretty_name: str,
        judge_id: str,
        judge_kwargs: dict[str, t.Any],
        user_prompt: str,
        response_format: t.Type[BaseModel],
        scoring_fn: t.Callable[[BaseModel | None], float],
        condition_formatting_fn: t.Callable[[str], str] = lambda x: x,
        system_prompt: str | None = None,
    ) -> None:
        """Initialise the LLM as a judge metric.

        Args:
            name:
                The name of the metric in snake_case.
            pretty_name:
                The pretty name of the metric, used for display purposes.
            judge_id:
                The model ID of the LLM to use as a judge.
            judge_kwargs:
                Generation parameters for the judge model, such as temperature.
            user_prompt:
                The user prompt to use for the judge model. The prompt should contain
                the placeholders `{prediction}` and `{condition}`, which are replaced
                by the model prediction and a description of what the prediction
                should be judged on, respectively. If no condition is needed, the
                `{condition}` placeholder can be omitted, but the `{prediction}`
                placeholder must always be present.
            response_format:
                The response format to use for the judge model. This should be a
                Pydantic model that defines the expected structure of the judge's
                response.
            scoring_fn:
                A function that takes the judge's response and returns a score.
            condition_formatting_fn (optional):
                A function to format the condition string before it is included in the
                user prompt. Defaults to a no-op function that returns the input
                unchanged.
            system_prompt (optional):
                The system prompt to use for the judge model. If not provided, no system
                prompt will be used.
        """
        super().__init__(name=name, pretty_name=pretty_name)
        self.judge_id = judge_id
        self.judge_kwargs = judge_kwargs
        self.user_prompt = user_prompt
        self.response_format = response_format
        self.scoring_fn = scoring_fn
        self.condition_formatting_fn = condition_formatting_fn
        self.system_prompt = system_prompt

        # Add response format to the generation kwargs
        self.judge_kwargs["response_format"] = self.response_format

    def __call__(
        self,
        predictions: c.Sequence,
        references: c.Sequence,
        dataset: "Dataset",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> float | None:
        """Calculate the metric score using the judge model.

        Args:
            predictions:
                The model predictions.
            references:
                The ground truth references.
            dataset:
                The dataset used for evaluation. This is only used if additional
                metadata from the dataset is needed to compute the metric.
            dataset_config:
                The dataset configuration.
            benchmark_config:
                The benchmark configuration.

        Returns:
            The calculated metric score, or None if the score should be ignored.

        Raises:
            InvalidBenchmark:
                If the number of predictions does not match the number of references,
                or if the user prompt requires a condition but none is provided.
        """
        # Importing here to avoid circular imports
        from ..benchmark_modules import LiteLLMModel

        if not predictions or not references:
            return None
        elif len(predictions) != len(references):
            raise InvalidBenchmark(
                f"The number of predictions ({len(predictions):,}) does not match the "
                f"number of references ({len(references):,})."
            )

        # Load the judge model
        judge_model_config = LiteLLMModel.get_model_config(
            model_id=self.judge_id, benchmark_config=benchmark_config
        )
        self.judge = LiteLLMModel(
            model_config=judge_model_config,
            dataset_config=dataset_config,
            benchmark_config=benchmark_config,
            log_metadata=False,
            **self.judge_kwargs,
        )

        # Create a cache for the judge model
        judge_cache = ModelCache(
            model_cache_dir=Path(judge_model_config.model_cache_dir),
            cache_name=f"{dataset_config.name}-model-outputs.json",
            max_generated_tokens=dataset_config.max_generated_tokens,
        )
        judge_cache.load()

        # Prepare the messages for the LLM
        conversations = [
            [
                dict(
                    role="user",
                    content=self._apply_user_prompt(
                        prediction=prediction, condition=condition
                    ),
                )
            ]
            for prediction, condition in zip(predictions, references)
        ]
        if self.system_prompt:
            conversations = [
                [dict(role="system", content=self.system_prompt), *conversation]
                for conversation in conversations
            ]

        # Get the non-cached conversations and generate the completions for them
        non_cached_conversations = [
            (idx, conversation)
            for idx, conversation in enumerate(conversations)
            if conversation not in judge_cache
        ]
        if non_cached_conversations:
            model_inputs = dict(messages=[c for _, c in non_cached_conversations])
            non_cached_outputs = self.judge.generate(inputs=model_inputs)

            # Store the non-cached outputs in the cache
            judge_cache.add_to_cache(
                model_inputs=model_inputs, model_output=non_cached_outputs
            )
            judge_cache.save()

        # Load all the outputs from the cache, in the original order, and parse them
        raw_outputs = [judge_cache[conversation] for conversation in conversations]
        json_dicts = [
            extract_json_dict_from_string(s=output.sequence) for output in raw_outputs
        ]
        outputs = [
            self.response_format.model_validate(obj=json_dict)
            if json_dict is not None
            else None
            for json_dict in json_dicts
        ]

        # Calculate the scores using the scoring function
        scores = [self.scoring_fn(output) for output in outputs]
        if not scores:
            logger.warning(f"No scores were calculated for {self.pretty_name}.")
            return None
        return sum(scores) / len(scores)

    def _apply_user_prompt(self, prediction: str, condition: str | None = None) -> str:
        """Apply the user prompt to the prediction and condition.

        Args:
            prediction:
                The model prediction.
            condition (optional):
                A description of what the prediction should be judged on. If not
                provided, it will be omitted from the prompt.

        Returns:
            The formatted user prompt with the prediction and condition.

        Raises:
            InvalidBenchmark:
                If the user prompt requires a condition but none is provided.
        """
        condition_required = "{condition}" in self.user_prompt
        if condition_required and condition is None:
            raise InvalidBenchmark(
                f"The user prompt for the {self.pretty_name!r} metric requires a "
                "condition, but none was provided."
            )
        if condition is not None:
            return self.user_prompt.format(
                prediction=prediction, condition=self.condition_formatting_fn(condition)
            )
        return self.user_prompt.format(prediction=prediction)


### Fluency metric ###


class Fluency(BaseModel):
    """Response format for the fluency metric.

    Attributes:
        fluency:
            The fluency rating, an integer between 1 and 5.
    """

    fluency: t.Annotated[int, Field(ge=1, le=5)]


fluency_metric = LLMAsAJudgeMetric(
    name="fluency",
    pretty_name="Fluency",
    judge_id="gpt-5-2025-08-07",
    judge_kwargs=dict(temperature=1.0),
    user_prompt="Please rate the fluency of the following text on a scale from 1 to 5, "
    "with the following definitions:\n"
    "- 1: Very poor fluency, many grammatical errors\n"
    "- 2: Poor fluency, several grammatical errors\n"
    "- 3: Average fluency, a few grammatical errors\n"
    "- 4: Good fluency, no grammatical errors but sounds a bit off\n"
    "- 5: Excellent fluency, no grammatical errors and sounds natural\n\n"
    "Text: {prediction!r}\n\n"
    "Output your rating as a JSON object with a single key 'fluency'.",
    response_format=Fluency,
    scoring_fn=lambda output: (output.fluency - 1) / 4.0 if output is not None else 0.0,
)
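
# A minimal usage sketch (not part of the original module) showing how the scoring
# function above normalises the judge's output: a fluency rating of 1-5 is mapped
# linearly onto [0, 1], and a missing or unparsable judge response scores 0.0.
assert fluency_metric.scoring_fn(Fluency(fluency=5)) == 1.0
assert fluency_metric.scoring_fn(Fluency(fluency=3)) == 0.5
assert fluency_metric.scoring_fn(None) == 0.0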