euroeval.metrics.ifeval.metric
"""IFEval instruction-following metric."""

import collections.abc as c
import logging
import typing as t

import nltk

from ...logging_utils import log_once
from ..base import Metric
from .constraints import ALL_CONSTRAINTS

if t.TYPE_CHECKING:
    from datasets.arrow_dataset import Dataset

    from ..data_models import BenchmarkConfig, DatasetConfig

logger = logging.getLogger(__name__)


class IFEvalInstructionAccuracy(Metric):
    """Metric for instruction-level accuracy using IFEval methodology."""

    def __init__(self) -> None:
        """Initialise the metric."""
        # NLTK data is downloaded lazily on first call, not at import time.
        self.downloaded_nltk = False
        super().__init__(
            name="instruction_accuracy",
            pretty_name="Instruction Accuracy",
            postprocessing_fn=None,
        )

    def __call__(
        self,
        predictions: c.Sequence,
        references: c.Sequence,
        dataset: "Dataset",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> float | None:
        """Calculate instruction-level accuracy.

        Args:
            predictions:
                The model's predictions.
            references:
                The reference data.
            dataset:
                The dataset.
            dataset_config:
                The dataset configuration.
            benchmark_config:
                The benchmark configuration.

        Returns:
            The instruction-level accuracy, or 0.0 if no supported
            instructions were found.
        """
        if not self.downloaded_nltk:
            # Constraint checks may rely on NLTK sentence tokenisation.
            nltk.download("punkt_tab", quiet=True)
            self.downloaded_nltk = True

        all_results: list[bool] = []
        for pred, ref in zip(predictions, references):
            response = str(pred)

            # Filter down to the constraints we can actually evaluate,
            # warning (once per instruction ID) about unsupported ones.
            # Doing this up front avoids duplicating the filter in the
            # empty-response branch below.
            supported: list[tuple[str, dict]] = [
                (instruction_id, kwargs)
                for instruction_id, kwargs in zip(
                    ref["instruction_id_list"], ref["kwargs"]
                )
                if instruction_id in ALL_CONSTRAINTS
                or log_once(
                    f"Skipping unsupported instruction: {instruction_id}",
                    level=logging.WARNING,
                )
            ]
            supported = [
                pair for pair in supported if pair[0] in ALL_CONSTRAINTS
            ]

            if not response.strip():
                # An empty response cannot satisfy any instruction.
                all_results.extend([False] * len(supported))
                continue

            for instruction_id, kwargs in supported:
                constraint_function = ALL_CONSTRAINTS[instruction_id]
                is_following = constraint_function(response, **kwargs)
                all_results.append(is_following)

        return sum(all_results) / len(all_results) if all_results else 0.0


# Module-level singleton instance of the IFEval instruction-accuracy metric.
instruction_accuracy = IFEvalInstructionAccuracy()