euroeval.task_group_utils.text_to_text

[docs] module euroeval.task_group_utils.text_to_text
"""Utility functions related to the text-to-text task group."""import collections.abc as cimport loggingimport typing as timport numpy as npfrom ..constants importMETRIC_ATTRIBUTES_TAKING_UP_MEMORYfrom ..exceptions importInvalidBenchmarkfrom ..logging_utils importlogfrom ..metrics importHuggingFaceMetricfrom ..utils importraise_if_model_output_contains_nan_valuesift.TYPE_CHECKING:from datasets.arrow_dataset importDatasetfrom transformers.trainer_utils importEvalPredictionfrom ..data_models importBenchmarkConfig,DatasetConfig,GenerativeModelOutputfrom ..types importLabels,Predictionsdef compute_metrics([docs]
model_outputs_and_labels:"tuple[Predictions, Labels] | EvalPrediction",dataset_config:"DatasetConfig",benchmark_config:"BenchmarkConfig",dataset:"Dataset",)->dict[str,float]:    """Compute the metrics needed for evaluation.    Args:        model_outputs_and_labels:            The first sequence contains the model outputs and the second sequence            contains the true labels.        dataset_config:            The configuration of the dataset.        benchmark_config:            The configuration of the benchmark.        dataset:            The dataset used for evaluation. This is only used in case any additional            metadata is used to compute the metrics.    Returns:        A dictionary with the names of the metrics as keys and the metric values as        values.    Raises:        InvalidBenchmark:            If the metric computation fails.    """model_outputs,labels=model_outputs_and_labels# If the model outputs is a pair, then the first element corresponds to the model# predictionsifisinstance(model_outputs,tuple)andlen(model_outputs)==2:model_outputs=model_outputs[0]assertnotisinstance(model_outputs,tuple)raise_if_model_output_contains_nan_values(model_output=model_outputs)model_output_dtype=np.asarray(model_outputs).dtypeoutput_is_prob=model_output_dtypein[np.float16,np.float32,np.float64]ifoutput_is_prob:predictions=np.asarray(model_outputs).argmax(axis=-1)else:predictions=model_outputsresults:dict[str,float]=dict()formetricindataset_config.task.metrics:# Some metrics can be computed on hardware accelerators. In this case we# start by setting the device to the same device as the modelif(isinstance(metric,HuggingFaceMetric)andmetric.compute_kwargs.get("device",None)=="auto"):metric.compute_kwargs["device"]=benchmark_config.device.typefor_inrange(num_attempts:=5):try:score:float|None=metric(predictions=predictions,references=labels,dataset=dataset,dataset_config=dataset_config,benchmark_config=benchmark_config,)breakexceptExceptionase:oom_error=["CUDA out of memory","CUDA error","MPS backend out of memory",]ifnotany(errorinstr(e)forerrorinoom_error):raiseInvalidBenchmark(str(e))from eif(isinstance(metric,HuggingFaceMetric)andmetric.compute_kwargs.get("device","cpu")!="cpu"):metric.compute_kwargs["device"]="cpu"log("Out of memory error occurred during the computation of "f"the metric {metric.pretty_name}. Moving the computation to ""the CPU.",level=logging.DEBUG,)else:raiseInvalidBenchmark(str(e))from efinally:forattributeinMETRIC_ATTRIBUTES_TAKING_UP_MEMORY:ifhasattr(metric,attribute):log(f"Deleting the {attribute!r} attribute of the metric "f"{metric.pretty_name} to free up memory.",level=logging.DEBUG,)delattr(metric,attribute)else:raiseInvalidBenchmark(f"Could not compute the metric {metric.pretty_name} after "f"{num_attempts} attempts due to out of memory errors.")# The metric returns None if we are running on multi-GPU and the current# process is not the main processifscoreisnotNone:results[metric.name]=scorereturnresultsdef extract_labels_from_generation([docs]
input_batch:dict[str,list],model_output:"GenerativeModelOutput")->c.Sequence[t.Any]:    """Extract the predicted labels from the generated output.    Args:        input_batch:            The input batch, where the keys are the feature names and the values            are lists with the feature values.        model_output:            The raw generated output of the model.    Returns:        The predicted labels.    """returnmodel_output.sequences