301 | """Utility functions related to the sequence-classification task group."""
import logging
import re
import typing as t

import evaluate
import Levenshtein
import numpy as np
from evaluate import EvaluationModule

from ..data_models import BenchmarkConfig, GenerativeModelOutput
from ..exceptions import InvalidBenchmark
from ..utils import log_once, raise_if_model_output_contains_nan_values

if t.TYPE_CHECKING:
from transformers.trainer_utils import EvalPrediction
from ..data_models import DatasetConfig
from ..types import Labels, Predictions

logger = logging.getLogger("euroeval")


def compute_metrics(
model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
dataset_config: "DatasetConfig",
benchmark_config: "BenchmarkConfig",
) -> dict[str, float]:
"""Compute the metrics needed for evaluation.
Args:
model_outputs_and_labels:
The first sequence contains the model outputs and the second sequence
contains the true labels.
dataset_config:
The configuration of the dataset.
benchmark_config:
The configuration of the benchmark.
Returns:
A dictionary with the names of the metrics as keys and the metric values as
values.
"""
model_outputs, labels = model_outputs_and_labels
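    # Build the label -> ID mapping by inverting the dataset's ID -> label mapping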
label2id = {label: idx for idx, label in dataset_config.id2label.items()}
    # If the model outputs are a pair, then the first element corresponds to the
    # model predictions
if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
model_outputs = model_outputs[0]
metrics = {
metric_cfg.name: (
evaluate.load(
path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
)
if metric_cfg.huggingface_id != ""
else None
)
for metric_cfg in dataset_config.task.metrics
}
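    # If the model outputs have a floating point dtype then they are logits, so we
    # take the argmax over the last axis to get the predicted class IDs; otherwise
    # they are assumed to already be predictions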
model_output_dtype = np.asarray(model_outputs).dtype
if model_output_dtype in [np.float16, np.float32, np.float64]:
predictions = np.asarray(model_outputs).argmax(axis=-1)
else:
predictions = model_outputs
assert not isinstance(model_outputs, tuple)
raise_if_model_output_contains_nan_values(model_output=model_outputs)
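    # Invert the prompt label mapping, so that we can map the localised prompt
    # labels back to the canonical (English) labels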
prompt_label_to_label_mapping = {
prompt_label: label
for label, prompt_label in dataset_config.prompt_label_mapping.items()
}
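    # Map string predictions to label IDs via the prompt label mapping; integer
    # predictions are assumed to already be label IDs and are kept as-is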
predictions = [
(
label2id[prompt_label_to_label_mapping[pred.lower()]]
if isinstance(pred, str)
else pred
)
for pred in predictions
]
label_ids = [
label2id[label.lower()] if isinstance(label, str) else label for label in labels
]
results: dict[str, float] = dict()
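    # Compute each metric, using any metric-specific keyword arguments from the
    # dataset configuration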
for cfg in dataset_config.task.metrics:
metric = metrics[cfg.name]
assert isinstance(metric, EvaluationModule)
score_dict: dict[str, float] | None = metric.compute(
predictions=predictions, references=label_ids, **cfg.compute_kwargs
)
# The metric returns None if we are running on multi-GPU and the current
# process is not the main process
if score_dict is not None:
scores = score_dict[cfg.results_key]
if isinstance(scores, list):
scores = sum(scores) / len(scores)
results[cfg.name] = scores
    return results


def extract_labels_from_generation(
input_batch: dict[str, list],
model_output: GenerativeModelOutput,
dataset_config: "DatasetConfig",
first_label_token_mapping: dict[str, str] | bool,
) -> list[str]:
"""Extract the predicted labels from the generated output.
Args:
input_batch:
The input batch, where the keys are the feature names and the values
are lists with the feature values.
model_output:
The raw generated output of the model.
dataset_config:
The configuration of the dataset.
        first_label_token_mapping:
            A mapping from labels to the first token in each label, or alternatively
            a Boolean value indicating whether the model should output scores (if a
            mapping is provided then the model will always output scores).
Returns:
The predicted labels.
"""
if model_output.scores is not None:
labels = get_closest_logprobs_labels(
generation_logprobs=model_output.scores,
dataset_config=dataset_config,
first_label_token_mapping=first_label_token_mapping,
)
if labels is not None:
return labels
return get_closest_word_edit_labels(
generated_sequences=model_output.sequences, dataset_config=dataset_config
    )


def get_closest_logprobs_labels(
generation_logprobs: list[list[list[tuple[str, float]]]],
dataset_config: "DatasetConfig",
first_label_token_mapping: dict[str, str] | bool,
) -> list[str] | None:
"""Get the labels with the highest predicted logprob value.
In case a candidate label is split into multiple tokens, we only use the first
token to compute the logprob value. E.g., if the candidate label "positive" is
tokenised as ["pos", "itive"], we only use the logprob value of "pos" to
represent the logprob value of the entire label.
Args:
generation_logprobs:
The logprobs of the generated tokens, for all samples in the batch. Of shape
(batch_size, num_tokens, num_logprobs).
dataset_config:
The configuration of the dataset.
        first_label_token_mapping:
            A mapping from labels to the first token in each label, or alternatively
            a Boolean value indicating whether the model should output scores (if a
            mapping is provided then the model will always output scores).
Returns:
The predicted labels, or None if labels could not be extracted.
Raises:
InvalidBenchmark:
If no candidate label can be found for any of the generated labels.
"""
english_labels = list(dataset_config.id2label.values())
english2local = dataset_config.prompt_label_mapping
candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
output_labels: list[str] = list()
for sample in generation_logprobs:
for logprob_list in sample:
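            # Normalise the generated tokens by lowercasing them and stripping any
            # leading or trailing non-letter characters (including the Nordic and
            # German letters æ, ø, å, ü, ö and ä)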
generated_labels = [
re.sub(
pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
repl="",
string=label.lower(),
)
for label, _ in logprob_list
]
generated_labels = [label for label in generated_labels if label != ""]
            # We use the first generated token that matches a unique candidate label
            # as the output label
output_label: str | None = None
for generated_label in generated_labels:
                # Get the candidate labels that match the generated label
if isinstance(first_label_token_mapping, dict):
if any(
candidate_label not in first_label_token_mapping
for candidate_label in candidate_labels
):
raise InvalidBenchmark(
"There is a label not present in the first label token "
"mapping - this should never happen! Please report this "
"issue to the EuroEval team at "
"github.com/EuroEval/EuroEval/issues."
)
candidate_output_labels = {
candidate_label
for candidate_label in candidate_labels
if generated_label == first_label_token_mapping[candidate_label]
}
else:
candidate_output_labels = {
candidate_label
for candidate_label in candidate_labels
if candidate_label.startswith(generated_label)
}
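                # Illustrative (hypothetical) example: with candidate labels
                # "positiv" and "negativ", a generated token "pos" matches only
                # "positiv", so the label is unique; with candidate labels "ja" and
                # "jaja", a generated token "ja" matches both, so the logprob-based
                # extraction is abandoned in favour of word edit distance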
# If we can uniquely determine the output label, we break the loop.
if len(candidate_output_labels) == 1:
output_label = candidate_output_labels.pop()
break
# If we have multiple candidate labels, we cannot uniquely determine the
# output label, so we abandon extracting the labels using logprobs and
# fall back to using word edit distance.
elif len(candidate_output_labels) > 1:
log_once(
"Multiple candidate labels found for the generated label "
f"{generated_label!r}: {candidate_output_labels}. This means "
"that using logprobs to extract the labels is not reliable, "
"and we will instead fall back to extracting the labels "
"using word edit distance.",
level=logging.DEBUG,
)
return None
                # If no candidate label is found, we ignore the generated label, as
                # this means that the model failed to generate a recognisable label
elif len(candidate_output_labels) == 0:
logger.debug(
f"No candidate label found for the generated label "
f"{generated_label!r}. The generated label is thus ignored."
)
if output_label is not None:
output_labels.append(output_label)
break
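        # The `else` clause of the inner for-loop runs only if the loop finished
        # without breaking, i.e. if no output label could be determined for the
        # sample; in that case we default to the first candidate label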
else:
if len(sample) == 0:
log_once(
"The model outputted an empty string, so no candidate labels could "
f"be determined. Using {candidate_labels[0]!r} as the output "
"label.",
level=logging.DEBUG,
)
else:
log_once(
"Could not find a candidate label for any of the generated "
f"labels in the sample {sample}. Using {candidate_labels[0]!r} "
"as the output label.",
level=logging.DEBUG,
)
output_labels.append(candidate_labels[0])
assert len(output_labels) == len(generation_logprobs)
    return output_labels


def get_closest_word_edit_labels(
generated_sequences: list[str], dataset_config: "DatasetConfig"
) -> list[str]:
"""Get the labels with the smallest edit distance to the predicted labels.
Args:
generated_sequences:
The generated sequences from the model.
dataset_config:
The configuration of the dataset.
Returns:
The candidate labels with the smallest edit distance to the predicted labels.
"""
candidate_labels = [
dataset_config.prompt_label_mapping[lbl]
for lbl in dataset_config.id2label.values()
]
new_predicted_labels: list[str] = list()
for predicted_label in generated_sequences:
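        # Compute the Levenshtein (edit) distance between the generated sequence and
        # each candidate label, and pick the candidate with the smallest distance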
edit_distances = [
Levenshtein.distance(s1=predicted_label.lower(), s2=candidate_label.lower())
for candidate_label in candidate_labels
]
closest_label = candidate_labels[np.argmin(edit_distances).item()]
new_predicted_labels.append(closest_label)
return new_predicted_labels