euroeval.metrics.pipeline

"""Metrics based on a scikit-learn Pipeline."""

import collections.abc as c
import logging
import typing as t
from pathlib import Path

import cloudpickle
import huggingface_hub as hf_hub
import numpy as np
from scipy.special import expit as sigmoid

from ..exceptions import InvalidBenchmark
from ..utils import unscramble
from .base import Metric

if t.TYPE_CHECKING:
    from datasets.arrow_dataset import Dataset
    from sklearn.pipeline import Pipeline

    from ..data_models import BenchmarkConfig, DatasetConfig

logger: logging.Logger = logging.getLogger("euroeval")


T = t.TypeVar("T", bound=int | float | str | bool)


class PreprocessingFunction(t.Protocol):
    """A protocol for a preprocessing function."""

    def __call__(
        self, predictions: c.Sequence[int], dataset: "Dataset"
    ) -> c.Sequence[int]:
        """Preprocess the model predictions before they are passed to the pipeline.

        Args:
            predictions:
                The model predictions.
            dataset:
                The dataset used for evaluation. This is only used in case any
                additional metadata is used to compute the metrics.

        Returns:
            The preprocessed model predictions.
        """
        ...
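

# A minimal sketch of a callable satisfying the `PreprocessingFunction` protocol. This
# is purely illustrative and not part of EuroEval; a full implementation is
# `european_values_preprocessing_fn` further down in this module.
def _identity_preprocessing_fn(
    predictions: c.Sequence[int], dataset: "Dataset"
) -> c.Sequence[int]:
    """Return the model predictions unchanged."""
    # The `dataset` argument is required by the protocol but not used here
    return predictions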


class PipelineMetric(Metric):
    """Load a scikit-learn pipeline and use it to get scores from the predictions."""

    def __init__(
        self,
        name: str,
        pretty_name: str,
        pipeline_repo: str,
        pipeline_scoring_function: c.Callable[["Pipeline", c.Sequence], float],
        pipeline_file_name: str = "pipeline.pkl",
        preprocessing_fn: PreprocessingFunction | None = None,
        postprocessing_fn: c.Callable[[float], tuple[float, str]] | None = None,
    ) -> None:
        """Initialise the pipeline transform metric.

        Args:
            name:
                The name of the metric in snake_case.
            pretty_name:
                The pretty name of the metric, used for display purposes.
            pipeline_repo:
                The Hugging Face repository ID of the scikit-learn pipeline to load.
            pipeline_scoring_function:
                The function to use for scoring the predictions with the pipeline.
                Takes the loaded pipeline and a 1D sequence of predictions, and
                returns a float score.
            pipeline_file_name (optional):
                The name of the file to download from the Hugging Face repository.
                Defaults to "pipeline.pkl".
            preprocessing_fn (optional):
                A function to apply to the predictions before they are passed to the
                pipeline. This is useful for preprocessing the predictions to match
                the expected input format of the pipeline. Defaults to a no-op function
                that returns the input unchanged.
            postprocessing_fn (optional):
                A function to apply to the metric scores after they are computed,
                taking the score to the postprocessed score along with its string
                representation. Defaults to x -> (100 * x, f"{x:.2%}").
        """
        super().__init__(
            name=name, pretty_name=pretty_name, postprocessing_fn=postprocessing_fn
        )
        self.pipeline_repo = pipeline_repo
        self.pipeline_file_name = pipeline_file_name
        self.pipeline_scoring_function = pipeline_scoring_function
        self.pipeline: "Pipeline | None" = None
        self.preprocessing_fn = preprocessing_fn

    def __call__(
        self,
        predictions: c.Sequence,
        references: c.Sequence,
        dataset: "Dataset",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> float | None:
        """Calculate the metric score using the scikit-learn pipeline.

        Args:
            predictions:
                The model predictions.
            references:
                Not used, but required for consistency with the Metric interface.
            dataset:
                The dataset used for evaluation. This is only used in case any
                additional metadata is used to compute the metrics.
            dataset_config:
                The dataset configuration.
            benchmark_config:
                The benchmark configuration.

        Returns:
            The calculated metric score, or None if the score should be ignored.
        """
        if self.pipeline is None:
            self.pipeline = self._download_pipeline()
        if self.preprocessing_fn is not None:
            predictions = self.preprocessing_fn(
                predictions=predictions, dataset=dataset
            )
        return self.pipeline_scoring_function(self.pipeline, predictions)

    def _download_pipeline(self) -> "Pipeline":
        """Download the scikit-learn pipeline from the given URL.

        Returns:
            The downloaded scikit-learn pipeline.

        Raises:
            InvalidBenchmark:
                If the loading of the pipeline fails for any reason.
        """
        logger.debug(f"Loading pipeline from {self.pipeline_repo}...")
        folder_path = hf_hub.HfApi(
            token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_")
        ).snapshot_download(repo_id=self.pipeline_repo, repo_type="model")
        model_path = Path(folder_path, self.pipeline_file_name)
        try:
            with model_path.open(mode="rb") as f:
                pipeline = cloudpickle.load(f)
        except Exception as e:
            raise InvalidBenchmark(
                f"Failed to load pipeline from {self.pipeline_repo!r}: {e}"
            ) from e
        logger.debug(f"Successfully loaded pipeline: {pipeline}")
        return pipeline

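
# A minimal sketch of how a custom `PipelineMetric` could be wired up. This example is
# purely illustrative and not part of EuroEval: the repository ID is a placeholder and
# the scoring function assumes a hypothetical pipeline whose final step is a binary
# classifier exposing `predict_proba`.
def _example_scoring_function(pipeline: "Pipeline", predictions: c.Sequence) -> float:
    """Score the predictions as the pipeline's positive-class probability."""
    probabilities = pipeline.predict_proba([predictions])
    return float(probabilities[0, 1])


_example_metric = PipelineMetric(
    name="example_metric",
    pretty_name="Example Metric",
    pipeline_repo="your-org/your-pipeline-repo",  # placeholder repository ID
    pipeline_scoring_function=_example_scoring_function,
)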

### European Values Metric ###


def european_values_preprocessing_fn(
    predictions: c.Sequence[int], dataset: "Dataset"
) -> c.Sequence[int]:
    """Preprocess the model predictions for the European Values metric.

    Args:
        predictions:
            The model predictions, a sequence of integers representing the predicted
            choices for each question.
        dataset:
            The dataset used for evaluation. This is only used in case any additional
            metadata is used to compute the metrics.

    Returns:
        The preprocessed model predictions, a sequence of integers representing the
        final predicted choices for each question after any necessary aggregation and
        mapping.

    Raises:
        AssertionError:
            If the number of predictions is not a multiple of 53, which is required
            for the European Values metric.
    """
    num_questions = 53
    num_phrasings_per_question = 5

    # Map the predicted choice indices to the underlying integer choice values
    integer_predictions = []
    for prediction, idx_to_choice in zip(predictions, dataset["idx_to_choice"]):
        idx_to_choice = {
            int(idx): int(choice)
            for idx, choice in idx_to_choice.items()
            if choice is not None
        }
        integer_prediction = idx_to_choice[prediction]
        integer_predictions.append(integer_prediction)

    assert len(predictions) % num_questions == 0, (
        f"The number of predictions ({len(predictions)}) is not a multiple of "
        f"{num_questions}, which is required for the European Values metric."
    )

    # When we are using the situational version of the dataset, there are 5 phrasings
    # for each question, so we need to aggregate the predictions by question, which we
    # do using majority voting.
    using_situational = len(predictions) == num_questions * num_phrasings_per_question
    if using_situational:
        # Reshape the predictions to a 2D array with `num_questions` rows (one for
        # each question) and `num_phrasings_per_question` columns (one for each
        # phrasing). The five phrasings of a question appear right after each other,
        # i.e., the question indices run (0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, ...)
        # Shape: (num_questions, num_phrasings_per_question)
        arr = np.array(
            [
                integer_predictions[i : i + num_phrasings_per_question]
                for i in range(0, len(predictions), num_phrasings_per_question)
            ]
        )

        # Double check that we reshaped the predictions correctly
        for idx, pred in enumerate(integer_predictions):
            question_idx = idx // num_phrasings_per_question
            phrasing_idx = idx % num_phrasings_per_question
            assert arr[question_idx, phrasing_idx] == pred, (
                f"Reshaped predictions do not match the original predictions at index "
                f"{idx}: {arr[question_idx, phrasing_idx]} != {pred}."
            )

        # Use majority voting to get the final prediction for each question
        # Shape: (num_questions,)
        arr = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=arr)

        # Convert the array to a list
        integer_predictions = arr.tolist()

    # Some of the questions are categorical and we're only interested in whether the
    # model chooses a specific choice or not. This mapping takes the question index
    # to the choice value that we're interested in.
    question_choices = {
        0: 1,
        1: 5,
        3: 3,
        6: 1,
        15: 4,
        20: 2,
        47: 8,
        48: 7,
        49: 4,
        51: 4,
        52: 4,
    }

    # Binarise the categorical questions: the prediction becomes 1 if the model
    # chose the choice of interest, and 0 otherwise
    integer_predictions = list(integer_predictions)
    for question_idx, choice in question_choices.items():
        integer_predictions[question_idx] = (
            1 if integer_predictions[question_idx] == choice else 0
        )

    return integer_predictions
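
# Worked example of the aggregation above, on made-up numbers: with the situational
# dataset there are 53 * 5 = 265 predictions, and the five phrasings of one question
# might map to the integer choices [2, 2, 3, 2, 1]. Majority voting keeps 2 as the
# answer for that question, and if that question were categorical with choice 2 as
# the choice of interest, the binarised prediction would become 1:
#
# >>> int(np.bincount([2, 2, 3, 2, 1]).argmax())
# 2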


def european_values_scoring_function(
    pipeline: "Pipeline", predictions: c.Sequence[int]
) -> float:
    """Scoring function for the European Values metric."""
    normalised_predictions = pipeline[0].transform([predictions])
    log_likelihoods = pipeline[1].transform(normalised_predictions)[0]
    score = sigmoid(pipeline[2].alpha_ * (log_likelihoods - pipeline[2].center_))
    return score.item()
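
# Worked example of the calibration above, with made-up fitted values alpha_ = 2.0 and
# center_ = -10.0: a log-likelihood equal to the centre maps to a score of 0.5, while
# one that is two units above the centre maps to roughly 0.98:
#
# >>> round(float(sigmoid(2.0 * (-8.0 - (-10.0)))), 3)
# 0.982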


european_values_metric = PipelineMetric(
    name="european_values",
    pretty_name="European Values",
    pipeline_repo="EuroEval/european-values-pipeline",
    pipeline_scoring_function=european_values_scoring_function,
    preprocessing_fn=european_values_preprocessing_fn,
)
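

# A minimal sketch of how the metric is invoked during an evaluation run. This is
# purely illustrative and not part of EuroEval: the dataset and configuration objects
# are assumed to be supplied by the surrounding benchmarking code.
def _example_usage(
    model_predictions: c.Sequence[int],
    evaluation_dataset: "Dataset",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
) -> float | None:
    """Illustrative sketch: score predictions with the European Values metric."""
    return european_values_metric(
        predictions=model_predictions,
        references=[],  # not used by this metric; kept for interface consistency
        dataset=evaluation_dataset,
        dataset_config=dataset_config,
        benchmark_config=benchmark_config,
    )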