euroeval.metrics.sacrebleu

[docs] module euroeval.metrics.sacrebleu
"""Metrics from the SacreBLEU package."""import collections.abc as cimport typing as tfrom sacrebleu.metrics importCHRFfrom .base importMetricfrom .language_detection importlanguage_detectorift.TYPE_CHECKING:from datasets.arrow_dataset importDatasetfrom ..data_models importBenchmarkConfig,DatasetConfigfrom .language_detection importLanguageDetectorclass ChrF(Metric):[docs]
    """The ChrF metric."""def __init__(self,word_order:int=0,beta:int=2,language_detector:"LanguageDetector | None"=None,)->None:        """Initialise the ChrF metric.        Args:            word_order (optional):                The word order for the ChrF metric. Defaults to 0, which is the                original chrF metric. If set to 2, it is the chrF++ metric.            beta (optional):                The beta parameter for the ChrF metric. Defaults to 2, which is the                original chrF (and chrF++) metric.            language_detector (optional):                A LanguageDetector instance. If provided, each per-sentence score is                multiplied by a binary language penalty (1.0 if the prediction is in                the correct language, 0.0 otherwise) before averaging. Defaults to                None, which disables language penalization.        """super().__init__(name=f"chr_f{beta}"+"p"*word_order,pretty_name=f"ChrF{beta}"+"+"*word_order,postprocessing_fn=lambdax:(x,f"{x:.2f}%"),)self.word_order=word_orderself.beta=betaself.language_detector=language_detectorself.metric=CHRF(char_order=6,word_order=self.word_order,beta=self.beta)def download(self,cache_dir:str)->"ChrF":[docs]
        """Download the language detection model if needed.        Args:            cache_dir:                The directory where the metric will be downloaded to.        Returns:            The metric object itself.        """ifself.language_detectorisnotNone:self.language_detector.download()returnselfdef __call__(self,predictions:c.Sequence,references:c.Sequence,dataset:"Dataset",dataset_config:"DatasetConfig",benchmark_config:"BenchmarkConfig",)->float|None:        """Calculate the ChrF score.        Args:            predictions:                The predictions of the model.            references:                The references for the predictions.            dataset:                The dataset used for evaluation. This is only used in case any                additional metadata is used to compute the metrics.            dataset_config:                The dataset configuration.            benchmark_config:                The benchmark configuration.        Returns:            The ChrF score, penalized per-sentence by language correctness.        """scores=[self.metric.sentence_score(hypothesis=prediction,references=[reference]).scoreforprediction,referenceinzip(predictions,references)]ifnotscores:return1.0ifself.language_detectorisnotNone:penalties=self.language_detector(predictions=predictions,dataset_config=dataset_config)scores=[s*pfors,pinzip(scores,penalties)]returnsum(scores)/len(scores)chrf2_metric=ChrF(language_detector=language_detector)chrf3_metric=ChrF(beta=3,language_detector=language_detector)chrf4_metric=ChrF(beta=4,language_detector=language_detector)chrf2pp_metric=ChrF(word_order=2,language_detector=language_detector)chrf3pp_metric=ChrF(word_order=2,beta=3,language_detector=language_detector)chrf4pp_metric=ChrF(word_order=2,beta=4,language_detector=language_detector)