"""Data models used in EuroEval."""
import collections.abc as c
import json
import pathlib
import re
import typing as t
from dataclasses import dataclass, field
import pydantic
import torch
from .enums import Device, InferenceBackend, ModelType, TaskGroup
from .types import ScoreDict
from .utils import get_package_version
@dataclass
class MetricConfig:
"""Configuration for a metric.
Attributes:
name:
The name of the metric.
pretty_name:
            A longer, prettier name for the metric, which may contain capital
            letters and spaces. Used for logging.
huggingface_id:
The Hugging Face ID of the metric.
results_key:
The name of the key used to extract the metric scores from the results
dictionary.
compute_kwargs:
Keyword arguments to pass to the metric's compute function. Defaults to
an empty dictionary.
postprocessing_fn:
A function to apply to the metric scores after they are computed, taking
the score to the postprocessed score along with its string representation.
Defaults to x -> (100 * x, f"{x:.2%}").
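    Example:
        A minimal illustrative sketch; this particular metric configuration is
        hypothetical and not necessarily one that EuroEval ships with:
        >>> exact_match = MetricConfig(
        ...     name="exact_match",
        ...     pretty_name="Exact match",
        ...     huggingface_id="exact_match",
        ...     results_key="exact_match",
        ... )
        >>> exact_match.postprocessing_fn(0.5)
        (50.0, '50.00%')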
"""
name: str
pretty_name: str
huggingface_id: str
results_key: str
compute_kwargs: dict[str, t.Any] = field(default_factory=dict)
postprocessing_fn: c.Callable[[float], tuple[float, str]] = field(
default_factory=lambda: lambda raw_score: (100 * raw_score, f"{raw_score:.2%}")
)
def __hash__(self) -> int:
"""Return a hash of the metric configuration."""
return hash(self.name)
@dataclass
class Language:
"""A benchmarkable language.
Attributes:
code:
The ISO 639-1 language code of the language.
name:
The name of the language.
and_separator (optional):
The word 'and' in the language.
or_separator (optional):
The word 'or' in the language.
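    Example:
        An illustrative sketch; the separator below is an assumption, not
        necessarily the value used in EuroEval's own language definitions:
        >>> danish = Language(code="da", name="Danish", _and_separator="og")
        >>> danish.and_separator
        'og'
        >>> danish.or_separator
        Traceback (most recent call last):
        ...
        NotImplementedError: Separator for the word 'or' has not been defined for Danish.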
"""
code: str
name: str
_and_separator: str | None = field(repr=False, default=None)
_or_separator: str | None = field(repr=False, default=None)
def __hash__(self) -> int:
"""Return a hash of the language."""
return hash(self.code)
@property
def and_separator(self) -> str:
"""Get the word 'and' in the language.
Returns:
The word 'and' in the language.
Raises:
NotImplementedError:
If `and_separator` is `None`.
"""
if not self._and_separator:
raise NotImplementedError(
f"Separator for the word 'and' has not been defined for {self.name}."
)
return self._and_separator
@and_separator.setter
def and_separator(self, value: str | None) -> None:
self._and_separator = value
@property
def or_separator(self) -> str:
"""Get the word 'or' in the language.
Returns:
The word 'or' in the language.
Raises:
NotImplementedError:
If `or_separator` is `None`.
"""
if not self._or_separator:
raise NotImplementedError(
f"Separator for the word 'or' has not been defined for {self.name}."
)
return self._or_separator
@or_separator.setter
def or_separator(self, value: str | None) -> None:
self._or_separator = value
@dataclass
class Task:
"""A dataset task.
Attributes:
name:
The name of the task.
task_group:
The task group of the task.
template_dict:
The template dictionary for the task, from language to prompt template.
metrics:
The metrics used to evaluate the task.
default_num_few_shot_examples:
The default number of examples to use when benchmarking the task using
few-shot evaluation. For a classification task, these will be drawn evenly
from each label.
default_max_generated_tokens:
The default maximum number of tokens to generate when benchmarking the task
using few-shot evaluation.
default_labels:
The default labels for datasets using this task.
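    Example:
        An illustrative sketch with hypothetical values; a real task would have a
        populated `template_dict` and `metrics` list:
        >>> ner_task = Task(
        ...     name="named-entity-recognition",
        ...     task_group=TaskGroup.TOKEN_CLASSIFICATION,
        ...     template_dict={},
        ...     metrics=[],
        ...     default_num_few_shot_examples=8,
        ...     default_max_generated_tokens=128,
        ...     default_labels=["B-PER", "I-PER", "O"],
        ... )
        >>> ner_task.default_num_few_shot_examples
        8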
"""
name: str
task_group: TaskGroup
template_dict: dict["Language", "PromptConfig"]
metrics: list[MetricConfig]
default_num_few_shot_examples: int
default_max_generated_tokens: int
default_labels: list[str]
def __hash__(self) -> int:
"""Return a hash of the task."""
return hash(self.name)
@dataclass
class BenchmarkConfig:
"""General benchmarking configuration, across datasets and models.
Attributes:
model_languages:
The languages of the models to benchmark.
dataset_languages:
The languages of the datasets in the benchmark.
tasks:
            The tasks to benchmark the model(s) on.
datasets:
The datasets to benchmark on.
batch_size:
The batch size to use.
raise_errors:
Whether to raise errors instead of skipping them.
cache_dir:
Directory to store cached models and datasets.
api_key:
The API key to use for a given inference API.
force:
Whether to force the benchmark to run even if the results are already
cached.
progress_bar:
Whether to show a progress bar.
save_results:
Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
device:
The device to use for benchmarking.
verbose:
Whether to print verbose output.
trust_remote_code:
Whether to trust remote code when loading models from the Hugging Face Hub.
use_flash_attention:
            Whether to use Flash Attention. If None, Flash Attention will only be
            used for generative models.
clear_model_cache:
Whether to clear the model cache after benchmarking each model.
evaluate_test_split:
Whether to evaluate on the test split.
few_shot:
Whether to only evaluate the model using few-shot evaluation. Only relevant
if the model is generative.
num_iterations:
The number of iterations each model should be evaluated for.
api_base:
The base URL for a given inference API. Only relevant if `model` refers to a
model on an inference API.
api_version:
The version of the API to use. Only relevant if `model` refers to a model on
an inference API.
debug:
Whether to run the benchmark in debug mode.
run_with_cli:
Whether the benchmark is being run with the CLI.
only_allow_safetensors:
Whether to only allow models that use the safetensors format.
"""
model_languages: list[Language]
dataset_languages: list[Language]
tasks: list[Task]
datasets: list[str]
batch_size: int
raise_errors: bool
cache_dir: str
api_key: str | None
force: bool
progress_bar: bool
save_results: bool
device: torch.device
verbose: bool
trust_remote_code: bool
use_flash_attention: bool | None
clear_model_cache: bool
evaluate_test_split: bool
few_shot: bool
num_iterations: int
api_base: str | None
api_version: str | None
debug: bool
run_with_cli: bool
only_allow_safetensors: bool
class BenchmarkConfigParams(pydantic.BaseModel):
"""The parameters for the benchmark configuration."""
model_config = pydantic.ConfigDict(protected_namespaces=())
progress_bar: bool
save_results: bool
task: str | list[str] | None
dataset: str | list[str] | None
language: str | list[str]
model_language: str | list[str] | None
dataset_language: str | list[str] | None
device: Device | None
batch_size: int
raise_errors: bool
cache_dir: str
api_key: str | None
force: bool
verbose: bool
trust_remote_code: bool
use_flash_attention: bool | None
clear_model_cache: bool
evaluate_test_split: bool
few_shot: bool
num_iterations: int
api_base: str | None
api_version: str | None
debug: bool
run_with_cli: bool
only_allow_safetensors: bool
class BenchmarkResult(pydantic.BaseModel):
"""A benchmark result."""
dataset: str
task: str
dataset_languages: list[str]
model: str
results: ScoreDict
num_model_parameters: int
max_sequence_length: int
vocabulary_size: int
merge: bool
generative: bool
generative_type: str | None
few_shot: bool
validation_split: bool
euroeval_version: str | None = get_package_version("euroeval")
transformers_version: str | None = get_package_version("transformers")
torch_version: str | None = get_package_version("torch")
vllm_version: str | None = get_package_version("vllm")
outlines_version: str | None = get_package_version("outlines")
@classmethod
def from_dict(cls, config: dict) -> "BenchmarkResult":
"""Create a benchmark result from a dictionary.
Args:
config:
The configuration dictionary.
Returns:
The benchmark result.
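        Example:
            An illustrative sketch; the values are hypothetical and only meant to
            show how a legacy model name is parsed into explicit fields:
            >>> result = BenchmarkResult.from_dict(
            ...     dict(
            ...         dataset="dummy-dataset",
            ...         task="sentiment-classification",
            ...         dataset_languages=["da"],
            ...         model="some-org/some-model (few-shot, val)",
            ...         results={"raw": [], "total": {}},
            ...         num_model_parameters=124_000_000,
            ...         max_sequence_length=2_048,
            ...         vocabulary_size=50_257,
            ...     )
            ... )
            >>> result.model, result.few_shot, result.validation_split
            ('some-org/some-model', True, True)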
"""
        # To be backwards compatible, we accept old results in which these settings
        # were encoded in the model name rather than stored as explicit fields
val_matches = re.search(r"\(.*val.*\)$", config["model"])
few_shot_matches = re.search(r"\(.*few-shot.*\)$", config["model"])
zero_shot_matches = re.search(r"\(.*zero-shot.*\)$", config["model"])
config["model"] = re.sub(
r"\(.*(few-shot|val).*\)$", "", config["model"]
).strip()
if "merge" not in config:
config["merge"] = False
if "generative" not in config:
config["generative"] = (
few_shot_matches is not None or zero_shot_matches is not None
)
if "generative_type" not in config:
config["generative_type"] = None
if "few_shot" not in config:
config["few_shot"] = zero_shot_matches is None
if "validation_split" not in config:
config["validation_split"] = val_matches is not None
return cls(**config)
def append_to_results(self, results_path: pathlib.Path) -> None:
"""Append the benchmark result to the results file.
Args:
results_path:
The path to the results file.
"""
json_str = json.dumps(self.model_dump())
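        # Each result is appended as a single JSON object on its own line, so the
        # results file is effectively in JSON Lines (JSONL) format.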
with results_path.open("a") as f:
f.write("\n" + json_str)
@dataclass
class DatasetConfig:
"""Configuration for a dataset.
Attributes:
name:
The name of the dataset. Must be lower case with no spaces.
pretty_name:
            A longer, prettier name for the dataset, which may contain capital
            letters and spaces. Used for logging.
huggingface_id:
The Hugging Face ID of the dataset.
task:
The task of the dataset.
languages:
The ISO 639-1 language codes of the entries in the dataset.
id2label:
The mapping from ID to label.
label2id:
The mapping from label to ID.
num_labels:
The number of labels in the dataset.
_prompt_prefix (optional):
The prefix to use in the few-shot prompt. Defaults to the template for the
task and language.
_prompt_template (optional):
The template for the prompt to use when benchmarking the dataset using
few-shot evaluation. Defaults to the template for the task and language.
_instruction_prompt (optional):
The prompt to use when benchmarking the dataset using instruction-based
evaluation. Defaults to the template for the task and language.
_num_few_shot_examples (optional):
The number of examples to use when benchmarking the dataset using few-shot
evaluation. For a classification task, these will be drawn evenly from
each label. Defaults to the template for the task and language.
_max_generated_tokens (optional):
The maximum number of tokens to generate when benchmarking the dataset
using few-shot evaluation. Defaults to the template for the task and
language.
_labels (optional):
The labels in the dataset. Defaults to the template for the task and
language.
_prompt_label_mapping (optional):
A mapping from the labels to another phrase which is used as a substitute
for the label in few-shot evaluation. If "auto" then the mapping will be set
to a 1:1 mapping between the labels and themselves. If None then the mapping
will be set to the default mapping for the task and language. Defaults to
None.
unofficial (optional):
Whether the dataset is unofficial. Defaults to False.
"""
name: str
pretty_name: str
huggingface_id: str
task: Task
languages: list[Language]
_prompt_prefix: str | None = None
_prompt_template: str | None = None
_instruction_prompt: str | None = None
_num_few_shot_examples: int | None = None
_max_generated_tokens: int | None = None
_labels: list[str] | None = None
_prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
unofficial: bool = False
@property
def prompt_prefix(self) -> str:
"""The prefix to use in the few-shot prompt."""
main_language = self.languages[0]
prompt_config = self.task.template_dict[main_language]
prompt_prefix = (
prompt_config.default_prompt_prefix
if self._prompt_prefix is None
else self._prompt_prefix
)
prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
return prompt_prefix
@property
def prompt_template(self) -> str:
"""The template used during few-shot evaluation."""
main_language = self.languages[0]
prompt_config = self.task.template_dict[main_language]
prompt_template = (
prompt_config.default_prompt_template
if self._prompt_template is None
else self._prompt_template
)
prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
return prompt_template
@property
def instruction_prompt(self) -> str:
"""The prompt to use when evaluating instruction-tuned models."""
main_language = self.languages[0]
prompt_config = self.task.template_dict[main_language]
instruction_prompt = (
prompt_config.default_instruction_prompt
if self._instruction_prompt is None
else self._instruction_prompt
)
instruction_prompt = instruction_prompt.replace(
"{labels_str}", self._labels_str
)
return instruction_prompt
@property
def num_few_shot_examples(self) -> int:
"""The number of few-shot examples to use."""
return (
self._num_few_shot_examples
if self._num_few_shot_examples is not None
else self.task.default_num_few_shot_examples
)
@property
def max_generated_tokens(self) -> int:
"""The maximum number of tokens to generate when evaluating a model."""
return (
self._max_generated_tokens
if self._max_generated_tokens is not None
else self.task.default_max_generated_tokens
)
@property
def labels(self) -> list[str]:
"""The labels in the dataset."""
return self._labels if self._labels is not None else self.task.default_labels
@property
def prompt_label_mapping(self) -> dict[str, str]:
"""Mapping from English labels to localised labels."""
if self._prompt_label_mapping == "auto":
return {label: label for label in self.labels}
elif self._prompt_label_mapping is not None:
return self._prompt_label_mapping
main_language = self.languages[0]
prompt_config = self.task.template_dict[main_language]
if prompt_config.default_prompt_label_mapping == "auto":
return {label: label for label in self.labels}
else:
return prompt_config.default_prompt_label_mapping
@property
def id2label(self) -> dict[int, str]:
"""The mapping from ID to label."""
return {idx: label for idx, label in enumerate(self.labels)}
@property
def label2id(self) -> dict[str, int]:
"""The mapping from label to ID."""
return {label: i for i, label in enumerate(self.labels)}
@property
def num_labels(self) -> int:
"""The number of labels in the dataset."""
return len(self.labels)
def __hash__(self) -> int:
"""Return a hash of the dataset configuration."""
return hash(self.name)
@property
def _labels_str(self) -> str:
"""Converts a set of labels to a natural string, in the specified language.
If the task is NER, we separate using 'and' and use the mapped labels instead of
the BIO NER labels.
        Returns:
            The natural string representation of the labels in the main language of
            the dataset.
"""
main_language = self.languages[0]
if self.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
sep_word = main_language.and_separator
else:
sep_word = main_language.or_separator
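        # Illustrative example (hypothetical values): with the mapped labels
        # 'positiv', 'negativ' and 'neutral' and the separator word 'eller', the
        # result would be "'positiv', 'negativ' eller 'neutral'". The ordering may
        # vary, since duplicates are removed via a set below.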
# Convert labels to single-quoted labels - and remove duplicates
quoted_labels = [
f"'{label}'" for label in set(self.prompt_label_mapping.values())
]
if not quoted_labels:
return ""
elif len(quoted_labels) == 1:
return quoted_labels[0]
elif len(quoted_labels) == 2:
return f"{quoted_labels[0]} {sep_word} {quoted_labels[1]}"
else:
return f"{', '.join(quoted_labels[:-1])} {sep_word} {quoted_labels[-1]}"
@dataclass
class ModelConfig:
"""Configuration for a model.
Attributes:
model_id:
The ID of the model.
revision:
The revision of the model.
task:
The task that the model was trained on.
languages:
The languages of the model.
inference_backend:
The backend used to perform inference with the model.
merge:
Whether the model is a merged model.
model_type:
The type of the model (e.g., encoder, base decoder, instruction tuned).
fresh:
Whether the model is freshly initialised.
model_cache_dir:
The directory to cache the model in.
adapter_base_model_id:
The model ID of the base model if the model is an adapter model. Can be None
if the model is not an adapter model.
"""
model_id: str
revision: str
task: str
languages: list[Language]
inference_backend: InferenceBackend
merge: bool
model_type: ModelType
fresh: bool
model_cache_dir: str
adapter_base_model_id: str | None
def __hash__(self) -> int:
"""Return a hash of the model configuration."""
return hash(self.model_id)
@dataclass
class PreparedModelInputs:
"""The inputs to a model.
Attributes:
texts:
The texts to input to the model. Can be None if the input IDs and attention
mask are provided instead.
input_ids:
The input IDs of the texts. Can be None if the texts are provided instead.
attention_mask:
The attention mask of the texts. Can be None if the texts are provided
instead.
"""
texts: list[str] | None = None
input_ids: torch.Tensor | None = None
attention_mask: torch.Tensor | None = None
@dataclass
class GenerativeModelOutput:
"""The output of a generative model.
Attributes:
sequences:
The generated sequences.
scores:
The scores of the sequences. This is an array of shape (batch_size,
num_tokens, num_logprobs, 2), where the last dimension contains the
token and its logprob. Can be None if the scores are not available.
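    Example:
        An illustrative sketch; the tokens and logprobs below are made up:
        >>> output = GenerativeModelOutput(
        ...     sequences=["positive"],
        ...     scores=[[[("positive", -0.05), ("negative", -3.2)]]],
        ... )
        >>> len(output.sequences), len(output.scores[0]), len(output.scores[0][0])
        (1, 1, 2)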
"""
sequences: list[str]
scores: list[list[list[tuple[str, float]]]] | None = None
@dataclass
class SingleGenerativeModelOutput:
"""A single output of a generative model.
Attributes:
sequence:
The generated sequence.
scores:
The scores of the sequence. This is an array of shape (num_tokens,
num_logprobs, 2), where the last dimension contains the token and its
logprob. Can be None if the scores are not available.
"""
sequence: str
scores: list[list[tuple[str, float]]] | None = None
@dataclass
class HFModelInfo:
"""Information about a Hugging Face model.
Attributes:
pipeline_tag:
The pipeline tag of the model.
tags:
The other tags of the model.
adapter_base_model_id:
The model ID of the base model if the model is an adapter model. Can be None
if the model is not an adapter model.
"""
pipeline_tag: str
tags: list[str]
adapter_base_model_id: str | None
@dataclass
class PromptConfig:
"""Configuration for task-specific prompting across languages.
Defines the prompt templates needed for evaluating a specific task in a given
language.
Attributes:
default_prompt_prefix:
The default prefix to use in the few-shot prompt.
default_prompt_template:
The default template for the prompt to use when benchmarking the dataset
using few-shot evaluation.
default_instruction_prompt:
The default prompt to use when benchmarking the dataset using
instruction-based evaluation.
default_prompt_label_mapping:
The default mapping from the labels to another phrase which is used as a
substitute for the label in few-shot evaluation. If set to "auto", the
mapping will be set to a 1:1 mapping between the labels and themselves.
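    Example:
        An illustrative sketch; the template strings below are hypothetical and not
        EuroEval's actual prompts:
        >>> prompt_config = PromptConfig(
        ...     default_prompt_prefix="Documents and their sentiment, being {labels_str}.",
        ...     default_prompt_template="Document: {text}",
        ...     default_instruction_prompt="Classify the sentiment as {labels_str}.",
        ...     default_prompt_label_mapping="auto",
        ... )
        >>> prompt_config.default_prompt_label_mapping
        'auto'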
"""
default_prompt_prefix: str
default_prompt_template: str
default_instruction_prompt: str
default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]