euroeval.benchmark_modules.hf

[docs] module euroeval.benchmark_modules.hf
"""Encoder models from the Hugging Face Hub."""import collections.abc as cimport loggingimport reimport typing as tfrom functools importcached_property,partialfrom json importJSONDecodeErrorfrom pathlib importPathfrom time importsleepimport torchfrom datasets importDatasetDictfrom huggingface_hub importHfApifrom huggingface_hub importwhoamiashf_whoamifrom huggingface_hub.errors import(GatedRepoError,HfHubHTTPError,HFValidationError,LocalTokenNotFoundError,RepositoryNotFoundError,RevisionNotFoundError,)from huggingface_hub.hf_api importModelInfoasHfApiModelInfofrom peft importPeftConfigfrom requests.exceptions importRequestExceptionfrom torch importnnfrom transformers.data.data_collator import(DataCollatorForTokenClassification,DataCollatorWithPadding,)from transformers.modelcard importTASK_MAPPINGfrom transformers.modeling_utils importPreTrainedModelfrom transformers.models.auto.configuration_auto importAutoConfigfrom transformers.models.auto.tokenization_auto importAutoTokenizerfrom transformers.trainer importTrainerfrom urllib3.exceptions importRequestErrorfrom ..caching_utils importcache_argumentsfrom ..constants import(DUMMY_FILL_VALUE,GENERATIVE_PIPELINE_TAGS,LOCAL_MODELS_REQUIRED_FILES,MAX_CONTEXT_LENGTH,MERGE_TAGS,)from ..data_models importHashableDict,HFModelInfo,ModelConfigfrom ..enums import(BatchingPreference,GenerativeType,InferenceBackend,ModelType,TaskGroup,)from ..exceptions import(InvalidBenchmark,InvalidModel,NeedsAdditionalArgument,NeedsEnvironmentVariable,NeedsExtraInstalled,)from ..generation_utils importraise_if_wrong_paramsfrom ..languages importget_all_languagesfrom ..logging_utils importblock_terminal_output,log,log_oncefrom ..task_group_utils import(multiple_choice_classification,question_answering,token_classification,)from ..tokenisation_utils importget_bos_token,get_eos_tokenfrom ..utils import(create_model_cache_dir,get_class_by_name,get_hf_token,internet_connection_available,split_model_id,)from .base importBenchmarkModuleift.TYPE_CHECKING:from transformers.configuration_utils importPretrainedConfigfrom transformers.tokenization_utils importPreTrainedTokenizerfrom transformers.tokenization_utils_base importBatchEncodingfrom ..data_models importBenchmarkConfig,DatasetConfig,Taskfrom ..types importExtractLabelsFunctionclass HuggingFaceEncoderModel(BenchmarkModule):[docs]
    """An encoder model from the Hugging Face Hub."""fresh_model=Falsebatching_preference=BatchingPreference.NO_PREFERENCEhigh_priority=Trueallowed_params={re.compile(r".*"):["slow-tokenizer"]}def __init__(self,model_config:"ModelConfig",dataset_config:"DatasetConfig",benchmark_config:"BenchmarkConfig",log_metadata:bool=True,)->None:        """Initialise the model.        Args:            model_config:                The model configuration.            dataset_config:                The dataset configuration.            benchmark_config:                The benchmark configuration.            log_metadata:                Whether to log the model metadata.        """raise_if_wrong_params(model_config=model_config,allowed_params=self.allowed_params)model,tokeniser=load_model_and_tokeniser(model_config=model_config,dataset_config=dataset_config,benchmark_config=benchmark_config,)self._model:"PreTrainedModel"=modelself._tokeniser:"PreTrainedTokenizer"=tokeniserself._model,self._tokeniser=align_model_and_tokeniser(model=self._model,tokeniser=self._tokeniser,model_max_length=self.model_max_length,raise_errors=benchmark_config.raise_errors,)super().__init__(model_config=model_config,dataset_config=dataset_config,benchmark_config=benchmark_config,log_metadata=log_metadata,)@cached_propertydef num_params(self)->int:[docs]
        """The number of parameters in the model.        Returns:            The number of parameters in the model.        """# No need to try to use the API if we have no internet.ifnotinternet_connection_available():repo_info=Noneelse:token=get_hf_token(api_key=self.benchmark_config.api_key)hf_api=HfApi(token=token)try:repo_info=hf_api.model_info(repo_id=self.model_config.adapter_base_model_idorself.model_config.model_id,revision=self.model_config.revision,)except(RepositoryNotFoundError,RevisionNotFoundError,RequestException,HFValidationError,):repo_info=Noneif(repo_infoisnotNoneandhasattr(repo_info,"safetensors")andrepo_info.safetensorsisnotNoneand"total"inrepo_info.safetensors):num_params=repo_info.safetensors["total"]elif(hasattr(self._model.config,"num_params")andself._model.config.num_paramsisnotNone):num_params=self._model.config.num_paramselifhasattr(self._model,"parameters"):num_params=sum(p.numel()forpinself._model.parameters())else:log("The number of parameters could not be determined for the model, since ""the model is not stored in the safetensors format. If this is your ""own model, then you can use this Hugging Face Space to convert your ""model to the safetensors format: ""https://huggingface.co/spaces/safetensors/convert.",level=logging.WARNING,)num_params=-1returnnum_params@cached_propertydef vocab_size(self)->int:[docs]
        """The vocabulary size of the model.        Returns:            The vocabulary size of the model.        """if(hasattr(self._model.config,"vocab_size")andself._model.config.vocab_sizeisnotNone):vocab_size=self._model.config.vocab_sizeelif(hasattr(self._tokeniser,"vocab_size")andself._tokeniser.vocab_sizeisnotNone):vocab_size=self._tokeniser.vocab_sizeelse:vocab_size=-1returnvocab_size@cached_propertydef model_max_length(self)->int:[docs]
        """The maximum context length of the model.        Returns:            The maximum context length of the model.        """all_max_lengths:list[int]=list()# Add the registered max length of the tokeniserifhasattr(self._tokeniser,"model_max_length")andself._tokeniser.model_max_length<int(1e30):all_max_lengths.append(self._tokeniser.model_max_length)# Add the max length derived from the model's input sizesifhasattr(self._tokeniser,"max_model_input_sizes"):all_max_lengths.extend([sizeforsizeinself._tokeniser.max_model_input_sizes.values()ifsizeisnotNone])# Add max length candidates from the model's configurationcandidate_config_max_lengths=["max_position_embeddings","max_sequence_length","model_max_length","n_positions",]forcandidate_config_max_lengthincandidate_config_max_lengths:if(hasattr(self._model.config,candidate_config_max_length)and(value:=getattr(self._model.config,candidate_config_max_length))isnotNone):all_max_lengths.append(value)# To avoid models having artificially low max lengths, we remove any max lengths# that are less than 128all_max_lengths=[max_lengthformax_lengthinall_max_lengthsifmax_length>=128]iflen(list(all_max_lengths))>0:model_max_length=min(list(all_max_lengths))else:model_max_length=-1returnmodel_max_length@property[docs]
def data_collator(self)->c.Callable[[c.Sequence[t.Any]],dict[str,t.Any]]:        """The data collator used to prepare samples during finetuning.        Returns:            The data collator.        """matchself.dataset_config.task.task_group:case(TaskGroup.SEQUENCE_CLASSIFICATION|TaskGroup.TEXT_TO_TEXT|TaskGroup.QUESTION_ANSWERING|TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION):returnDataCollatorWithPadding(self._tokeniser,padding="longest")caseTaskGroup.TOKEN_CLASSIFICATION:returnDataCollatorForTokenClassification(tokenizer=self._tokeniser,label_pad_token_id=-100)case _:raiseNotImplementedError(f"Unsupported task group: {self.dataset_config.task.task_group}.")@propertydef generative_type(self)->GenerativeType|None:[docs]
        """Get the generative type of the model.        Returns:            The generative type of the model, or None if it has not been set yet.        """returnNone@propertydef extract_labels_from_generation(self)->"ExtractLabelsFunction":[docs]
        """The function used to extract the labels from the generated output.        Returns:            The function used to extract the labels from the generated output.        """raiseNotImplementedError("The `extract_labels_from_generation` property has not been implemented ""for Hugging Face Encoder models.")@propertydef trainer_class(self)->t.Type["Trainer"]:[docs]
        """The Trainer class to use for finetuning.        Returns:            The Trainer class.        """matchself.dataset_config.task.task_group:case(TaskGroup.SEQUENCE_CLASSIFICATION|TaskGroup.TEXT_TO_TEXT|TaskGroup.TOKEN_CLASSIFICATION):returnTrainercaseTaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:return(multiple_choice_classification.MultipleChoiceClassificationTrainer)caseTaskGroup.QUESTION_ANSWERING:returnquestion_answering.QuestionAnsweringTrainercase _:raiseNotImplementedError(f"Unsupported task group: {self.dataset_config.task.task_group}.")def prepare_dataset([docs]
self,dataset:DatasetDict,task:"Task",itr_idx:int)->DatasetDict:        """Prepare the dataset for the model.        This includes things like tokenisation.        Args:            dataset:                The dataset to prepare.            task:                The task to prepare the dataset for.            itr_idx:                The index of the dataset in the iterator.        Returns:            The prepared dataset.        """def numericalise_labels(examples:dict)->dict:if"label"inexamples:try:examples["label"]=[self._model.config.label2id[lbl.lower()]forlblinexamples["label"]]exceptKeyErrorase:raiseInvalidBenchmark(f"One of the labels in the dataset, "f"{examples['label'].lower()}, does not occur in the "f"label2id dictionary {self._model.config.label2id}.")from ereturnexamplesdef tokenise(examples:dict)->"BatchEncoding":returnself._tokeniser(text=examples["text"],truncation=True,padding=True)matchtask.task_group:caseTaskGroup.SEQUENCE_CLASSIFICATION:dataset=dataset.map(numericalise_labels,batched=True,load_from_cache_file=False).map(tokenise,batched=True,load_from_cache_file=False)caseTaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:dataset=DatasetDict({split_name:split.map(partial(multiple_choice_classification.prepare_examples,tokeniser=self._tokeniser,),batched=True,batch_size=10,remove_columns=split.column_names,load_from_cache_file=False,keep_in_memory=True,)forsplit_name,splitindataset.items()})caseTaskGroup.TEXT_TO_TEXT:dataset=dataset.map(tokenise,batched=True,load_from_cache_file=False,keep_in_memory=True,)caseTaskGroup.TOKEN_CLASSIFICATION:dataset=dataset.map(partial(token_classification.tokenize_and_align_labels,tokeniser=self._tokeniser,label2id=self._model.config.label2id,),batched=True,load_from_cache_file=False,keep_in_memory=True,)caseTaskGroup.QUESTION_ANSWERING:data_dict=dict()if"train"indataset:data_dict["train"]=dataset["train"].map(partial(question_answering.prepare_train_examples,tokeniser=self._tokeniser,),batched=True,batch_size=10,remove_columns=dataset["test"].column_names,load_from_cache_file=False,keep_in_memory=True,)if"val"indataset:data_dict["val"]=dataset["val"].map(partial(question_answering.prepare_train_examples,tokeniser=self._tokeniser,),batched=True,batch_size=10,remove_columns=dataset["test"].column_names,load_from_cache_file=False,keep_in_memory=True,)if"test"indataset:data_dict["test"]=dataset["test"].map(partial(question_answering.prepare_test_examples,tokeniser=self._tokeniser,),batched=True,batch_size=10,remove_columns=dataset["test"].column_names,load_from_cache_file=False,keep_in_memory=True,)dataset=DatasetDict(data_dict)# The Trainer hides the columns that are not used by the model (here# `id` and `offset_mapping` which we will need for our post-processing),# so we put them backforsplit_name,splitindataset.items():dataset[split_name].set_format(type=split.format["type"],columns=list(split.features.keys()))case _:raiseNotImplementedError(f"Unsupported task group: {task.task_group}.")returndataset@classmethoddef model_exists([docs]
cls,model_id:str,benchmark_config:"BenchmarkConfig")->bool|NeedsExtraInstalled|NeedsEnvironmentVariable:        """Check if a model exists.        Args:            model_id:                The model ID.            benchmark_config:                The benchmark configuration.        Returns:            Whether the model exists, or an error describing why we cannot check            whether the model exists.        """model_id_components=split_model_id(model_id=model_id)model_info=get_model_repo_info(model_id=model_id_components.model_id,revision=model_id_components.revision,api_key=benchmark_config.api_key,cache_dir=benchmark_config.cache_dir,trust_remote_code=benchmark_config.trust_remote_code,requires_safetensors=benchmark_config.requires_safetensors,run_with_cli=benchmark_config.run_with_cli,)return(model_infoisnotNoneandmodel_info.pipeline_tagnotinGENERATIVE_PIPELINE_TAGS)@classmethoddef get_model_config([docs]
cls,model_id:str,benchmark_config:"BenchmarkConfig")->"ModelConfig":        """Fetch the model configuration.        Args:            model_id:                The model ID.            benchmark_config:                The benchmark configuration.        Returns:            The model configuration.        """model_id_components=split_model_id(model_id=model_id)model_info=get_model_repo_info(model_id=model_id_components.model_id,revision=model_id_components.revision,api_key=benchmark_config.api_key,cache_dir=benchmark_config.cache_dir,trust_remote_code=benchmark_config.trust_remote_code,requires_safetensors=benchmark_config.requires_safetensors,run_with_cli=benchmark_config.run_with_cli,)ifmodel_infoisNone:raiseInvalidModel(f"The model {model_id!r} could not be found.")language_mapping=get_all_languages()language_codes=list(language_mapping.keys())model_config=ModelConfig(model_id=model_id_components.model_id,revision=model_id_components.revision,param=model_id_components.param,task=model_info.pipeline_tag,languages=[language_mapping[tag]fortaginmodel_info.tagsiftaginlanguage_codes],merge=any(taginmodel_info.tagsfortaginMERGE_TAGS),inference_backend=InferenceBackend.TRANSFORMERS,model_type=ModelType.ENCODER,fresh=False,model_cache_dir=create_model_cache_dir(cache_dir=benchmark_config.cache_dir,model_id=model_id),adapter_base_model_id=None,)returnmodel_configdef load_model_and_tokeniser([docs]
model_config:"ModelConfig",dataset_config:"DatasetConfig",benchmark_config:"BenchmarkConfig",)->tuple["PreTrainedModel","PreTrainedTokenizer"]:    """Load the model and tokeniser.    Args:        model_config:            The model configuration.        dataset_config:            The dataset configuration.        benchmark_config:            The benchmark configuration    Returns:        A pair (model, tokeniser), with the loaded model and tokeniser    """config:"PretrainedConfig"block_terminal_output()model_id=model_config.model_idtask_group=dataset_config.task.task_groupignore_mismatched_sizes=False# Special case where there is a mismatch between the labels during training and# testingifdataset_config.task.task_group==TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:id2label={0:"0",1:"1"}else:id2label=dataset_config.id2labelconfig=load_hf_model_config(model_id=model_id,num_labels=len(id2label),id2label=HashableDict(id2label),label2id=HashableDict({label:idxforidx,labelinid2label.items()}),revision=model_config.revision,model_cache_dir=model_config.model_cache_dir,api_key=benchmark_config.api_key,trust_remote_code=benchmark_config.trust_remote_code,run_with_cli=benchmark_config.run_with_cli,)model_kwargs=dict(config=config,ignore_mismatched_sizes=ignore_mismatched_sizes,revision=model_config.revision,token=get_hf_token(api_key=benchmark_config.api_key),cache_dir=model_config.model_cache_dir,trust_remote_code=benchmark_config.trust_remote_code,dtype=get_dtype(device=benchmark_config.device,dtype_is_set=config.to_dict().get("dtype")isnotNone,bf16_available=(torch.cuda.is_available()andtorch.cuda.is_bf16_supported()),),)model:"PreTrainedModel | None"=Nonefor_inrange(num_attempts:=5):# Get the model class associated with the task groupmodel_cls_or_none:t.Type["PreTrainedModel"]|None=get_class_by_name(class_name=task_group_to_class_name(task_group=task_group),module_name="transformers",)# If the model class could not be found then raise an errorifnotmodel_cls_or_none:raiseInvalidBenchmark(f"The task group {task_group.value!r} does not correspond to a ""Hugging Face AutoModel type (such as ""`AutoModelForSequenceClassification`).")# If the model is a DeBERTaV2 model then we ensure that# `pooler_hidden_size` is the same size as `hidden_size`ifconfig.model_type=="deberta-v2":config.pooler_hidden_size=config.hidden_sizetry:model_or_tuple=model_cls_or_none.from_pretrained(model_config.model_id,**model_kwargs)breakexcept(KeyError,RuntimeError)ase:ifnotmodel_kwargs["ignore_mismatched_sizes"]:log(f"{type(e).__name__} occurred during the loading "f"of the {model_id!r} model. Retrying with ""`ignore_mismatched_sizes` set to True.",level=logging.DEBUG,)model_kwargs["ignore_mismatched_sizes"]=Truecontinueelse:raiseInvalidModel(str(e))from eexcept(TimeoutError,RequestError):log(f"Couldn't load the model {model_id!r}. Retrying.",level=logging.WARNING,)sleep(5)continueexcept(OSError,ValueError)ase:if"checkpoint seems to be incorrect"instr(e):raiseInvalidModel(f"The model {model_id!r} has an incorrect checkpoint.")from eif"trust_remote_code"instr(e):raiseInvalidModel(f"Loading the model {model_id!r} needs to trust remote code. ""If you trust the suppliers of this model, then you can enable ""this by setting the `--trust-remote-code` flag.")from eraiseInvalidModel(f"The model {model_id!r} could not be loaded. The error was {e!r}.")from eelse:raiseInvalidModel(f"Could not load the model {model_id!r} after {num_attempts} attempts.")ifisinstance(model_or_tuple,tuple):model=model_or_tuple[0]else:model=model_or_tupleassertmodelisnotNone,"The model should not be None."model.eval()model.to(benchmark_config.device)# type: ignore[arg-type]if(isinstance(model,PreTrainedModel)andtask_group==TaskGroup.QUESTION_ANSWERING):model=setup_model_for_question_answering(model=model)tokeniser=load_tokeniser(model=model,model_id=model_id,trust_remote_code=benchmark_config.trust_remote_code,model_config=model_config,)returnmodel,tokeniser@cache_arguments("model_id","revision")def get_model_repo_info([docs]
model_id:str,revision:str,api_key:str|None,cache_dir:str,trust_remote_code:bool,requires_safetensors:bool,run_with_cli:bool,)->"HFModelInfo | None":    """Get the information about the model from the HF Hub or a local directory.    Args:        model_id:            The model ID.        revision:            The revision of the model.    Returns:        The information about the model, or None if the model could not be found.    """token=get_hf_token(api_key=api_key)hf_api=HfApi(token=token)# Get information on the model.# The first case is when the model is a local model, in which case we create a dummy# model info object.model_info:HfApiModelInfo|None=NoneifPath(model_id).is_dir():log(f"Checking for local model in {model_id}.",level=logging.DEBUG)ifall((Path(model_id)/required_file).exists()forrequired_fileinLOCAL_MODELS_REQUIRED_FILES):model_info=HfApiModelInfo(id=model_id,tags=None,pipeline_tag=None)# If we have not internet, and the model_id is not a directory for a local model# we also just create a dummy model info object.elifnotinternet_connection_available():model_info=HfApiModelInfo(id=model_id,tags=None,pipeline_tag=None)# If the model does not exist locally, then we get the model info from the Hugging# Face Hub, if possibleifmodel_infoisNone:num_attempts=3errors:list[Exception]=list()for_inrange(num_attempts):try:model_info=hf_api.model_info(repo_id=model_id,revision=revision,token=token)breakexcept(GatedRepoError,LocalTokenNotFoundError)ase:try:hf_whoami(token=token)log(f"Could not access the model {model_id} with the revision "f"{revision}. The error was {str(e)!r}.",level=logging.DEBUG,)returnNoneexceptLocalTokenNotFoundError:log(f"Could not access the model {model_id} with the revision "f"{revision}. The error was {str(e)!r}. Please set the ""`HUGGINGFACE_API_KEY` environment variable or use the ""`--api-key` argument.",level=logging.DEBUG,)returnNoneexcept(RepositoryNotFoundError,HFValidationError,HfHubHTTPError):returnNoneexcept(OSError,RequestException)ase:ifinternet_connection_available():errors.append(e)continuelog("Could not access the Hugging Face Hub. Please check your internet ""connection.",level=logging.DEBUG,)returnNoneelse:log(f"Could not access model info for the model {model_id!r} from the "f"Hugging Face Hub, after {num_attempts} attempts. The errors "f"encountered were {errors!r}.",level=logging.DEBUG,)returnNone# Get all the Hugging Face repository tags for the model. If the model is an adapter# model, then we also get the tags for the base modeltags=model_info.tagsorlist()base_model_id:str|None=Nonehas_adapter_config=model_info.siblingsisnotNoneandany(sibling.rfilename=="adapter_config.json"forsiblinginmodel_info.siblings)ifhas_adapter_config:adapter_config=PeftConfig.from_pretrained(model_id,revision=revision)base_model_id=adapter_config.base_model_name_or_pathlog_once(f"Model {model_id!r} identified as an adapter model, with base model "f"{base_model_id!r}.",level=logging.DEBUG,)ifbase_model_idisnotNone:base_model_info=hf_api.model_info(repo_id=base_model_id,token=token)tags+=base_model_info.tagsorlist()tags=list(set(tags))# Get the pipeline tag for the model. If it is not specified, then we determine it# by checking the model's architecture as written in the model's Hugging Face configpipeline_tag=model_info.pipeline_tagifpipeline_tagisNone:hf_config=load_hf_model_config(model_id=base_model_idormodel_id,num_labels=0,id2label=HashableDict(),label2id=HashableDict(),revision=revision,model_cache_dir=create_model_cache_dir(cache_dir=cache_dir,model_id=model_id),api_key=api_key,trust_remote_code=trust_remote_code,run_with_cli=run_with_cli,)class_names=hf_config.architecturesgenerative_class_names=[class_namefortaginGENERATIVE_PIPELINE_TAGSforclass_nameinTASK_MAPPING.get(tag,dict()).values()# type: ignore[attr-defined]]ifclass_namesisnotNoneandany(class_nameingenerative_class_namesforclass_nameinclass_names):pipeline_tag="text-generation"else:pipeline_tag="fill-mask"ifrequires_safetensors:repo_files=hf_api.list_repo_files(repo_id=model_id,revision=revision)has_safetensors=any(f.endswith(".safetensors")forfinrepo_files)ifnothas_safetensors:msg=f"Model {model_id} does not have safetensors weights available. "ifrun_with_cli:msg+="Skipping since the `--only-allow-safetensors` flag is set."else:msg+=("Skipping since the `requires_safetensors` argument is set ""to `True`.")log(msg,level=logging.WARNING)returnNone# Also check base model if we are evaluating an adapterifbase_model_idisnotNone:base_repo_files=hf_api.list_repo_files(repo_id=base_model_id)base_has_safetensors=any(f.endswith(".safetensors")forfinbase_repo_files)ifnotbase_has_safetensors:msg=(f"Base model {base_model_id} does not have safetensors weights ""available.")ifrun_with_cli:msg+=" Skipping since the `--only-allow-safetensors` flag is set."else:msg+=(" Skipping since the `requires_safetensors` argument is set ""to `True`.")logging.warning(msg)returnNonereturnHFModelInfo(pipeline_tag=pipeline_tag,tags=tags,adapter_base_model_id=base_model_id)def load_tokeniser([docs]
model:"PreTrainedModel | None",model_id:str,trust_remote_code:bool,model_config:"ModelConfig",)->"PreTrainedTokenizer":    """Load the tokeniser.    Args:        model:            The model, which is used to determine whether to add a prefix space to            the tokens. Can be None.        model_id:            The model identifier. Used for logging.        trust_remote_code:            Whether to trust remote code.        model_config:            The model configuration.    Returns:        The loaded tokeniser.    """loading_kwargs:dict[str,bool|str]=dict(use_fast=Falseifmodel_config.param=="slow-tokenizer"elseTrue,verbose=False,trust_remote_code=trust_remote_code,padding_side="right",truncation_side="right",cache_dir=model_config.model_cache_dir,)# If the model is a subclass of a certain model types then we have to add a prefix# space to the tokens, by the way the model is constructed.ifmodelisnotNone:prefix_models=["Roberta","GPT","Deberta"]add_prefix=any(model_typeintype(model).__name__formodel_typeinprefix_models)ifadd_prefix:loading_kwargs["add_prefix_space"]=Truenum_retries=5for_inrange(num_retries):try:tokeniser=AutoTokenizer.from_pretrained(model_id,**loading_kwargs)breakexcept(JSONDecodeError,OSError,TypeError)ase:raiseInvalidModel(f"Could not load tokeniser for model {model_id!r}.")from eexcept(TimeoutError,RequestError):log(f"Couldn't load tokeniser for {model_id!r}. Retrying.",level=logging.WARNING,)sleep(5)continueelse:raiseInvalidModel(f"Could not load tokeniser for model {model_id!r} after {num_retries} ""attempts.")# Ensure that BOS, EOS and PAD tokens are settokeniser.bos_token,tokeniser.bos_token_id=get_bos_token(tokeniser=tokeniser)tokeniser.eos_token,tokeniser.eos_token_id=get_eos_token(tokeniser=tokeniser)returntokeniser@cache_arguments()def get_dtype([docs]
device:torch.device,dtype_is_set:bool,bf16_available:bool)->str|torch.dtype:    """Get the torch dtype, used for loading the model.    Args:        device:            The device to use.        dtype_is_set:            Whether the data type is set in the model configuration.        bf16_available:            Whether bfloat16 is available.    Returns:        The dtype.    """using_cuda=device==torch.device("cuda")ifusing_cudaanddtype_is_set:return"auto"elifusing_cudaandbf16_available:returntorch.bfloat16elifusing_cuda:returntorch.float16returntorch.float32@cache_arguments("model_id","revision","num_labels","id2label","label2id")def load_hf_model_config([docs]
model_id:str,num_labels:int,id2label:dict[int,str],label2id:dict[str,int],revision:str,model_cache_dir:str|None,api_key:str|None,trust_remote_code:bool,run_with_cli:bool,)->"PretrainedConfig":    """Load the Hugging Face model configuration.    Args:        model_id:            The Hugging Face model ID.        num_labels:            The number of labels in the dataset.        id2label:            The mapping from label IDs to labels.        label2id:            The mapping from labels to label IDs.        revision:            The revision of the model.        model_cache_dir:            The directory to cache the model in.        api_key:            The Hugging Face API key.        trust_remote_code:            Whether to trust remote code.        run_with_cli:            Whether the script is being run with the CLI.    Returns:        The Hugging Face model configuration.    """for_inrange(num_attempts:=5):try:config=AutoConfig.from_pretrained(model_id,num_labels=num_labels,id2label=id2label,label2id=label2id,revision=revision,token=get_hf_token(api_key=api_key),trust_remote_code=trust_remote_code,cache_dir=model_cache_dir,local_files_only=notinternet_connection_available(),)breakexceptKeyErrorase:key=e.args[0]raiseInvalidModel(f"The model config for the model {model_id!r} could not be "f"loaded, as the key {key!r} was not found in the config.")from eexcept(OSError,GatedRepoError)ase:ifisinstance(e,GatedRepoError)or"gated repo"instr(e).lower():raiseInvalidModel(f"The model {model_id!r} is a gated repository. Please ensure ""that you are logged in with `hf auth login` or have provided a ""valid Hugging Face access token with the `HUGGINGFACE_API_KEY` ""environment variable or the `--api-key` argument. Also check that ""your account has access to this model.")from eraiseInvalidModel(f"Couldn't load model config for {model_id!r}. The error was "f"{e!r}. Skipping")from eexcept(TimeoutError,RequestError):log(f"Couldn't load model config for {model_id!r}. Retrying.",level=logging.WARNING,)sleep(5)continueexceptValueErrorase:if"awaiting a review from the repo authors"instr(e):raiseInvalidModel(f"The model {model_id!r} is awaiting a review from the repository ""authors. Please try again later.")from eif"trust_remote_code"instr(e):raiseNeedsAdditionalArgument(cli_argument="--trust-remote-code",script_argument="trust_remote_code=True",run_with_cli=run_with_cli,)from eraiseInvalidModel(f"The config for the model {model_id!r} could not be loaded. The "f"error was {e!r}.")from eelse:raiseInvalidModel(f"Couldn't load model config for {model_id!r} after {num_attempts} ""attempts.")# Ensure that the PAD token ID is setifconfig.eos_token_idisnotNoneandconfig.pad_token_idisNone:ifisinstance(config.eos_token_id,list):config.pad_token_id=config.eos_token_id[0]else:config.pad_token_id=config.eos_token_idreturnconfig[docs]
def setup_model_for_question_answering(model:"PreTrainedModel")->"PreTrainedModel":    """Setup a model for question answering.    Args:        model:            The model to setup.    Returns:        The setup model.    """# Get the models' token type embedding children, if they existchildren=get_children_of_module(name="model",module=model)assertisinstance(children,dict)# If the model has token type embeddings then get themifchildren:# Get the list of attributes that are token type embeddingsattribute_list=list()done=Falsewhilenotdone:forkey,valueinchildren.items():attribute_list.append(key)ifisinstance(value,dict):children=valueelse:done=Truebreak# Get the token type embeddingstoken_type_embeddings=modelforattributeinattribute_list:token_type_embeddings=getattr(token_type_embeddings,attribute)token_type_embedding_tensor=token_type_embeddings.weight.dataassertisinstance(token_type_embedding_tensor,torch.Tensor)# If the token type embeddings has shape (1, ...) then set the shape to# (2, ...) by randomly initializing the second token type embeddingiftoken_type_embedding_tensor.shape[0]==1:token_type_embeddings.weight.data=torch.cat((token_type_embedding_tensor,torch.rand_like(token_type_embedding_tensor),),dim=0,)token_type_embeddings.num_embeddings=2# type: ignore[assignment]# Set the model config to use the new type vocab sizemodel.config.type_vocab_size=2returnmodeldef get_children_of_module([docs]
name:str,module:nn.Module)->nn.Module|dict[str,t.Any]|None:    """Get the children of a module.    Args:        name:            The name of the module.        module:            The module to get the children of.    Returns:        The children of the module, or None if the module has no children.    """iflen(list(module.children()))==0:ifname=="token_type_embeddings":returnmoduleelse:returnNoneelse:submodules=dict()forsubname,submoduleinmodule.named_children():children=get_children_of_module(name=subname,module=submodule)ifchildren:submodules[subname]=childrenreturnsubmodulesdef align_model_and_tokeniser([docs]
model:"PreTrainedModel",tokeniser:"PreTrainedTokenizer",model_max_length:int,raise_errors:bool=False,)->tuple["PreTrainedModel","PreTrainedTokenizer"]:    """Aligns the model and the tokeniser.    Args:        model:            The model to fix.        tokeniser:            The tokeniser to fix.        model_max_length:            The maximum length of the model.        raise_errors:            Whether to raise errors instead of trying to fix them silently.    Returns:        The fixed model and tokeniser.    """model_max_length=min(model_max_length,MAX_CONTEXT_LENGTH)ifmodel_max_length>0:tokeniser.model_max_length=model_max_lengthelse:tokeniser.model_max_length=512# Move the model to the CPU, since otherwise we can't catch the IndexErrors when# finding the maximum sequence length of the modelmodel_device=model.devicemodel.to(torch.device("cpu"))# type: ignore[arg-type]# Manually check that this model max length is valid for the model, and adjust# otherwiseinitial_max_length=tokeniser.model_max_lengthformax_lengthinrange(initial_max_length,0,-1):tokeniser.model_max_length=max_lengthdummy_inputs=torch.full(size=(1,max_length),fill_value=DUMMY_FILL_VALUE,dtype=torch.long,device=model.device,)withtorch.inference_mode():try:model(dummy_inputs,attention_mask=torch.ones_like(dummy_inputs))break# This happens if `max_length` is too largeexceptIndexError:continueexceptValueErrorase:# This happens when the model is using Triton, such as with ModernBERT,# which doesn't work with CPU tensors at allif"cpu tensor"instr(e):breakelse:raisee# Move the model back to the original devicemodel.to(model_device)# type: ignore[arg-type]# If there is a mismatch between the vocab size according to the tokeniser and# the vocab size according to the model, we raise an errorifhasattr(model.config,"vocab_size"):ifmodel.config.vocab_size<len(tokeniser):ifraise_errors:raiseInvalidModel("The vocab size of the tokeniser is larger than the vocab size of ""the model. As the --raise-errors option was specified, the ""embeddings of the model will not be automatically adjusted.")ifhasattr(model,"resize_token_embeddings"):model.resize_token_embeddings(new_num_tokens=tokeniser.vocab_size+1)iftokeniser.bos_tokenisNoneandtokeniser.eos_tokenisnotNone:tokeniser.bos_token=tokeniser.eos_tokentokeniser.bos_token_id=tokeniser.eos_token_idreturnmodel,tokeniser@cache_arguments()def task_group_to_class_name(task_group:TaskGroup)->str:[docs]
    """Convert a task group to a class name.    Args:        task_group:            The task group.    Returns:        The class name.    """pascal_case=task_group.title().replace("_","")special_case_mapping=dict(MultipleChoiceClassification="SequenceClassification",Speed="SequenceClassification",)pascal_case=special_case_mapping.get(pascal_case,pascal_case)returnf"AutoModelFor{pascal_case}"