euroeval.benchmark_modules.litellm

[docs] module euroeval.benchmark_modules.litellm
"""Generative models from an inference API, using the LiteLLM framework."""import asyncioimport collections.abc as cimport loggingimport osimport reimport typing as tfrom functools importcached_property,partialfrom time importsleepimport litellmimport ollamafrom datasets importDatasetDictfrom huggingface_hub importHfApifrom huggingface_hub.errors import(HFValidationError,RepositoryNotFoundError,RevisionNotFoundError,)from litellm.exceptions import(APIConnectionError,APIError,AuthenticationError,BadRequestError,InternalServerError,NotFoundError,RateLimitError,ServiceUnavailableError,Timeout,)from litellm.llms.vertex_ai.common_utils importVertexAIErrorfrom litellm.router importRouterfrom litellm.types.utils importChoiceLogprobs,ModelResponsefrom pydantic importconlist,create_modelfrom requests.exceptions importRequestExceptionfrom tqdm.asyncio importtqdmastqdm_asyncfrom tqdm.auto importtqdmfrom transformers.trainer importTrainerfrom ..constants importMAX_LOGPROBS,REASONING_MAX_TOKENS,TASKS_USING_JSONfrom ..data_models import(BenchmarkConfig,DatasetConfig,GenerativeModelOutput,ModelConfig,Task,)from ..enums import(BatchingPreference,GenerativeType,InferenceBackend,ModelType,TaskGroup,)from ..exceptions import(InvalidBenchmark,InvalidModel,NeedsAdditionalArgument,NeedsEnvironmentVariable,NeedsExtraInstalled,)from ..generation_utils importapply_prompt,extract_few_shot_examplesfrom ..task_group_utils import(question_answering,sequence_classification,text_to_text,token_classification,)from ..tokenization_utils importget_first_label_token_mappingfrom ..types importExtractLabelsFunctionfrom ..utils import(add_semaphore_and_catch_exception,create_model_cache_dir,log_once,safe_run,)from .base importBenchmarkModulefrom .hf importHuggingFaceEncoderModel,load_hf_model_config,load_tokenizerlogger=logging.getLogger("euroeval")VOCAB_SIZE_MAPPING={# OpenAI modelsr"gpt-4-(32k)?(-[0-9]{4})?":100_256,r"gpt-4-[0-9]{4}-preview":100_256,r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?":100_256,r"gpt-4-(vision|turbo)(-preview)?":100_256,r"gpt-3.5-turbo-instruct(-[0-9]{4})?":100_256,r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?":200_019,r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?":-1,# Anthropic modelsr"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}":-1,# Gemini modelsr"(gemini/)?gemini-[1-9]\.[0-9]-(flash|pro).*":256_128,# xAI modelsr"(xai/)?grok.*":-1,}MODEL_MAX_LENGTH_MAPPING={# OpenAI modelsr"gpt-4(-[0-9]{4})?":8_191,r"gpt-4-32k(-[0-9]{4})?":32_767,r"gpt-4-[0-9]{4}-preview":128_000,r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?":128_000,r"gpt-4-(vision|turbo)(-preview)?":128_000,r"gpt-3.5-turbo-instruct(-[0-9]{4})?":4_095,r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?":128_000,r"o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?":128_000,r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?":200_000,r"o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?":200_000,r"gpt-4.1.*":1_047_576,# Anthropic modelsr"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}":200_000,# Gemini modelsr"(gemini/)?gemini-1\.5-flash.*":1_048_576,r"(gemini/)?gemini-1\.5-pro.*":2_097_152,r"(gemini/)?gemini-2\.(0|5).*":1_048_576,# xAI modelsr"(xai/)?grok.*":131_072,}NUM_PARAMS_MAPPING={# OpenAI modelsr"gpt-4.*":-1,r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?":-1,# Anthropic modelsr"(anthropic/)?claude-*":-1,# Gemini modelsr"(gemini/)?gemini-1.5-flash-8b":8_000_000_000,r"(gemini/)?gemini-1.5-flash-[0-9]+":-1,r"(gemini/)?gemini-2.(0|5).*":-1,# xAI modelsr"(xai/)?grok.*":-1,}ALLOWED_PARAMS={# OpenAI modelsr"gpt-4.*":[],r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?":["low","high"],# Anthropic modelsr"(anthropic/)?claude-3-(haiku|sonnet|opus).*":[],r"(anthropic/)?claude-3-5-.*":[],r"(anthropic/)?claude-3-7-sonnet.*":["thinking"],# Gemini modelsr"(gemini/)?gemini-.*":[],# xAI modelsr"(xai/)?grok-2.*":[],r"(xai/)?grok-3(-fast)?(-beta)?":[],r"(xai/)?grok-3-mini(-fast)?(-beta)?":["low","high"],}REASONING_MODELS=[r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",r"(gemini/)?gemini.*thinking.*",r"(gemini/)?gemini-2.5.*",r"(xai/)?grok-3-mini.*",]class LiteLLMModel(BenchmarkModule):[docs]
    """A generative model from LiteLLM."""fresh_model=Falsebatching_preference=BatchingPreference.ALL_AT_ONCEhigh_priority=False_handleable_exceptions=(BadRequestError,RateLimitError,APIError,APIConnectionError,Timeout,ServiceUnavailableError,InternalServerError,SystemError,AuthenticationError,)def __init__(self,model_config:ModelConfig,dataset_config:DatasetConfig,benchmark_config:BenchmarkConfig,)->None:        """Initialise the model.        Args:            model_config:                The model configuration.            dataset_config:                The dataset configuration.            benchmark_config:                The benchmark configuration.        """# Detect whether the model is an Ollama model, as we need to extract metadata# differently for these modelsself.is_ollama=model_config.model_id.startswith("ollama/")ormodel_config.model_id.startswith("ollama_chat/")self._ollama_show:ollama.ShowResponse=(ollama.show("/".join(model_config.model_id.split("/")[1:]))ifself.is_ollamaelseollama.ShowResponse(model_info=None))raise_if_wrong_params(model_config=model_config,allowed_params=ALLOWED_PARAMS)super().__init__(model_config=model_config,dataset_config=dataset_config,benchmark_config=benchmark_config,)self.buffer["first_label_token_mapping"]=get_first_label_token_mapping(dataset_config=self.dataset_config,model_config=self.model_config,tokenizer=None,generative_type=self.generative_type,)@propertydef generative_type(self)->GenerativeType|None:[docs]
        """Get the generative type of the model.        Returns:            The generative type of the model, or None if it has not been set yet.        """ifself.is_ollama:reasoning_model="thinking"in(self._ollama_show.capabilitiesor[])type_=(GenerativeType.REASONINGifreasoning_modelelseGenerativeType.INSTRUCTION_TUNED)elifself.model_config.revisionin{"thinking"}:type_=GenerativeType.REASONINGelifre.fullmatch(pattern="|".join(REASONING_MODELS),string=self.model_config.model_id):type_=GenerativeType.REASONINGelse:type_=GenerativeType.INSTRUCTION_TUNEDlog_once(f"Detected generative type {type_.name!r} for model "f"{self.model_config.model_id!r}",level=logging.DEBUG,)returntype_def generate(self,inputs:dict)->GenerativeModelOutput:[docs]
        """Generate outputs from the model.        Args:            inputs:                A batch of inputs to pass through the model.        Returns:            The generated model outputs.        """assert"messages"ininputs,"The input must contain a 'messages' key."conversations:list[list[litellm.AllMessageValues]]=inputs["messages"]# Get the mapping from labels to the first token in the label. We call this each# time we generate a new dataset since the dataset config can changeself.buffer["first_label_token_mapping"]=get_first_label_token_mapping(dataset_config=self.dataset_config,model_config=self.model_config,tokenizer=None,generative_type=self.generative_type,)# Set the core generation argumentsgeneration_kwargs:dict[str,t.Any]=dict(model=self.model_config.model_id,max_completion_tokens=(REASONING_MAX_TOKENSifself.generative_type==GenerativeType.REASONINGelseself.dataset_config.max_generated_tokens),stop=[],temperature=0.0,seed=4242,api_key=self.benchmark_config.api_key,api_base=self.benchmark_config.api_base,api_version=self.benchmark_config.api_version,max_retries=3,)# Set up the `response_format` generation argument if we are dealing with a task# using structured generationifself.dataset_config.taskinTASKS_USING_JSON:# Sanity check that "JSON" is included in the prompt, as some models require# thisforconversationinconversations:ifnotconversation:raiseInvalidBenchmark("Encountered an empty conversation in 'messages'.")last_message=conversation[-1]assertisinstance(last_message,dict),(f"Expected dict message, got {type(last_message)}")assert"content"inlast_message,("Expected 'content' key in the last message of the conversation.")assertisinstance(last_message["content"],str),("Expected 'content' to be a string.")assert"json"inlast_message["content"].lower(),("Prompt must contain 'json' for JSON tasks.")ifself.generative_type==GenerativeType.REASONING:log_once(f"The model {self.model_config.model_id!r} is a reasoning model ""and thus does not support structured generation, so we do not ""enable it.",level=logging.DEBUG,)eliflitellm.utils.supports_response_schema(model=self.model_config.model_id):ner_tag_names=list(self.dataset_config.prompt_label_mapping.values())keys_and_their_types:dict[str,t.Any]={tag_name:(conlist(str,max_length=5),...)fortag_nameinner_tag_names}pydantic_class=create_model("AnswerFormat",**keys_and_their_types)generation_kwargs["response_format"]=pydantic_classlog_once("Enabling structured generation for model "f"{self.model_config.model_id!r} with the JSON schema "f"{pydantic_class.model_json_schema()}",level=logging.DEBUG,)else:generation_kwargs["response_format"]=dict(type="json_object")log_once("Enabling structured JSON generation for model "f"{self.model_config.model_id!r} with no custom JSON schema, as ""the model does not support schemas.",level=logging.DEBUG,)# If the model is an Ollama reasoning model, we ensure that thinking is enabledifself.is_ollamaandself.generative_type==GenerativeType.REASONING:generation_kwargs["think"]=Truelog_once("Enabling thinking mode for Ollama model "f"{self.model_config.model_id!r}",level=logging.DEBUG,)# Handle manually set parametersifself.buffer["first_label_token_mapping"]:generation_kwargs["logprobs"]=Truegeneration_kwargs["top_logprobs"]=MAX_LOGPROBSifself.model_config.revision=="thinking":generation_kwargs["thinking"]=dict(type="enabled",budget_tokens=REASONING_MAX_TOKENS-1)log_once(f"Enabling thinking mode for model {self.model_config.model_id!r}",level=logging.DEBUG,)elifself.model_config.revisionin{"low","high"}:generation_kwargs["reasoning_effort"]=self.model_config.revisionlog_once(f"Enabling reasoning effort {self.model_config.revision!r} for model "f"{self.model_config.model_id!r}",level=logging.DEBUG,)# Drop generation kwargs that are not supported by the modellitellm.drop_params=Trueall_responses:dict[int,ModelResponse]={}conversations_to_run:list[tuple[int,list[litellm.AllMessageValues]]]=list(enumerate(conversations))forattemptinrange(num_attempts:=10):ifnotconversations_to_run:breakbatch_indices,batch_conversations=zip(*conversations_to_run)successes,failures=safe_run(self._generate_async(model_id=self.model_config.model_id,conversations=list(batch_conversations),**generation_kwargs,))# Store the successful model outputsforidx,responseinsuccesses:orig_idx=batch_indices[idx]all_responses[orig_idx]=response# If all requests were successful, breakifnotfailures:conversations_to_run=[]break# Put the failed requests back in the queue to try againconversations_to_run=[(batch_indices[idx],conversations[batch_indices[idx]])foridx,_infailures]logger.debug(f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "f"{len(conversations_to_run):,} failed message(s)")# Attempt to handle the exceptions, to improve the chance of getting# successful generations next time aroundfor_,errorinfailures:self._handle_exception(error=error,generation_kwargs=generation_kwargs)# Sleep for a second to avoid pinging the API server too quicklysleep(1)else:raiseInvalidBenchmark(message=f"Failed to generate text, after {num_attempts:,} attempts.")# Extract the generations from the model outputordered_responses=[all_responses[i]foriinrange(len(conversations))]model_output=self._create_model_output(model_responses=ordered_responses,model_id=self.model_config.model_id)iflen(conversations)!=len(model_output.sequences):raiseInvalidBenchmark(f"Number of model inputs ({len(conversations):,}) does not match the "f"number of model outputs ({len(model_output.sequences):,}).")returnmodel_outputdef _handle_exception(self,error:Exception,generation_kwargs:dict[str,t.Any])->None:        """Handle an exception from the model.        Args:            error:                The exception to handle.            generation_kwargs:                The generation kwargs to pass to the model.        """error_msg=str(error).lower()model_id=self.model_config.model_id# Error messages that we want to catch and handlestop_messages=["stop_sequences","'stop' is not supported with this model"]logprobs_messages=["you are not allowed to request logprobs","you've reached the maximum number of requests with logprobs","logprobs is not supported","logprobs is not enabled",]temperature_messages=["'temperature' is not supported with this model.","temperature is not supported with this model",]temperature_must_be_one_messages=["`temperature` may only be set to 1","'temperature' does not support 0.0 with this model. Only the default ""(1) value is supported",]max_items_messages=["'maxItems' is not permitted."]no_json_schema_messages=["Property keys should match pattern"]ifany(msg.lower()inerror_msgformsginstop_messages):log_once(f"The model {model_id!r} does not support ""stop sequences, so disabling them.",level=logging.DEBUG,)generation_kwargs["stop"]=Nonereturnelif(any(msg.lower()inerror_msgformsginlogprobs_messages)# Special case for Vertex AI models, since they have strict rate# limits on using logprobs. They also have a cap of 5 logprobs, but# we ignore this since the rate limiting makes it unusable anyway.or(isinstance(error,VertexAIError)and"logprobs"inerror_msg)):log_once(f"The model {model_id!r} does not support logprobs, so disabling it.",level=logging.DEBUG,)generation_kwargs.pop("logprobs",None)generation_kwargs.pop("top_logprobs",None)returnelifany(msg.lower()inerror_msgformsgintemperature_messages):log_once(f"The model {model_id!r} does not support ""temperature, so disabling it.",level=logging.DEBUG,)generation_kwargs.pop("temperature",None)returnelifany(msg.lower()inerror_msgformsgintemperature_must_be_one_messages):log_once(f"The model {model_id!r} requires ""temperature to be set to 1, so setting it.",level=logging.DEBUG,)generation_kwargs["temperature"]=1.0returnelifany(msg.lower()inerror_msgformsginmax_items_messages):log_once(f"The model {model_id!r} does not support ""maxItems in the JSON schema, so disabling it.",level=logging.DEBUG,)ner_tag_names=list(self.dataset_config.prompt_label_mapping.values())keys_and_their_types={tag_name:(list[str],...)fortag_nameinner_tag_names}pydantic_class=create_model("AnswerFormat",**keys_and_their_types)generation_kwargs["response_format"]=pydantic_classreturnelifany(msg.lower()inerror_msgformsginno_json_schema_messages):log_once(f"The model {self.model_config.model_id!r} does not support ""JSON schemas, so using the vanilla JSON format.",level=logging.DEBUG,)generation_kwargs["response_format"]=dict(type="json_object")returnelifisinstance(error,(Timeout,ServiceUnavailableError,InternalServerError,SystemError)):logger.debug(f"Service temporarily unavailable. The error message was: {error}. "f"Retrying in 5 seconds...")sleep(5)returnelifisinstance(error,(APIConnectionError,OSError)):# If there are too many I/O connections, we increase the number of allowed# file descriptorsif"too many open files"inerror_msg:raiseInvalidBenchmark("There are too many file descriptors running. See the current ""value by running `ulimit -n`. Try increasing it by running ""`ulimit -n <new-value>` and try again.")raiseInvalidBenchmark(f"Encountered {type(error)} during generation: {error}.")ifisinstance(error,RateLimitError):raiseInvalidModel(f"You have encountered your rate limit for model {model_id!r}. ""Skipping.")ifisinstance(error,AuthenticationError):raiseNeedsAdditionalArgument(cli_argument="--api-key",script_argument="api_key=<your-api-key>",run_with_cli=self.benchmark_config.run_with_cli,)raiseInvalidBenchmark(f"Failed to generate text. The error message was: {error}")asyncdef _generate_async(self,model_id:str,conversations:list[list[litellm.AllMessageValues]],**generation_kwargs,)->tuple[list[tuple[int,ModelResponse]],list[tuple[int,Exception]]]:        """Generate outputs from the model asynchronously.        Args:            model_id:                The ID of the model to use for generation.            conversations:                The conversations to pass to the model.            **generation_kwargs:                Additional generation arguments to pass to the model.        Returns:            A tuple (successes, failures), each being a list of tuples (idx, content),            where the `idx` corresponds to the index of `conversations`, and `content`            is either the model response or an Exception.        """# Create a LiteLLM router, which will ensure that we only use a single client# for all the requests, preventing "too many open files" errorsrouter=Router(model_list=[dict(model_name=self.model_config.model_id,litellm_params=generation_kwargs,)])# Get the LLM generations asynchronouslymax_concurrent_calls=20semaphore=asyncio.Semaphore(max_concurrent_calls)requests=[add_semaphore_and_catch_exception(router.acompletion(model=model_id,messages=conversation),semaphore=semaphore,)forconversationinconversations]responses=awaittqdm_async.gather(*requests,leave=False)# Separate the successful responses from the failed onessuccesses=[(idx,response)foridx,responseinenumerate(responses)ifnotisinstance(response,Exception)]failures=[(idx,response)foridx,responseinenumerate(responses)ifisinstance(response,Exception)]# Close connectionsforrequestinrequests:ifhasattr(request,"close"):request.close()returnsuccesses,failures@staticmethoddef _create_model_output(model_responses:list[ModelResponse],model_id:str)->GenerativeModelOutput:        """Create a GenerativeModelOutput object from a list of ModelResponse objects.        Args:            model_responses:                The list of ModelResponse objects to create the GenerativeModelOutput                object from.            model_id:                The ID of the model.        Returns:            A GenerativeModelOutput object.        """sequences=[]scores=[]formodel_responseinmodel_responses:ifnotmodel_response.choices:sequences.append("")logger.warning(f"The model {model_id!r} did not end up ""generating any text. This is likely because the model ran ""out of tokens while reasoning. Returning an empty string.")continuemodel_response_choices=model_response.choices[0]assertisinstance(model_response_choices,litellm.Choices)generated_message:litellm.Message=model_response_choices.messagegeneration_output=generated_message.contentor""generation_output=generation_output.strip()# Structure the model output as a GenerativeModelOutput objectsequences.append(generation_output)ifhasattr(model_response_choices,"logprobs"):logprobs_obj=model_response_choices.logprobsifisinstance(logprobs_obj,ChoiceLogprobs):logprobs_list:list[list[tuple[str,float]]]=[[(top_logprob.token,top_logprob.logprob)fortop_logprobincontent.top_logprobs]forcontentinmodel_response_choices.logprobs.contentorlist()]scores.append(logprobs_list)else:log_once("The logprobs object is malformed, so we won't use logprobs to ""determine the labels.",level=logging.WARNING,)ifnotsequences:logger.warning("No sequences were generated by the model "f"{model_id!r}. This may be due to the ""model running out of tokens or an issue with the input data. ""Returning an empty GenerativeModelOutput.")returnGenerativeModelOutput(sequences=[],scores=None)ifscoresandlen(sequences)!=len(scores):raiseInvalidBenchmark("Sequences and scores must have the same length. "f"Got {len(sequences)} sequences and {len(scores)} scores.")returnGenerativeModelOutput(sequences=sequences,scores=scoresifscoreselseNone)@cached_propertydef num_params(self)->int:[docs]
        """The number of parameters in the model.        Returns:            The number of parameters in the model.        """# Start by trying out the regex mapping, and use the value if it matchesforkey,valueinNUM_PARAMS_MAPPING.items():ifre.fullmatch(pattern=key,string=self.model_config.model_id)isnotNone:returnvalue# If it is an Ollama model then we can get the number of parameters from the# Ollama Python SDKifself.is_ollama:model_info=self._ollama_show.modelinfoifmodel_infoisnotNone:num_params=model_info.get("general.parameter_count")ifnum_paramsisnotNone:returnint(num_params)# If it is a model accessed through the Hugging Face inference API then we can# get the number of parameters from the Hugging Face model configuration from# the Hugging Face Hubifself.model_config.model_id.startswith("huggingface/"):model_id="/".join(self.model_config.model_id.split(sep="/")[-2:])ifHuggingFaceEncoderModel.model_exists(model_id=model_id,benchmark_config=self.benchmark_config):hf_config=load_hf_model_config(model_id=model_id,num_labels=self.dataset_config.num_labels,id2label=self.dataset_config.id2label,label2id=self.dataset_config.label2id,revision="main",model_cache_dir=self.model_config.model_cache_dir,api_key=self.benchmark_config.api_key,trust_remote_code=self.benchmark_config.trust_remote_code,run_with_cli=self.benchmark_config.run_with_cli,)hf_api=HfApi()try:repo_info=hf_api.model_info(repo_id=model_id,revision="main",token=os.getenv("HUGGINGFACE_API_KEY")orself.benchmark_config.api_keyorTrue,)except(RepositoryNotFoundError,RevisionNotFoundError,RequestException,HFValidationError,):repo_info=Noneif(repo_infoisnotNoneandhasattr(repo_info,"safetensors")andrepo_info.safetensorsisnotNoneand"total"inrepo_info.safetensors):returnrepo_info.safetensors["total"]elif(hasattr(hf_config,"num_params")andhf_config.num_paramsisnotNone):returnhf_config.num_paramsreturn-1@cached_propertydef vocab_size(self)->int:[docs]
        """The vocabulary size of the model.        Returns:            The vocabulary size of the model.        """# Start by trying out the regex mapping, and use the value if it matchesforkey,valueinVOCAB_SIZE_MAPPING.items():ifre.fullmatch(pattern=key,string=self.model_config.model_id)isnotNone:returnvalue# If it is a model accessed through the Hugging Face inference API then we can# get the vocabulary size from the Hugging Face model configuration from the# Hugging Face Hubifself.model_config.model_id.startswith("huggingface/"):model_id="/".join(self.model_config.model_id.split(sep="/")[-2:])ifHuggingFaceEncoderModel.model_exists(model_id=model_id,benchmark_config=self.benchmark_config):hf_config=load_hf_model_config(model_id=model_id,num_labels=self.dataset_config.num_labels,id2label=self.dataset_config.id2label,label2id=self.dataset_config.label2id,revision="main",model_cache_dir=self.model_config.model_cache_dir,api_key=self.benchmark_config.api_key,trust_remote_code=self.benchmark_config.trust_remote_code,run_with_cli=self.benchmark_config.run_with_cli,)tokenizer=load_tokenizer(model=None,model_id=model_id,trust_remote_code=self.benchmark_config.trust_remote_code,)if(hasattr(hf_config,"vocab_size")andhf_config.vocab_sizeisnotNone):vocab_size=hf_config.vocab_sizeelif(hasattr(tokenizer,"vocab_size")andtokenizer.vocab_sizeisnotNone):vocab_size=tokenizer.vocab_sizeelse:vocab_size=-1returnvocab_sizereturn-1@cached_propertydef model_max_length(self)->int:[docs]
        """The maximum length of the model.        Returns:            The maximum length of the model.        """# Start by trying out the regex mapping, and use the value if it matchesforkey,valueinMODEL_MAX_LENGTH_MAPPING.items():ifre.fullmatch(pattern=key,string=self.model_config.model_id)isnotNone:returnvalue# If it is an Ollama model then we can get the maximum length from the Ollama# Python SDKifself.is_ollama:ollama_model_id="/".join(self.model_config.model_id.split("/")[1:])model_info=self._ollama_show.modelinfoifmodel_infoisnotNone:context_length_keys=[keyforkeyinmodel_info.keys()if"context_length"inkey.lower()]ifcontext_length_keys:context_length=model_info[context_length_keys[0]]ifcontext_lengthisnotNone:log_once(f"Detected context length key {context_length_keys[0]!r} "f"for Ollama model {ollama_model_id!r}",level=logging.DEBUG,)returnint(context_length)else:log_once(f"Tried to get the maximum length of the Ollama model "f"{ollama_model_id!r}, but could not find a context length. "f"The model info was {model_info}. Returning -1",level=logging.DEBUG,)# If it is a model accessed through the Hugging Face inference API then we can# get the maximum length from the Hugging Face model configuration from the# Hugging Face Hubifself.model_config.model_id.startswith("huggingface/"):model_id="/".join(self.model_config.model_id.split(sep="/")[-2:])ifHuggingFaceEncoderModel.model_exists(model_id=model_id,benchmark_config=self.benchmark_config):hf_config=load_hf_model_config(model_id=model_id,num_labels=self.dataset_config.num_labels,id2label=self.dataset_config.id2label,label2id=self.dataset_config.label2id,revision="main",model_cache_dir=self.model_config.model_cache_dir,api_key=self.benchmark_config.api_key,trust_remote_code=self.benchmark_config.trust_remote_code,run_with_cli=self.benchmark_config.run_with_cli,)tokenizer=load_tokenizer(model=None,model_id=model_id,trust_remote_code=self.benchmark_config.trust_remote_code,)all_max_lengths:list[int]=list()# Add the registered max length of the tokenizerifhasattr(tokenizer,"model_max_length")andtokenizer.model_max_length<int(1e30):all_max_lengths.append(tokenizer.model_max_length)# Add the max length derived from the model's input sizesifhasattr(tokenizer,"max_model_input_sizes"):all_max_lengths.extend([sizeforsizeintokenizer.max_model_input_sizes.values()ifsizeisnotNone])# Add max length candidates from the model's configurationcandidate_config_max_lengths=["max_position_embeddings","max_sequence_length","model_max_length","sliding_window","sliding_window_size","n_positions",]forcandidate_config_max_lengthincandidate_config_max_lengths:if(hasattr(hf_config,candidate_config_max_length)and(value:=getattr(hf_config,candidate_config_max_length))isnotNone):all_max_lengths.append(value)# To avoid models having artificially low max lengths, we remove any max# lengths that are less than 128all_max_lengths=[max_lengthformax_lengthinall_max_lengthsifmax_length>=128]iflen(list(all_max_lengths))>0:returnmin(list(all_max_lengths))return-1@property[docs]
def data_collator(self)->c.Callable[[list[t.Any]],dict[str,t.Any]]:        """The data collator used to prepare samples during finetuning.        Returns:            The data collator.        """raiseNotImplementedError("The `data_collator` property has not been implemented for LiteLLM models.")@propertydef extract_labels_from_generation(self)->ExtractLabelsFunction:[docs]
        """The function used to extract the labels from the generated output.        Returns:            The function used to extract the labels from the generated output.        """matchself.dataset_config.task.task_group:case(TaskGroup.SEQUENCE_CLASSIFICATION|TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION):returnpartial(sequence_classification.extract_labels_from_generation,dataset_config=self.dataset_config,first_label_token_mapping=self.buffer["first_label_token_mapping"],)caseTaskGroup.TEXT_TO_TEXT:returntext_to_text.extract_labels_from_generationcaseTaskGroup.TOKEN_CLASSIFICATION:returnpartial(token_classification.extract_labels_from_generation,dataset_config=self.dataset_config,)caseTaskGroup.QUESTION_ANSWERING:returnquestion_answering.extract_labels_from_generationcase _:raiseNotImplementedError(f"Unsupported task group: {self.dataset_config.task.task_group}.")@propertydef trainer_class(self)->t.Type["Trainer"]:[docs]
        """The Trainer class to use for finetuning.        Returns:            The Trainer class.        """raiseNotImplementedError("The `trainer_class` property has not been implemented for LiteLLM models.")@classmethoddef model_exists([docs]
cls,model_id:str,benchmark_config:BenchmarkConfig)->bool|NeedsExtraInstalled|NeedsEnvironmentVariable:        """Check if a model exists.        Args:            model_id:                The model ID.            benchmark_config:                The benchmark configuration.        Returns:            Whether the model exists, or an error describing why we cannot check            whether the model exists.        """model_id,_=model_id.split("@")if"@"inmodel_idelse(model_id,"main")ifmodel_idinlitellm.model_list:returnTrue# Separate check for Ollama modelsifmodel_id.startswith("ollama/")ormodel_id.startswith("ollama_chat/"):ollama_model_exists=try_download_ollama_model(model_id=model_id)ifollama_model_exists:returnollama_model_existsnum_attempts=10for_inrange(num_attempts):try:litellm.completion(messages=[dict(role="user",content="X")],model=model_id,max_tokens=1,api_key=benchmark_config.api_key,api_base=benchmark_config.api_base,api_version=benchmark_config.api_version,)returnTrue# A rate limit indicates that the model *does* exist, but we are being rate# limited.exceptRateLimitError:returnTrueexcept(APIConnectionError,Timeout,ServiceUnavailableError,InternalServerError,)ase:logger.debug(f"Service temporarily unavailable. The error message was: {e}. ""Retrying in 10 seconds...")sleep(5)exceptAPIErrorase:if"'503 Service Unavailable"notinstr(e):raiseelogger.warning(f"Failed to check if model {model_id!r} exists. Retrying in 10 ""seconds...")sleep(10)except(BadRequestError,NotFoundError):candidate_models=[candidate_model_idforcandidate_model_idinlitellm.model_listifcandidate_model_id.startswith(model_id)]matchlen(candidate_models):case0:passcase1:logger.warning(f"Could not find the model ID {model_id!r}. Did you mean "f"{candidate_models[0]!r}?")case _:candidate_models_str="', '".join(candidate_models)logger.warning(f"Could not find the model ID {model_id!r}. Did you mean "f"any of the following model IDs: '{candidate_models_str}'?")returnFalseelse:logger.error(f"Failed to check if model {model_id!r} exists after {num_attempts} ""attempts. Assuming it does not exist.")returnFalse@classmethoddef get_model_config([docs]
cls,model_id:str,benchmark_config:BenchmarkConfig)->ModelConfig:        """Fetch the model configuration.        Args:            model_id:                The model ID.            benchmark_config:                The benchmark configuration.        Returns:            The model configuration.        """model_id,revision=model_id.split("@")if"@"inmodel_idelse(model_id,"")returnModelConfig(model_id=model_id,revision=revision,task="text-generation",languages=list(),merge=False,inference_backend=InferenceBackend.LITELLM,model_type=ModelType.GENERATIVE,fresh=False,model_cache_dir=create_model_cache_dir(cache_dir=benchmark_config.cache_dir,model_id=model_id),adapter_base_model_id=None,)def prepare_dataset([docs]
self,dataset:DatasetDict,task:Task,itr_idx:int)->DatasetDict:        """Prepare the dataset for the model.        This includes things like tokenisation.        Args:            dataset:                The dataset to prepare.            task:                The task to prepare the dataset for.            itr_idx:                The index of the dataset in the iterator.        Returns:            The prepared dataset.        """iftask.task_group==TaskGroup.QUESTION_ANSWERING:dataset=dataset.map(lambdaexamples:dict(label=[dict(id=id,answers=dict(answer_start=answer_dct["answer_start"],text=[answer_text.lower()foranswer_textinanswer_dct["text"]],),)forid,answer_dctinzip(examples["id"],examples["answers"])]),batched=True,load_from_cache_file=False,keep_in_memory=True,)ifself.benchmark_config.few_shot:few_shot_examples=extract_few_shot_examples(dataset=dataset,dataset_config=self.dataset_config,itr_idx=itr_idx)else:few_shot_examples=list()dataset["test"]=dataset["test"].map(partial(apply_prompt,few_shot_examples=few_shot_examples,model_config=self.model_config,dataset_config=self.dataset_config,instruction_model=True,always_populate_text_field=False,tokenizer=None,),batched=True,load_from_cache_file=False,keep_in_memory=True,)returndatasetdef raise_if_wrong_params([docs]
model_config:ModelConfig,allowed_params:dict[str,list[str]])->None:    """Raise an error if the model configuration has invalid parameters.    Args:        model_config:            The model configuration.        allowed_params:            The allowed parameters for the model.    Raises:        InvalidModel:            If the model configuration has invalid parameters.    """param=model_config.revisionifparam=="":returnformodel_regex,allowed_params_listinallowed_params.items():ifre.fullmatch(pattern=model_regex,string=model_config.model_id):ifparamnotinallowed_params_list:msg=(f"Invalid parameter {param!r} for model {model_config.model_id!r}.")ifallowed_params_list:msg+=f" Allowed parameters are: {', '.join(allowed_params_list)}."else:msg+=" No parameters are allowed."raiseInvalidModel(msg)returndef try_download_ollama_model(model_id:str)->bool:[docs]
    """Try to download an Ollama model.    Args:        model_id:            The model ID. If the model does not start with "ollama/" or "ollama_chat/"            then this function will return False.    Returns:        Whether the model was downloaded successfully.    Raises:        InvalidModel:            If Ollama is not running or the model cannot be downloaded.    """ifnot(model_id.startswith("ollama/")ormodel_id.startswith("ollama_chat/")):returnFalseifmodel_id.startswith("ollama/"):log_once("You're trying to benchmark a model with the old 'ollama/' prefix, which ""probably results in bad performance, as it doesn't use the model's chat ""template. If the model is not a chat model then just disregard this ""warning, but if it is a chat model then please cancel this run and ""use the 'ollama_chat/' prefix instead.",level=logging.WARNING,)try:downloaded_ollama_models:list[str]=[model_obj.modelformodel_objinollama.list().modelsifmodel_obj.modelisnotNone]exceptConnectionError:raiseInvalidModel("Ollama does not seem to be running, so we cannot evaluate the model "f"{model_id!r}. Please make sure that Ollama is running and try again.")ollama_model_id="/".join(model_id.split("/")[1:])ifollama_model_idnotindownloaded_ollama_models:# Try fetching the model infotry:response=ollama.pull(model=ollama_model_id,stream=True)exceptollama.ResponseErrorase:if"file does not exist"instr(e).lower():# Check if the model exists if we prepend "hf.co/"try:ollama_model_id_with_prefix=f"hf.co/{ollama_model_id}"model_id_with_prefix=(f"{model_id.split('/')[0]}/{ollama_model_id_with_prefix}")ollama.pull(model=ollama_model_id_with_prefix,stream=True)log_once(f"The model {model_id!r} cannot be found on Ollama, but the "f"model {model_id_with_prefix} *was* found, so we would ""recommend you cancelling this run and trying the evaluation ""with that model ID instead.")returnFalseexceptollama.ResponseErrorasinner_e:if"file does not exist"instr(inner_e).lower():returnFalseelse:raiseInvalidModel(f"Failed to download Ollama model {ollama_model_id}. "f"The error message was: {inner_e}")else:raiseInvalidModel(f"Failed to download Ollama model {ollama_model_id}. "f"The error message was: {e}")# Download the modelwithtqdm(desc=f"Downloading {ollama_model_id}",unit_scale=True,unit="B",leave=False,)aspbar:forstatusinresponse:ifstatus.totalisnotNone:pbar.total=status.totalifstatus.completedisnotNone:pbar.update(status.completed-pbar.n)returnTrueelse:log_once(f"Ollama model {ollama_model_id!r} already downloaded, so skipping ""download.",level=logging.DEBUG,)returnTrue