euroeval.speed_benchmark

"""Benchmarking model inference speed."""

import collections.abc as c
import logging
import typing as t

import pyinfer
from transformers.models.auto.tokenization_auto import AutoTokenizer

from .benchmark_modules import HuggingFaceEncoderModel, LiteLLMModel, VLLMModel
from .exceptions import InvalidBenchmark
from .logging_utils import get_pbar, log
from .utils import clear_memory

if t.TYPE_CHECKING:
    from .benchmark_modules import BenchmarkModule
    from .data_models import BenchmarkConfig


def benchmark_speed(
    model: "BenchmarkModule", benchmark_config: "BenchmarkConfig"
) -> c.Sequence[dict[str, float]]:
    """Benchmark model inference speed.

    Args:
        model:
            Model to use.
        benchmark_config:
            Configuration for the benchmark.

    Returns:
        A list of dictionaries, one per iteration, each containing the scores for
        that iteration.
    """
    scores: list[dict[str, float]] = list()
    for idx in get_pbar(
        iterable=range(benchmark_config.num_iterations),
        desc="Benchmarking",
        disable=not benchmark_config.progress_bar,
    ):
        itr_scores = benchmark_speed_single_iteration(model=model, itr_idx=idx)
        clear_memory()
        scores.append(itr_scores)
        log(f"Scores for iteration {idx}: {itr_scores}", level=logging.DEBUG)
    return scores


def benchmark_speed_single_iteration(
    model: "BenchmarkModule", itr_idx: int
) -> dict[str, float]:
    """Run a single iteration of the speed benchmark.

    Args:
        model:
            The model to use in the benchmark.
        itr_idx:
            The index of the iteration.

    Returns:
        A dictionary containing the scores for the current iteration.
    """
    gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)

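    # Build a long document and a shorter one whose lengths grow with the iteration index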
    base_doc = "Document which contains roughly 10 tokens. "
    multiplier = 10 * (1 + itr_idx)
    doc = base_doc * multiplier
    short_multiplier = 1.25 * (1 + itr_idx)
    short_doc = base_doc * round(short_multiplier)

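    # Prediction callables for each supported model backend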
    def generate_messages_predict(doc: str) -> None:
        model.generate(inputs=dict(messages=[[dict(role="user", content=doc)]]))

    def generate_prompt_predict(doc: str) -> None:
        model.generate(inputs=dict(text=[doc]))

    def encoder_predict(doc: str) -> None:
        tokeniser = model.get_tokeniser()
        pytorch_model = model.get_pytorch_module()
        inputs = {
            key: tensor.to(pytorch_model.device)
            for key, tensor in tokeniser(
                text=[doc], truncation=True, return_tensors="pt"
            ).items()
        }
        pytorch_model(**inputs)

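    # Select the prediction function that matches the model backend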
    if isinstance(model, VLLMModel):
        predict = generate_prompt_predict
    elif isinstance(model, LiteLLMModel):
        predict = generate_messages_predict
    elif isinstance(model, HuggingFaceEncoderModel):
        predict = encoder_predict
    else:
        raise ValueError(f"Model type {model} not supported for speed benchmark")

    try:
        # Do a warmup run, as the first run is always slower
        pyinfer.InferenceReport(model=predict, inputs=base_doc, n_seconds=1).run(
            print_report=False
        )

        speed_scores = pyinfer.InferenceReport(
            model=predict, inputs=doc, n_seconds=3
        ).run(print_report=False)
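        # Convert inferences per second into GPT-2 tokens per second for the long document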
        num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])
        gpt2_tokens_per_second = speed_scores["Infer(p/sec)"] * num_gpt2_tokens

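        # Repeat the measurement with the shorter document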
        speed_scores_short = pyinfer.InferenceReport(
            model=predict, inputs=short_doc, n_seconds=3
        ).run(print_report=False)
        num_gpt2_tokens_short = len(
            gpt2_tokeniser([short_doc], truncation=True)["input_ids"][0]
        )
        gpt2_tokens_per_second_short = (
            speed_scores_short["Infer(p/sec)"] * num_gpt2_tokens_short
        )

    except (RuntimeError, ValueError, IndexError) as e:
        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}") from e

    return dict(
        test_speed=gpt2_tokens_per_second, test_speed_short=gpt2_tokens_per_second_short
    )
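

A minimal usage sketch of the public entry point is shown below. It assumes you already hold a loaded model wrapper (a VLLMModel, LiteLLMModel or HuggingFaceEncoderModel) and a BenchmarkConfig; their construction is elided here because their full signatures are not part of this module, so the placeholders are illustrative only.

    from euroeval.speed_benchmark import benchmark_speed

    # `model` must be a loaded BenchmarkModule subclass instance, and `benchmark_config`
    # a BenchmarkConfig exposing the `num_iterations` and `progress_bar` fields used above.
    model = ...             # placeholder: obtained from EuroEval's model-loading machinery
    benchmark_config = ...  # placeholder: built by EuroEval's configuration layer

    scores = benchmark_speed(model=model, benchmark_config=benchmark_config)
    for idx, itr_scores in enumerate(scores):
        print(idx, itr_scores["test_speed"], itr_scores["test_speed_short"])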