euroeval.tokenisation_utils

[docs] module euroeval.tokenisation_utils
"""Utility functions related to tokenisation."""import collections.abc as cimport loggingimport reimport typing as timport torchfrom .constants importBOS_TOKENS,EOS_TOKENS,PAD_TOKENSfrom .enums importGenerativeTypefrom .exceptions importInvalidModelfrom .logging_utils importlog,log_oncefrom .types importTokenisertry:from transformers.tokenization_mistral_common importMistralCommonTokenizerexceptImportError:from transformers.tokenization_mistral_common import(MistralCommonBackendasMistralCommonTokenizer,)ift.TYPE_CHECKING:from transformers.tokenization_utils_base importPreTrainedTokenizerBasefrom .data_models importDatasetConfig,ModelConfig[docs]
def _as_token_id_list(input_ids: "c.Sequence[int] | torch.Tensor") -> list:
    """Normalise the `input_ids` of a tokeniser call to a flat Python list.

    Args:
        input_ids:
            The token IDs, either as a sequence or as a tensor of any shape.

    Returns:
        A flat list with the token IDs. Tensor inputs are flattened, so both a
        `(1, n)` and an `(n,)` tensor (including `n == 1`) give a list of `n` IDs.
    """
    if isinstance(input_ids, torch.Tensor):
        return input_ids.flatten().tolist()
    return list(input_ids)


def get_special_token_metadata(tokeniser: "PreTrainedTokenizerBase") -> dict:
    """Get the special token metadata for a tokeniser.

    Args:
        tokeniser:
            The tokeniser.

    Returns:
        The special token metadata, as a dictionary with the keys `cls_token_id`,
        `cls_token`, `sep_token`, `has_cls_token` and `has_sep_token`. The `has_*`
        flags are only True if the corresponding token ID appears when tokenising a
        test string.
    """
    # Create some test input IDs, to check if the tokeniser is adding special tokens
    test_input_ids = tokeniser("Test").input_ids

    # Extract the CLS token IDs from the tokeniser, if it's using them. We fall back
    # to the BOS token if the CLS token is not being added.
    has_cls_token = True
    if tokeniser.cls_token_id in test_input_ids:
        cls_token_id = tokeniser.cls_token_id
        cls_token = tokeniser.cls_token
    elif tokeniser.bos_token_id in test_input_ids:
        cls_token_id = tokeniser.bos_token_id
        cls_token = tokeniser.bos_token
    elif tokeniser.cls_token is not None:
        cls_token_id = tokeniser.cls_token_id
        cls_token = tokeniser.cls_token
        has_cls_token = False
    else:
        cls_token_id = tokeniser.bos_token_id
        cls_token = tokeniser.bos_token
        has_cls_token = False

    # Extract the SEP token IDs from the tokeniser, if it's using them. We fall back
    # to the EOS token if the SEP token is not being added.
    has_sep_token = True
    if tokeniser.sep_token_id in test_input_ids:
        sep_token = tokeniser.sep_token
    elif tokeniser.eos_token_id in test_input_ids:
        sep_token = tokeniser.eos_token
    elif tokeniser.sep_token is not None:
        sep_token = tokeniser.sep_token
        has_sep_token = False
    else:
        sep_token = tokeniser.eos_token
        has_sep_token = False

    return dict(
        cls_token_id=cls_token_id,
        cls_token=cls_token,
        sep_token=sep_token,
        has_cls_token=has_cls_token,
        has_sep_token=has_sep_token,
    )


def should_prompts_be_stripped(
    labels_to_be_generated: c.Sequence[str], tokeniser: Tokeniser
) -> bool:
    """Determine if we should strip the prompts for few-shot evaluation.

    This is the case if the tokeniser needs to include the space as part of the label
    token. The strategy is thus to tokenise a label with a preceding colon (as in the
    prompts), i.e., ": positive", and check if the tokenisation starts with the tokens
    of ": ". If this is the case, then we should not strip the prompts, since the
    tokeniser produces the whitespace token separately.

    Args:
        labels_to_be_generated:
            The labels that are to be generated.
        tokeniser:
            The tokeniser used to tokenise the labels.

    Returns:
        Whether we should strip the prompts. This is False as soon as a single label
        is tokenised with the tokens of ": " as a prefix, and True otherwise
        (including when there are no labels).
    """
    if not labels_to_be_generated:
        return True

    # The tokenisation of the colon prefix does not depend on the label, so we only
    # compute it once
    colon_tokens = _as_token_id_list(
        tokeniser(": ", add_special_tokens=False).input_ids
    )

    for label in labels_to_be_generated:
        label_tokens = _as_token_id_list(
            tokeniser(": " + label, add_special_tokens=False).input_ids
        )
        if label_tokens[: len(colon_tokens)] == colon_tokens:
            # One label is enough to decide, so there is no need to check the rest
            return False
    return True


def should_prefix_space_be_added_to_labels(
    labels_to_be_generated: c.Sequence[str], tokeniser: Tokeniser
) -> bool:
    """Determine if we should add a prefix space to the labels.

    This is the case if the prompts are stripped and the tokeniser doesn't
    automatically add prefix whitespaces to the labels.

    Args:
        labels_to_be_generated:
            The labels that are to be generated.
        tokeniser:
            The tokeniser used to tokenise the labels.

    Returns:
        Whether we should add a prefix space to the labels.
    """
    if not should_prompts_be_stripped(
        labels_to_be_generated=labels_to_be_generated, tokeniser=tokeniser
    ):
        return False

    # The first character of the first token of a single space is what the tokeniser
    # uses to mark whitespace
    whitespace_token_ids = _as_token_id_list(
        tokeniser(" ", add_special_tokens=False).input_ids
    )
    whitespace_token = tokeniser.convert_ids_to_tokens(ids=whitespace_token_ids[0])[0]

    for label in labels_to_be_generated:
        label_tokens = _as_token_id_list(
            tokeniser(label, add_special_tokens=False).input_ids
        )
        first_label_token: int = int(label_tokens[0])
        first_character_of_label = tokeniser.convert_ids_to_tokens(first_label_token)[0]

        # If the tokeniser already prefixes a label with whitespace then we should
        # not add another one
        if first_character_of_label == whitespace_token:
            return False
    return True
def get_bos_token(tokeniser: Tokeniser) -> tuple[str, int] | tuple[None, None]:
    """Get the beginning-of-sequence token from a tokeniser.

    The token configured on the tokeniser is preferred. If it is not set then the
    vocabulary is searched for the first known BOS token candidate.

    Args:
        tokeniser:
            The tokeniser.

    Returns:
        A pair (token, token_id) representing the beginning-of-sequence token and its
        token ID, or (None, None) if no BOS token is found.
    """
    # Use the configured BOS token if the tokeniser has a proper one
    if isinstance(tokeniser.bos_token, str) and isinstance(
        tokeniser.bos_token_id, int
    ):
        return tokeniser.bos_token, tokeniser.bos_token_id

    # Otherwise, pick the first candidate which is present in the vocabulary
    vocab: dict[str, int] = tokeniser.get_vocab()
    detected_token = next(
        (candidate for candidate in BOS_TOKENS if candidate in vocab), None
    )

    if detected_token is None:
        log_once(
            "The model does not have a beginning-of-sequence token. Please ensure that "
            "this has been set in the tokeniser's configuration. Using no BOS token."
            " This may lead to unexpected behavior in the model.",
            level=logging.WARNING,
        )
        return None, None

    detected_token_id = vocab[detected_token]
    log_once(
        f"Beginning-of-sequence token was not set, but detected it as "
        f"{detected_token!r} with ID {detected_token_id}.",
        level=logging.DEBUG,
    )
    return detected_token, detected_token_id
def get_eos_token(tokeniser: Tokeniser) -> tuple[str, int] | tuple[None, None]:
    """Get the end-of-sequence token from a tokeniser.

    The token configured on the tokeniser is preferred. If it is not set then the
    vocabulary is searched for the first known EOS token candidate.

    Args:
        tokeniser:
            The tokeniser.

    Returns:
        A pair (token, token_id) representing the end-of-sequence token and its token
        ID, or (None, None) if no EOS token is found.
    """
    # Use the configured EOS token if the tokeniser has a proper one
    if isinstance(tokeniser.eos_token, str) and isinstance(
        tokeniser.eos_token_id, int
    ):
        return tokeniser.eos_token, tokeniser.eos_token_id

    # Otherwise, pick the first candidate which is present in the vocabulary
    vocab: dict[str, int] = tokeniser.get_vocab()
    for candidate_eos_token in EOS_TOKENS:
        if candidate_eos_token in vocab:
            eos_token = candidate_eos_token
            eos_token_id = vocab[eos_token]
            break
    else:
        log_once(
            "The model does not have an end-of-sequence token. Please ensure that this "
            "has been set in the tokeniser's configuration. Using no EOS token. This "
            "may lead to unexpected behavior in the model.",
            level=logging.WARNING,
        )
        return None, None

    # A successful detection is purely informational, so it is logged at the debug
    # level, like the equivalent messages for the BOS and padding tokens. Only the
    # case above, where no token could be found, warrants a warning.
    log_once(
        f"End-of-sequence token was not set, but detected it as {eos_token!r} with "
        f"ID {eos_token_id}.",
        level=logging.DEBUG,
    )
    return eos_token, eos_token_id
def get_pad_token(tokeniser: Tokeniser) -> tuple[str, int] | tuple[None, None]:
    """Get the padding token from a tokeniser.

    The padding token configured on the tokeniser is preferred. Otherwise the BOS
    token, the EOS token and the known padding token candidates in the vocabulary are
    tried, in that order.

    Args:
        tokeniser:
            The tokeniser.

    Returns:
        A pair (token, token_id) representing the padding token and its token ID, or
        (None, None) if no padding token is found.
    """
    # If the tokeniser already has a padding token, return it
    if tokeniser.pad_token is not None and tokeniser.pad_token_id is not None:
        assert isinstance(tokeniser.pad_token, str), (
            "Expected tokeniser.pad_token to be a string, but got "
            f"{type(tokeniser.pad_token)}."
        )
        assert isinstance(tokeniser.pad_token_id, int), (
            "Expected tokeniser.pad_token_id to be an integer, but got "
            f"{type(tokeniser.pad_token_id)}."
        )
        return (tokeniser.pad_token, tokeniser.pad_token_id)

    # If the tokeniser has a BOS token, use it as the padding token
    if tokeniser.bos_token is not None and tokeniser.bos_token_id is not None:
        assert isinstance(tokeniser.bos_token, str), (
            "Expected tokeniser.bos_token to be a string, but got "
            f"{type(tokeniser.bos_token)}."
        )
        assert isinstance(tokeniser.bos_token_id, int), (
            "Expected tokeniser.bos_token_id to be an integer, but got "
            f"{type(tokeniser.bos_token_id)}."
        )
        pad_token = tokeniser.bos_token
        pad_token_id = tokeniser.bos_token_id

    # If the tokeniser has an EOS token, use it as the padding token
    elif tokeniser.eos_token is not None and tokeniser.eos_token_id is not None:
        assert isinstance(tokeniser.eos_token, str), (
            "Expected tokeniser.eos_token to be a string, but got "
            f"{type(tokeniser.eos_token)}."
        )
        assert isinstance(tokeniser.eos_token_id, int), (
            "Expected tokeniser.eos_token_id to be an integer, but got "
            f"{type(tokeniser.eos_token_id)}."
        )
        pad_token = tokeniser.eos_token
        pad_token_id = tokeniser.eos_token_id

    # Otherwise, try to find a candidate padding token in the vocabulary
    else:
        # Fetch the vocabulary a single time rather than once per candidate
        vocab: dict[str, int] = tokeniser.get_vocab()
        for candidate in PAD_TOKENS:
            if candidate in vocab:
                pad_token = candidate
                pad_token_id = vocab[candidate]
                break
        else:
            log_once(
                "Could not identify a padding token for the model. Please ensure that "
                "this has been set in the tokeniser's configuration. Using no padding "
                "token. This may lead to unexpected behavior in the model.",
                level=logging.WARNING,
            )
            return None, None

    log_once(
        f"Padding token was not set, but detected it as {pad_token!r} with ID "
        f"{pad_token_id}.",
        level=logging.DEBUG,
    )
    return pad_token, pad_token_id


def get_end_of_chat_token_ids(
    tokeniser: Tokeniser, generative_type: GenerativeType | None
) -> c.Sequence[int] | None:
    """Get the end token ID for chat models.

    This is only relevant for tokenisers with a chat template. The token IDs are
    found by applying the chat template to a user message with the content "X", and
    taking all the token IDs that come after the first token containing "X".

    Args:
        tokeniser:
            The tokeniser.
        generative_type:
            The generative type, or None if not available.

    Returns:
        The token IDs used to end chats, or None if the tokeniser does not have a chat
        template or if no end-of-chat token could be found.

    Raises:
        InvalidModel:
            If applying the chat template fails for another reason than the chat
            template being missing.
    """
    if generative_type == GenerativeType.BASE:
        return None

    user_message: dict[str, str] = dict(role="user", content="X")
    try:
        token_ids = apply_chat_template(
            conversation=[user_message],
            tokeniser=tokeniser,
            tokenise=True,
            add_generation_prompt=False,
            enable_thinking=generative_type == GenerativeType.REASONING,
        )
    except InvalidModel as e:
        if "does not have a chat template" in str(e):
            return None
        # Re-raise the error unchanged, keeping the original traceback
        raise
    assert isinstance(token_ids, list)

    # Locate the first token containing the content of the user message
    for idx, token in enumerate(tokeniser.convert_ids_to_tokens(token_ids)):
        if "X" in token:
            x_token_index = idx
            break
    else:
        log("Could not locate the end-of-chat token for the model.", level=logging.DEBUG)
        return None

    # Everything following the content of the message is what ends the chat turn
    end_of_chat_tokens = token_ids[x_token_index + 1 :]
    if len(end_of_chat_tokens) == 0:
        log("Could not locate the end-of-chat token for the model.", level=logging.DEBUG)
        return None

    log_once(
        f"Detected end-of-chat token IDs as {end_of_chat_tokens}, corresponding to "
        f"tokens {tokeniser.convert_ids_to_tokens(end_of_chat_tokens)}.",
        level=logging.DEBUG,
    )
    return end_of_chat_tokens


def get_first_label_token_mapping(
    dataset_config: "DatasetConfig",
    model_config: "ModelConfig",
    tokeniser: Tokeniser | None,
    generative_type: "GenerativeType | None",
    log_metadata: bool,
) -> dict[str, str] | bool:
    """Check if the model should output scores.

    Args:
        dataset_config:
            The dataset configuration.
        model_config:
            The model configuration.
        tokeniser:
            The tokeniser, or None if not available.
        generative_type:
            The generative type, or None if not available.
        log_metadata:
            Whether to log metadata.

    Returns:
        A mapping from labels to the first token in each label, or alternatively a
        Boolean value indicating whether the model should output scores (if the mapping
        is outputted then the model will always output scores).
    """
    # Logprobs require both a task that uses them and a dataset with labels
    if not (dataset_config.task.uses_logprobs and dataset_config.labels):
        if log_metadata:
            log_once(
                "We will not use logprobs with the model, since the dataset does not "
                "have labels.",
                level=logging.DEBUG,
            )
        return False
    elif generative_type == GenerativeType.REASONING:
        if log_metadata:
            log_once(
                f"The model {model_config.model_id!r} is a reasoning model and "
                "thus does not support logprobs, so we do not enable it.",
                level=logging.DEBUG,
            )
        return False
    elif tokeniser is None:
        if log_metadata:
            log_once(
                f"We will use logprobs with the model {model_config.model_id!r} "
                "since the dataset supports it and no tokeniser is available.",
                level=logging.DEBUG,
            )
        return True

    local_labels = [
        dataset_config.prompt_label_mapping[label].strip()
        for label in dataset_config.labels
    ]

    # Tokenise some text containing each label, which we will use to extract the
    # first token of each label
    all_tokens: c.Sequence[c.Sequence[str]]
    if not has_chat_template(tokeniser=tokeniser):
        add_prefix_space = should_prefix_space_be_added_to_labels(
            labels_to_be_generated=local_labels, tokeniser=tokeniser
        )
        all_tokens = [
            [
                tokeniser.decode(token_id)
                for token_id in tokeniser.encode(
                    text=f" {label}" if add_prefix_space else label,
                    add_special_tokens=False,
                )
            ]
            for label in local_labels
        ]
    else:
        all_tokens = [
            tokeniser.convert_ids_to_tokens(
                ids=apply_chat_template(  # type: ignore[no-matching-overload]
                    conversation=[
                        dict(role="user", content=""),
                        dict(role="assistant", content=label),
                        # Adding extra user message as Mistral tokenisers require
                        # conversations to end with a user message
                        dict(role="user", content=""),
                    ],
                    tokeniser=tokeniser,
                    tokenise=True,
                    add_generation_prompt=True,
                    enable_thinking=generative_type == GenerativeType.REASONING,
                )
            )
            for label in local_labels
        ]

    # Lowercase the tokens and remove any leading and trailing characters that are
    # neither letters, digits nor spaces. The pattern is compiled once as it is
    # applied to every token.
    edge_pattern = re.compile(r"^[^a-zæøåüöä0-9 ]+|[^a-zæøåüöä0-9 ]+$")
    all_tokens = [
        [edge_pattern.sub("", token.lower()) for token in token_list]
        for token_list in all_tokens
    ]

    # Extract the first token of each label
    first_tokens: list[str] = []
    for token_list, label in zip(all_tokens, local_labels):
        matching_tokens = [
            tok for tok in token_list if tok and label.startswith(tok.strip())
        ]
        if not matching_tokens:
            if log_metadata:
                log_once(
                    f"No matching token found in token_list for label {label!r}, so "
                    "we will not use logprobs with the model.",
                    level=logging.DEBUG,
                )
            return False
        first_tokens.append(matching_tokens[0])

    # Build a mapping from labels to the first token in each label if the first
    # tokens are distinct
    if len(first_tokens) == len(set(first_tokens)):
        mapping = {
            label: first_token for label, first_token in zip(local_labels, first_tokens)
        }
        if log_metadata:
            log_once(
                "Using logprobs as evaluation strategy for the model, with the "
                f"following mapping from labels to their first token: {mapping}.",
                level=logging.DEBUG,
            )
        return mapping
    else:
        if log_metadata:
            log_once(
                "We will not use logprobs with the model since the first tokens of the "
                "labels are not distinct. The first tokens for the labels "
                f"{local_labels} are {first_tokens}",
                level=logging.DEBUG,
            )
        return False


def has_chat_template(tokeniser: Tokeniser) -> bool:
    """Check if a tokeniser has a chat template.

    Args:
        tokeniser:
            The tokeniser.

    Returns:
        Whether the tokeniser has a chat template. Mistral tokenisers are always
        treated as having one.
    """
    if isinstance(tokeniser, MistralCommonTokenizer):
        log_once(
            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
            "instruction tuned.",
            level=logging.DEBUG,
        )
        return True
    elif hasattr(tokeniser, "chat_template"):
        has_template = tokeniser.chat_template is not None
        if has_template:
            log_once(
                "The tokeniser has a chat template, so assuming that the model is "
                "instruction tuned.",
                level=logging.DEBUG,
            )
        return has_template
    else:
        log_once(
            "We cannot find a chat template for the tokeniser, so assuming that the "
            "model isn't instruction tuned.",
            level=logging.DEBUG,
        )
        return False


def apply_chat_template(
    conversation: list[dict[str, str]],
    tokeniser: Tokeniser,
    tokenise: bool,
    add_generation_prompt: bool,
    **extra_kwargs,
) -> str | list[int]:
    """Apply the chat template to a prompt.

    Args:
        conversation:
            The conversation to apply the chat template to. This is not modified.
        tokeniser:
            The tokeniser.
        tokenise:
            Whether to tokenise the resulting prompt, returning a list of token IDs
            instead of a string.
        add_generation_prompt:
            Whether to add a generation prompt at the end of the conversation. This is
            only relevant for regular Hugging Face tokenisers, as Mistral tokenisers
            always add a generation prompt.
        **extra_kwargs:
            Extra keyword arguments to pass to the tokeniser's `apply_chat_template`
            method. Only relevant for regular Hugging Face tokenisers.

    Returns:
        The prompt with the chat template applied, either as a string or a list of
        token IDs, depending on the value of `tokenise`.

    Raises:
        InvalidModel:
            If the tokeniser does not have a chat template.
    """
    # Ensure that the first user message is not empty, as this can cause issues with
    # Jinja2. We replace the message in a new list rather than overwriting it in
    # place, so that the conversation of the caller is left untouched.
    first_message = conversation[0]
    if not first_message["content"]:
        conversation = [{**first_message, "content": " "}, *conversation[1:]]

    if not has_chat_template(tokeniser=tokeniser):
        raise InvalidModel(
            "The tokeniser does not have a chat template, so cannot apply it."
        )
    elif isinstance(tokeniser, MistralCommonTokenizer):
        templated_prompt = tokeniser.apply_chat_template(
            conversation=conversation, tokenize=tokenise
        )
    else:
        templated_prompt = tokeniser.apply_chat_template(
            conversation=conversation,
            add_generation_prompt=add_generation_prompt,
            tokenize=tokenise,
            **extra_kwargs,
        )
    return templated_prompt  # type: ignore[bad-return]