euroeval.tokenization_utils

[docs] module euroeval.tokenization_utils
"""Utility functions related to tokenization."""import loggingimport reimport typing as timport torchfrom .constants importTASK_GROUPS_USING_LOGPROBSfrom .enums importGenerativeTypefrom .utils importlog_onceift.TYPE_CHECKING:from transformers.tokenization_utils importPreTrainedTokenizerfrom transformers.tokenization_utils_base importPreTrainedTokenizerBasefrom .data_models importDatasetConfig,ModelConfiglogger=logging.getLogger("euroeval")[docs]
def get_special_token_metadata(tokenizer:"PreTrainedTokenizerBase")->dict:    """Get the special token metadata for a tokenizer.    Args:        tokenizer:            The tokenizer.    Returns:        The special token metadata.    """# Create some test input IDs, to check if the tokenizer is adding special tokenstest_input_ids=tokenizer("Test").input_ids# Extract the CLS token IDs from the tokenizer, if it's using themhas_cls_token=Trueiftokenizer.cls_token_idintest_input_ids:cls_token_id=tokenizer.cls_token_idcls_token=tokenizer.cls_tokeneliftokenizer.bos_token_idintest_input_ids:cls_token_id=tokenizer.bos_token_idcls_token=tokenizer.bos_tokeneliftokenizer.cls_tokenisnotNone:cls_token_id=tokenizer.cls_token_idcls_token=tokenizer.cls_tokenhas_cls_token=Falseelse:cls_token_id=tokenizer.bos_token_idcls_token=tokenizer.bos_tokenhas_cls_token=False# Extract the SEP token IDs from the tokenizer, if it's using themhas_sep_token=Trueiftokenizer.sep_token_idintest_input_ids:sep_token=tokenizer.sep_tokeneliftokenizer.eos_token_idintest_input_ids:sep_token=tokenizer.eos_tokeneliftokenizer.sep_tokenisnotNone:sep_token=tokenizer.sep_tokenhas_sep_token=Falseelse:sep_token=tokenizer.eos_tokenhas_sep_token=Falsereturndict(cls_token_id=cls_token_id,cls_token=cls_token,sep_token=sep_token,has_cls_token=has_cls_token,has_sep_token=has_sep_token,)def should_prompts_be_stripped([docs]
labels_to_be_generated:list[str],tokenizer:"PreTrainedTokenizer")->bool:    """Determine if we should strip the prompts for few-shot evaluation.    This is the case if the tokenizer needs to include the space as part of the label    token. The strategy is thus to tokenize a label with a preceeding colon (as in the    prompts), i.e., ": positive", and check if the tokenization starts with the tokens    of ": ". If this is the case, then we should not strip the prompts, since the    tokenizer produces the whitespace token separately.    Args:        labels_to_be_generated:            The labels that are to be generated.        tokenizer:            The tokenizer used to tokenize the labels.    Returns:        Whether we should strip the prompts.    """strip_prompts=Trueforlabelinlabels_to_be_generated:colon_tokens=tokenizer(": ",add_special_tokens=False).input_idslabel_tokens=tokenizer(": "+label,add_special_tokens=False).input_idsifisinstance(colon_tokens,torch.Tensor):colon_tokens=list(colon_tokens.squeeze(0))ifisinstance(label_tokens,torch.Tensor):label_tokens=list(label_tokens.squeeze(0))label_tokens_start_with_colon_tokens=(label_tokens[:len(colon_tokens)]==colon_tokens)iflabel_tokens_start_with_colon_tokens:strip_prompts=Falsereturnstrip_promptsdef should_prefix_space_be_added_to_labels([docs]
labels_to_be_generated:list[str],tokenizer:"PreTrainedTokenizer")->bool:    """Determine if we should add a prefix space to the labels.    This is the case if the prompts are stripped and the tokenizer doesn't    automatically add prefix whitespaces to the labels.    Args:        labels_to_be_generated:            The labels that are to be generated.        tokenizer:            The tokenizer used to tokenize the labels.    Returns:        Whether we should add a prefix space to the labels.    """ifnotshould_prompts_be_stripped(labels_to_be_generated=labels_to_be_generated,tokenizer=tokenizer):returnFalsewhitespace_token=tokenizer.convert_ids_to_tokens(ids=tokenizer(" ",add_special_tokens=False).input_ids[0])[0]add_prefix_space=Trueforlabelinlabels_to_be_generated:label_tokens=tokenizer(label,add_special_tokens=False).input_idsifisinstance(label_tokens,torch.Tensor):label_tokens=list(label_tokens.squeeze(0))first_label_token:int=int(label_tokens[0])first_character_of_label=tokenizer.convert_ids_to_tokens(first_label_token)[0]has_prefix_space=first_character_of_label==whitespace_tokenifhas_prefix_space:add_prefix_space=Falsebreakreturnadd_prefix_spacedef get_bos_token([docs]
tokenizer:"PreTrainedTokenizer",)->tuple[str,int]|tuple[None,None]:    """Get the beginning-of-sequence token from a tokenizer.    Args:        tokenizer:            The tokenizer.    Returns:        A pair (token, token_id) representing the beginning-of-sequence token and its        token ID, or (None, None) if no BOS token is found.    """ifisinstance(tokenizer.bos_token,str)andisinstance(tokenizer.bos_token_id,int):returntokenizer.bos_token,tokenizer.bos_token_idvocab:dict[str,int]=tokenizer.get_vocab()candidate_bos_tokens=["<s>","<|begin_of_text|>","<|startoftext|>","[CLS]"]forcandidate_bos_tokenincandidate_bos_tokens:ifcandidate_bos_tokeninvocab:bos_token=candidate_bos_tokenbos_token_id=vocab[bos_token]breakelse:log_once("The model does not have a beginning-of-sequence token. Please ensure that ""this has been set in the tokenizer's configuration. Using no BOS token."" This may lead to unexpected behavior in the model.",level=logging.INFO,)returnNone,Nonelog_once(f"Beginning-of-sequence token was not set, but detected it as {bos_token!r} "f"with ID {bos_token_id}.",level=logging.DEBUG,)returnbos_token,bos_token_iddef get_eos_token([docs]
tokenizer:"PreTrainedTokenizer",)->tuple[str,int]|tuple[None,None]:    """Get the end-of-sequence token from a tokenizer.    Args:        tokenizer:            The tokenizer.    Returns:        A pair (token, token_id) representing the end-of-sequence token and its token        ID, or (None, None) if no EOS token is found.    """ifisinstance(tokenizer.eos_token,str)andisinstance(tokenizer.eos_token_id,int):returntokenizer.eos_token,tokenizer.eos_token_idvocab:dict[str,int]=tokenizer.get_vocab()candidate_eos_tokens=["</s>","<|end_of_text|>","<|endoftext|>","[SEP]"]forcandidate_eos_tokenincandidate_eos_tokens:ifcandidate_eos_tokeninvocab:eos_token=candidate_eos_tokeneos_token_id=vocab[eos_token]breakelse:log_once("The model does not have an end-of-sequence token. Please ensure that this ""has been set in the tokenizer's configuration. Using no EOS token. This ""may lead to unexpected behavior in the model.",level=logging.INFO,)returnNone,Nonelog_once(f"End-of-sequence token was not set, but detected it as {eos_token!r} with "f"ID {eos_token_id}.",level=logging.DEBUG,)returneos_token,eos_token_iddef get_pad_token([docs]
tokenizer:"PreTrainedTokenizer",)->tuple[str,int]|tuple[None,None]:    """Get the padding token from a tokenizer.    Args:        tokenizer:            The tokenizer.    Returns:        A pair (token, token_id) representing the padding token and its token ID, or        (None, None) if no padding token is found.    """# If the tokenizer already has a padding token, return itiftokenizer.pad_tokenisnotNoneandtokenizer.pad_token_idisnotNone:assertisinstance(tokenizer.pad_token,str),("Expected tokenizer.pad_token to be a string, but got "f"{type(tokenizer.pad_token)}.")assertisinstance(tokenizer.pad_token_id,int),("Expected tokenizer.pad_token_id to be an integer, but got "f"{type(tokenizer.pad_token_id)}.")return(tokenizer.pad_token,tokenizer.pad_token_id)# If the tokenizer has a BOS token, use it as the padding tokeniftokenizer.bos_tokenisnotNoneandtokenizer.bos_token_idisnotNone:assertisinstance(tokenizer.bos_token,str),("Expected tokenizer.bos_token to be a string, but got "f"{type(tokenizer.bos_token)}.")assertisinstance(tokenizer.bos_token_id,int),("Expected tokenizer.bos_token_id to be an integer, but got "f"{type(tokenizer.bos_token_id)}.")pad_token=tokenizer.bos_tokenpad_token_id=tokenizer.bos_token_id# If the tokenizer has an EOS token, use it as the padding tokeneliftokenizer.eos_tokenisnotNoneandtokenizer.eos_token_idisnotNone:assertisinstance(tokenizer.eos_token,str),("Expected tokenizer.eos_token to be a string, but got "f"{type(tokenizer.eos_token)}.")assertisinstance(tokenizer.eos_token_id,int),("Expected tokenizer.eos_token_id to be an integer, but got "f"{type(tokenizer.eos_token_id)}.")pad_token=tokenizer.eos_tokenpad_token_id=tokenizer.eos_token_id# Otherwise, try to find a candidate padding token in the vocabularyelse:pad_token_candidates=["<pad>","[pad]","<|endoftext|>","<｜end▁of▁sentence｜>","<|im_end|>",]pad_token_candidates.extend([c.upper()forcinpad_token_candidates])forcandidateinpad_token_candidates:ifcandidateintokenizer.get_vocab():pad_token=candidatepad_token_id=tokenizer.get_vocab()[candidate]breakelse:log_once("Could not identify a padding token for the model. Please ensure that ""this has been set in the tokenizer's configuration. Using no padding ""token. This may lead to unexpected behavior in the model.",level=logging.INFO,)returnNone,Nonelog_once(f"Padding token was not set, but detected it as {pad_token!r} with ID "f"{pad_token_id}.",level=logging.DEBUG,)returnpad_token,pad_token_id[docs]
def get_end_of_chat_token_ids(tokenizer:"PreTrainedTokenizer")->list[int]|None:    """Get the end token ID for chat models.    This is only relevant for tokenizers with a chat template.    Args:        tokenizer:            The tokenizer.    Returns:        The token IDs used to end chats, or None if the tokenizer does not have a chat        template.    Raises:        ValueError:            If the end-of-chat token could not be located.    """iftokenizer.chat_templateisNone:returnNoneuser_message:dict[str,str]=dict(role="user",content="X")token_ids:list[int]=tokenizer.apply_chat_template(conversation=[user_message])# type: ignore[assignment]foridx,tokeninenumerate(tokenizer.convert_ids_to_tokens(token_ids)):token_id=tokenizer.convert_tokens_to_ids(token)assertisinstance(token_id,int)token=tokenizer.decode([token_id])if"X"intoken:x_token_index=idxbreakelse:raiseValueError("Could not locate the end-of-chat token for the model.")end_of_chat_tokens=token_ids[x_token_index+1:]iflen(end_of_chat_tokens)==0:returnNonereturnend_of_chat_tokensdef get_first_label_token_mapping([docs]
dataset_config:"DatasetConfig",model_config:"ModelConfig",tokenizer:"PreTrainedTokenizer | None",generative_type:"GenerativeType | None",)->dict[str,str]|bool:    """Check if the model should output scores.    Args:        dataset_config:            The dataset configuration.        model_config:            The model configuration.        tokenizer:            The tokenizer, or None if not available.        generative_type:            The generative type, or None if not available.    Returns:        A mapping from labels to the first token in each label, or alternatively a        Boolean value indicating whether the model should output scores (if the mapping        is outputted then the model will always output scores).    """ifgenerative_type==GenerativeType.REASONING:log_once(f"The model {model_config.model_id!r} is a reasoning model and ""thus does not support logprobs, so we do not enable it.",level=logging.DEBUG,)returnFalse# If we do not have any tokenizer, then we cannot check if the model should output# scores and we just assume it should if the dataset supports itoutput_scores=dataset_config.task.task_groupinTASK_GROUPS_USING_LOGPROBSiftokenizerisNone:ifoutput_scores:log_once(f"We will use logprobs with the model {model_config.model_id!r} ""since the dataset supports it and no tokenizer is available.",level=logging.DEBUG,)else:log_once(f"We will not use logprobs with the model {model_config.model_id!r} ""since the dataset does not support it and no tokenizer is available.",level=logging.DEBUG,)returnoutput_scores# If there are labels associated with the dataset, and that the first token of each# label is distinct, then we can safely use the logprobsifoutput_scoresanddataset_config.labels:local_labels=[dataset_config.prompt_label_mapping[label].strip()forlabelindataset_config.labels]# Tokenize some text containing each label, which we will use to extract the# first token of each labelall_tokens:list[list[str]]iftokenizer.chat_templateisNone:add_prefix_space=should_prefix_space_be_added_to_labels(labels_to_be_generated=local_labels,tokenizer=tokenizer)all_tokens=[tokenizer.tokenize(text=f" {label}"ifadd_prefix_spaceelselabel)forlabelinlocal_labels]else:all_tokens=[tokenizer.convert_ids_to_tokens(ids=tokenizer.apply_chat_template(conversation=[dict(role="user",content=""),dict(role="assistant",content=label),],add_generation_prompt=True,tokenize=True,))forlabelinlocal_labels]# Remove any non-alphabetic characters from the tokensall_tokens=[[re.sub(pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",repl="",string=token.lower(),)fortokenintoken_list]fortoken_listinall_tokens]# Extract the first token of each labelfirst_tokens:list[str]=list()fortoken_list,labelinzip(all_tokens,local_labels):matching_tokens=[tokfortokintoken_listiftokandlabel.startswith(tok)]ifnotmatching_tokens:log_once(f"No matching token found in token_list for label '{label}', so ""we will not use logprobs with the model.",level=logging.DEBUG,)returnFalsefirst_tokens.append(matching_tokens[0])# Build a mapping from labels to the first token in each label if the first# tokens are distinctiflen(first_tokens)==len(set(first_tokens)):log_once("We will use logprobs with the model since the first tokens of the ""labels are distinct.",level=logging.DEBUG,)return{label:first_tokenforlabel,first_tokeninzip(local_labels,first_tokens)}else:log_once("We will not use logprobs with the model since the first tokens of the ""labels are not distinct. The first tokens for the labels "f"{local_labels} are {first_tokens}")returnFalse# Otherwise, we assume that the model should not output scores, to avoid potential# evaluation errors. This will force the label extraction to rely on word edit# distance instead of logprobs.log_once("We will not use logprobs with the model, since the dataset does not have ""labels.",level=logging.DEBUG,)returnFalse