euroeval.string_utils¶

[docs] module euroeval.string_utils
"""Utility functions related to string manipulation or structuring."""import collections.abc as cimport loggingimport reimport typing as timport demjson3import numpy as npfrom .exceptions importInvalidBenchmark,InvalidModelfrom .logging_utils importlogift.TYPE_CHECKING:from .data_models importModelIdComponentsdef scramble(text:str)->str:[docs]
    """Scramble a string in a bijective manner.    Args:        text:            The string to scramble.    Returns:        The scrambled string.    """rng=np.random.default_rng(seed=4242)permutation=rng.permutation(x=len(text))scrambled="".join(text[i]foriinpermutation)returnscrambleddef unscramble(scrambled_text:str)->str:[docs]
    """Unscramble a string in a bijective manner.    Args:        scrambled_text:            The scrambled string to unscramble.    Returns:        The unscrambled string.    """rng=np.random.default_rng(seed=4242)permutation=rng.permutation(x=len(scrambled_text))inverse_permutation=np.argsort(permutation)unscrambled="".join(scrambled_text[i]foriininverse_permutation)returnunscrambleddef extract_json_dict_from_string(s:str)->dict|None:[docs]
    """Extract a JSON dictionary from a string.    Args:        s:            The string to extract the JSON dictionary from.    Returns:        The extracted JSON dictionary, or None if no JSON dictionary could be found.    """json_regex=r"\{[^{}]*?\}"if(json_match:=re.search(pattern=json_regex,string=s,flags=re.DOTALL))isNone:log("The model output does not contain any JSON dictionary, so cannot parse "f"it. Skipping. Here is the output: {s!r}",level=logging.DEBUG,)returnNonejson_string=json_match.group()try:json_output=demjson3.decode(txt=json_string)exceptdemjson3.JSONDecodeError:log("The model output is not valid JSON, so cannot parse it. Skipping. "f"Here is the output: {json_string!r}",level=logging.DEBUG,)returnNoneifnotisinstance(json_output,dict):log("The model output is not a JSON dictionary, so cannot parse "f"it. Skipping. Here is the output: {json_string!r}",level=logging.DEBUG,)returnNoneelifnotall(isinstance(key,str)forkeyinjson_output.keys()):log("The model output is not a JSON dictionary with string keys, ""so cannot parse it. Skipping. Here is the output: "f"{json_string!r}",level=logging.DEBUG,)returnNonereturnjson_outputdef extract_multiple_choice_labels([docs]
def extract_multiple_choice_labels(
    prompt: str, candidate_labels: c.Sequence[str]
) -> c.Sequence[str]:
    """Extract multiple choice labels from a prompt.

    A candidate label counts as present when it appears in the prompt at a word
    boundary, immediately followed by a dot and a space (e.g., 'a. '). Matching is
    case-insensitive.

    Args:
        prompt:
            The prompt to extract the labels from.
        candidate_labels:
            The candidate labels to look for in the prompt.

    Returns:
        The extracted labels, in the same order as in `candidate_labels`.

    Raises:
        InvalidBenchmark:
            If none of the candidate labels could be found in the prompt.
    """
    # The label is escaped so that regex metacharacters in it (e.g. '+', '(') are
    # matched literally rather than being interpreted as regex syntax.
    sample_candidate_labels = [
        candidate_label
        for candidate_label in candidate_labels
        if re.search(
            pattern=rf"\b{re.escape(candidate_label)}\. ",
            string=prompt,
            flags=re.IGNORECASE,
        )
        is not None
    ]

    if not sample_candidate_labels:
        raise InvalidBenchmark(
            "Could not extract any candidate labels from the prompt. Please ensure "
            "that the candidate labels are present in the prompt, each followed by a "
            "dot and a space (e.g., 'a. '). The candidate labels are: "
            f"{', '.join(candidate_labels)}. Here is the prompt: {prompt!r}"
        )
    return sample_candidate_labels
    """Split a model ID into its components.    Args:        model_id:            The model ID to split.    Returns:        The split model ID.    Raises:        If the model ID is not valid.    """# Importing here to avoid circular importsfrom .data_models importModelIdComponents# Attempt to extract the model ID, revision, and param using regexmodel_id_match=re.match(pattern=r"^[^@#]+",string=model_id)revision_match=re.search(pattern=r"@([^@#]+)",string=model_id)param_match=re.search(pattern=r"#([^@#]+)",string=model_id)# If we cannot extract the model ID, raise an errorifmodel_id_matchisNone:raiseInvalidModel(f"The model ID {model_id!r} is not valid.")model_id=model_id_match.group()# Extract the revision and param and return the resultrevision=revision_match.group(1)ifrevision_matchisnotNoneelse"main"param=param_match.group(1)ifparam_matchisnotNoneelseNonereturnModelIdComponents(model_id=model_id,revision=revision,param=param)