"""ModelCache class for caching model outputs."""
import collections.abc as c
import hashlib
import json
import logging
import sys
from collections import defaultdict
from copy import deepcopy
from dataclasses import asdict
from pathlib import Path
from datasets import Dataset
from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
from .data_models import (
GenerativeModelOutput,
HashableDict,
SingleGenerativeModelOutput,
)
from .logging_utils import get_pbar, log, log_once
class ModelCache:
    """A cache for model outputs.

    Attributes:
        model_cache_dir:
            The directory to store the cache in.
        cache_path:
            The path to the cache file.
        cache:
            The model output cache.
        max_generated_tokens:
            The maximum number of tokens to generate for each example.
        progress_bar:
            Whether to show a progress bar when caching model outputs.
        store_metadata:
            Whether to store metadata for the model outputs.
        indent_json_when_saving:
            Whether to indent the JSON when saving the cache.
    """

    def __init__(
        self,
        model_cache_dir: "Path",
        cache_name: str,
        max_generated_tokens: int,
        progress_bar: bool,
        store_metadata: bool,
        indent_json_when_saving: bool,
    ) -> None:
        """Initialise the model output cache.

        Args:
            model_cache_dir:
                The directory to store the cache in.
            cache_name:
                The name of the cache file.
            max_generated_tokens:
                The maximum number of tokens to generate for each example.
            progress_bar:
                Whether to show a progress bar when caching model outputs.
            store_metadata:
                Whether to store metadata for the model outputs.
            indent_json_when_saving:
                Whether to indent the JSON when saving the cache.
        """
        self.model_cache_dir = model_cache_dir
        self.model_cache_dir.mkdir(parents=True, exist_ok=True)
        # Slashes in the cache name would be interpreted as directories, so we
        # replace them
        self.cache_path = self.model_cache_dir / cache_name.replace("/", "--")
        self.max_generated_tokens = max_generated_tokens
        self.progress_bar = progress_bar
        self.store_metadata = store_metadata
        self.indent_json_when_saving = indent_json_when_saving
        self.cache: dict[str, SingleGenerativeModelOutput] = dict()

    def _write_empty_cache_file(self) -> None:
        """(Re-)initialise the on-disk cache as an empty JSON object."""
        self.cache_path.parent.mkdir(exist_ok=True, parents=True)
        with self.cache_path.open("w") as f:
            json.dump(
                dict(),
                f,
                indent=2 if self.indent_json_when_saving else None,
                ensure_ascii=False,
            )

    def load(self) -> None:
        """Load the model output cache from disk.

        If the cache file does not exist, or exists but cannot be parsed as
        JSON, it is (re-)initialised as an empty cache.
        """
        if not self.cache_path.exists():
            self._write_empty_cache_file()
        try:
            with self.cache_path.open() as f:
                json_cache = json.load(f)
        except json.JSONDecodeError:
            log(
                f"Failed to load the cache from {self.cache_path}. The cache will be "
                f"re-initialised.",
                level=logging.WARNING,
            )
            json_cache = dict()
            self._write_empty_cache_file()

        # Re-assemble the stored flat dicts into model output objects. The three
        # known keys are extracted explicitly; all remaining keys are treated as
        # metadata.
        cache: dict[str, SingleGenerativeModelOutput] = dict()
        for key, value_dict in json_cache.items():
            sequence = value_dict.pop("sequence", None)
            predicted_label = value_dict.pop("predicted_label", None)
            scores = value_dict.pop("scores", None)
            cache[key] = SingleGenerativeModelOutput(
                sequence=sequence,
                predicted_label=predicted_label,
                scores=scores,
                metadata=HashableDict(value_dict),
            )
        self.cache = cache

    def save(self) -> None:
        """Save the model output cache to disk."""
        # Unpack the metadata into the top level of each entry, to get a flat
        # dict to dump
        dumpable_cache: dict[str, dict] = dict()
        for key, value in self.cache.items():
            value_dict = asdict(value)
            metadata = value_dict.pop("metadata", dict())
            if metadata is None:
                metadata = dict()
            value_dict |= metadata
            # Put the index first, purely for readability of the JSON file
            if "index" in metadata:
                value_dict = {"index": metadata.pop("index")} | value_dict
            dumpable_cache[key] = value_dict
        try:
            self.cache_path.parent.mkdir(exist_ok=True, parents=True)
            self.cache_path.write_text(
                json.dumps(
                    dumpable_cache,
                    indent=2 if self.indent_json_when_saving else None,
                    ensure_ascii=False,
                )
            )
        except KeyError:
            # BUG FIX: this previously logged "Failed to load the cache from
            # ...", which was misleading during a save
            log(
                f"Failed to save the cache to {self.cache_path}. The cache will be "
                f"re-initialised.",
                level=logging.WARNING,
            )
            self.cache = dict()
            self._write_empty_cache_file()

    def _hash_key(self, key: str | c.Sequence[dict[str, str]]) -> str:
        """Hash the key to use as an index in the cache.

        Args:
            key:
                The key to hash.

        Returns:
            The hashed key.
        """
        # The key may be an unhashable list of message dicts, so we hash its
        # string representation
        return hashlib.md5(string=str(key).encode()).hexdigest()

    def __getitem__(
        self, key: str | c.Sequence[dict[str, str]]
    ) -> SingleGenerativeModelOutput:
        """Get an item from the cache.

        Args:
            key:
                The key to use to index the cache.

        Returns:
            The model output.

        Raises:
            KeyError:
                If the key is not in the cache.
        """
        hashed_key = self._hash_key(key=key)
        return self.cache[hashed_key]

    def __setitem__(
        self, key: str | c.Sequence[dict[str, str]], value: SingleGenerativeModelOutput
    ) -> None:
        """Set an item in the cache.

        Args:
            key:
                The key to use to index the cache.
            value:
                The value to set in the cache.
        """
        hashed_key = self._hash_key(key=key)
        self.cache[hashed_key] = value

    def remove(self) -> None:
        """Remove the cache from memory and delete it from disk."""
        self.cache_path.unlink()
        del self.cache

    def __contains__(self, key: str | c.Sequence[dict[str, str]]) -> bool:
        """Check if a key is in the cache.

        Args:
            key:
                The key to check.

        Returns:
            Whether the key is in the cache.
        """
        hashed_key = self._hash_key(key=key)
        return hashed_key in self.cache

    def add_to_cache(
        self, model_inputs: dict, model_output: GenerativeModelOutput
    ) -> None:
        """Add the model input/output to the cache.

        Args:
            model_inputs:
                The model inputs, keyed by column name. Must contain either a
                "messages" or a "text" column; any other columns are treated as
                metadata.
            model_output:
                The model output.
        """
        input_column = "messages" if "messages" in model_inputs else "text"
        if self.store_metadata:
            metadata = deepcopy(model_inputs)
            # BUG FIX: this previously popped the *non*-input column
            # (`"messages" if "messages" != input_column else "text"`), which
            # left the inputs themselves inside the metadata and duplicated them
            # in the cache. We remove both potential input columns so that only
            # genuine metadata columns remain.
            metadata.pop("messages", None)
            metadata.pop("text", None)
        model_inputs = model_inputs[input_column]

        # Double check that the number of inputs and outputs match
        if not len(model_inputs) == len(model_output.sequences):
            log(
                f"Number of model inputs ({len(model_inputs)}) does not match the "
                f"number of model outputs ({len(model_output.sequences)}). We will not "
                f"cache the model outputs.",
                level=logging.WARNING,
            )
            return

        # Store the generated sequences in the cache, one by one
        with get_pbar(
            iterable=model_inputs,
            desc="Caching model outputs",
            disable=hasattr(sys, "_called_from_test") or not self.progress_bar,
        ) as pbar:
            for sample_idx, model_input in enumerate(pbar):
                # Extract the scores from the model output, to be cached. We only
                # store the scores if the generated sequence is short enough to
                # be a classification task, to save space.
                if (
                    model_output.scores is not None
                    and self.max_generated_tokens
                    <= NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
                ):
                    scores = model_output.scores[sample_idx]
                else:
                    if model_output.scores is not None:
                        log_once(
                            "The generated sequence is longer than the maximum "
                            "length for classification. Not caching the scores.",
                            level=logging.DEBUG,
                        )
                    scores = None

                # Pick out this sample's slice of each metadata column
                if self.store_metadata:
                    single_metadata = HashableDict(
                        {
                            metadata_column: metadata_values[sample_idx]
                            for metadata_column, metadata_values in metadata.items()
                        }
                    )
                else:
                    single_metadata = None

                self[model_input] = SingleGenerativeModelOutput(
                    sequence=model_output.sequences[sample_idx],
                    predicted_label=(
                        model_output.predicted_labels[sample_idx]
                        if model_output.predicted_labels is not None
                        else None
                    ),
                    scores=scores,
                    metadata=single_metadata,
                )
def split_dataset_into_cached_and_non_cached(
    dataset: "Dataset", cache: ModelCache
) -> tuple["Dataset", "Dataset"]:
    """Split a dataset into a cached and non-cached part.

    Args:
        dataset:
            The dataset to split.
        cache:
            The model output cache.

    Returns:
        The cached and non-cached parts of the dataset.
    """
    # Get the sample indices of the non-cached examples, which are unique with
    # respect to the input column. Uniqueness is tracked via the string
    # representation of the inputs, matching how the cache hashes its keys; this
    # keeps the duplicate check O(1) per sample (the inputs themselves may be
    # unhashable lists of message dicts), instead of the previous O(n) list
    # membership test.
    input_column = "messages" if "messages" in dataset.column_names else "text"
    seen_texts: set[str] = set()
    unique_non_cached_ids: set[int] = set()
    for idx, dataset_text in enumerate(dataset[input_column]):
        if dataset_text not in cache and str(dataset_text) not in seen_texts:
            unique_non_cached_ids.add(idx)
            seen_texts.add(str(dataset_text))

    # The cached examples are the ones that are not in the non-cached examples.
    # This means that if the dataset has duplicates, only a single copy of the
    # duplicate will be put in the non-cached part, and the rest in the cached
    # part. The indices are sorted so that the split is deterministic, as set
    # iteration order is arbitrary.
    cached_ids = set(range(len(dataset))) - unique_non_cached_ids
    cached = dataset.select(sorted(cached_ids))
    non_cached = dataset.select(sorted(unique_non_cached_ids))
    assert isinstance(cached, Dataset), (
        f"Expected the cached dataset to be a Dataset, but got {type(cached)}"
    )
    assert isinstance(non_cached, Dataset), (
        f"Expected the non-cached dataset to be a Dataset, but got {type(non_cached)}"
    )
    return cached, non_cached
def load_cached_model_outputs(
    cached_dataset: "Dataset", cache: ModelCache
) -> GenerativeModelOutput:
    """Load the cached model outputs.

    Args:
        cached_dataset:
            The dataset containing the cached examples.
        cache:
            The model output cache.

    Returns:
        The model output containing the cached sequences.
    """
    input_column = "messages" if "messages" in cached_dataset.column_names else "text"
    cached_model_outputs: c.Sequence[SingleGenerativeModelOutput] = [
        cache[prompt] for prompt in cached_dataset[input_column]
    ]
    cached_sequences = [model_output.sequence for model_output in cached_model_outputs]

    # If there are no cached outputs, or the outputs were cached without scores
    # (we assume the first output is representative), return the sequences alone.
    # The empty-dataset guard fixes a previous IndexError on
    # `cached_model_outputs[0]`.
    if not cached_model_outputs or cached_model_outputs[0].scores is None:
        return GenerativeModelOutput(sequences=cached_sequences)

    cached_scores = [model_output.scores or [] for model_output in cached_model_outputs]
    return GenerativeModelOutput(sequences=cached_sequences, scores=cached_scores)
def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
    """Create cache directory for a model.

    Args:
        cache_dir:
            The cache directory.
        model_id:
            The model ID.

    Returns:
        The path to the cache directory.
    """
    # Local models use their own directory directly as the cache directory
    if Path(model_id).is_dir():
        log_once(
            f"Since the model {model_id!r} is a local model, we will use the model "
            "directory directly as the model cache directory.",
            level=logging.DEBUG,
        )
        return model_id

    # Hub models get a dedicated directory under `cache_dir`, derived from the
    # model ID with slashes replaced, as they would otherwise be interpreted as
    # subdirectories
    sanitised_model_id = model_id.replace("/", "--")
    model_cache_dir = (Path(cache_dir) / "model_cache" / sanitised_model_id).as_posix()
    log_once(
        f"Using the model cache directory {model_cache_dir!r} for the model "
        f"{model_id!r}.",
        level=logging.DEBUG,
    )
    return model_cache_dir