euroeval.data_loading

"""Functions related to the loading of the data."""

import logging
import sys
import time

from datasets import Dataset, DatasetDict, load_dataset
from datasets.exceptions import DatasetsError
from huggingface_hub.errors import HfHubHTTPError
from numpy.random import Generator
from requests import ReadTimeout

from .data_models import BenchmarkConfig, DatasetConfig
from .exceptions import HuggingFaceHubDown, InvalidBenchmark
from .utils import unscramble

logger = logging.getLogger("euroeval")


def load_data(
    rng: Generator, dataset_config: "DatasetConfig", benchmark_config: "BenchmarkConfig"
) -> list[DatasetDict]:
    """Load the raw bootstrapped datasets.

    Args:
        rng:
            The random number generator to use.
        dataset_config:
            The configuration for the dataset.
        benchmark_config:
            The configuration for the benchmark.

    Returns:
        A list of bootstrapped datasets, one for each iteration.

    Raises:
        InvalidBenchmark:
            If the dataset cannot be loaded.
        HuggingFaceHubDown:
            If the Hugging Face Hub is down.
    """
    num_attempts = 5
    for _ in range(num_attempts):
        try:
            dataset = load_dataset(
                path=dataset_config.huggingface_id,
                cache_dir=benchmark_config.cache_dir,
                token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
            )
            break
        except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
            logger.warning(
                f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
            )
            time.sleep(1)
            continue
        except HfHubHTTPError:
            raise HuggingFaceHubDown()
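    # The for-loop's else branch only runs if no attempt succeeded (i.e. no break was hit)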
    else:
        raise InvalidBenchmark(
            f"Failed to load dataset {dataset_config.huggingface_id!r} after "
            f"{num_attempts} attempts."
        )

    assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]

    dataset = DatasetDict({key: dataset[key] for key in ["train", "val", "test"]})

    if not benchmark_config.evaluate_test_split:
        dataset["test"] = dataset["val"]

    # Remove empty examples from the datasets
    for text_feature in ["tokens", "text"]:
        if text_feature in dataset["train"].features:
            dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)

    # If we are testing then truncate the test set
    if hasattr(sys, "_called_from_test"):
        dataset["test"] = dataset["test"].select(range(1))

    # Bootstrap the splits
    bootstrapped_splits: dict[str, list[Dataset]] = dict()
    for split in ["train", "val", "test"]:
        bootstrap_indices = rng.integers(
            0,
            len(dataset[split]),
            size=(benchmark_config.num_iterations, len(dataset[split])),
        )
        bootstrapped_splits[split] = [
            dataset[split].select(bootstrap_indices[idx])
            for idx in range(benchmark_config.num_iterations)
        ]

    datasets = [
        DatasetDict(
            {
                split: bootstrapped_splits[split][idx]
                for split in ["train", "val", "test"]
            }
        )
        for idx in range(benchmark_config.num_iterations)
    ]
    return datasets
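
A minimal usage sketch (not part of the module) showing how load_data might be called. The SimpleNamespace stand-ins, the placeholder dataset id and the cache directory are assumptions for illustration only; in practice the DatasetConfig and BenchmarkConfig instances are constructed by EuroEval itself.

from types import SimpleNamespace

import numpy as np

from euroeval.data_loading import load_data

rng = np.random.default_rng(seed=4242)

# Hypothetical stand-ins: only the attributes that load_data reads are set here.
dataset_config = SimpleNamespace(huggingface_id="your-org/your-dataset")
benchmark_config = SimpleNamespace(
    cache_dir=".euroeval_cache",
    evaluate_test_split=False,  # evaluate on the validation split instead of the test split
    num_iterations=10,
)

datasets = load_data(
    rng=rng,
    dataset_config=dataset_config,  # type: ignore[arg-type]
    benchmark_config=benchmark_config,  # type: ignore[arg-type]
)
assert len(datasets) == benchmark_config.num_iterations

The returned list contains one bootstrapped DatasetDict (with "train", "val" and "test" splits) per iteration, each resampled with replacement from the original splits.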