euroeval.data_loading

"""Functions related to the loading of the data."""

import collections.abc as c
import logging
import sys
import time
import typing as t

import requests
from datasets import DatasetDict, load_dataset
from datasets.exceptions import DatasetsError
from huggingface_hub.errors import HfHubHTTPError
from numpy.random import Generator

from .constants import SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS
from .exceptions import HuggingFaceHubDown, InvalidBenchmark
from .logging_utils import log, no_terminal_output
from .tasks import EUROPEAN_VALUES
from .utils import unscramble

if t.TYPE_CHECKING:
    from datasets import Dataset

    from .data_models import BenchmarkConfig, DatasetConfig


def load_data(
    rng: Generator, dataset_config: "DatasetConfig", benchmark_config: "BenchmarkConfig"
) -> list["DatasetDict"]:
    """Load the raw bootstrapped datasets.

    Args:
        rng:
            The random number generator to use.
        dataset_config:
            The configuration for the dataset.
        benchmark_config:
            The configuration for the benchmark.

    Returns:
        A list of bootstrapped datasets, one for each iteration.

    Raises:
        InvalidBenchmark:
            If the dataset cannot be loaded.
        HuggingFaceHubDown:
            If the Hugging Face Hub is down.
    """
    dataset = load_raw_data(
        dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
    )

    if not benchmark_config.evaluate_test_split and "val" in dataset:
        dataset["test"] = dataset["val"]

    # Remove empty examples from the datasets
    for text_feature in ["tokens", "text"]:
        for split in dataset_config.splits:
            if text_feature in dataset[split].features:
                dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)

    # If we are testing then truncate the test set, unless we need the full set for
    # evaluation
    if hasattr(sys, "_called_from_test") and dataset_config.task != EUROPEAN_VALUES:
        dataset["test"] = dataset["test"].select(range(1))

    # Bootstrap the splits, if applicable
    if dataset_config.bootstrap_samples:
        bootstrapped_splits: dict[str, c.Sequence["Dataset"]] = dict()
        for split in dataset_config.splits:
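            # Draw one row of indices per iteration, each the same length as the split
            # and sampled with replacement (standard bootstrap resampling).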
            bootstrap_indices = rng.integers(
                0,
                len(dataset[split]),
                size=(benchmark_config.num_iterations, len(dataset[split])),
            )
            bootstrapped_splits[split] = [
                dataset[split].select(bootstrap_indices[idx])
                for idx in range(benchmark_config.num_iterations)
            ]
        datasets = [
            DatasetDict(
                {
                    split: bootstrapped_splits[split][idx]
                    for split in dataset_config.splits
                }
            )
            for idx in range(benchmark_config.num_iterations)
        ]
    else:
        datasets = [dataset] * benchmark_config.num_iterations

    return datasets
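
A minimal sketch of the bootstrapping step above, using a toy in-memory dataset; the
variable names and the toy data are illustrative and not part of the module:

    import numpy as np
    from datasets import Dataset, DatasetDict

    rng = np.random.default_rng(seed=4242)
    toy = Dataset.from_dict({"text": ["a", "b", "c", "d"], "label": [0, 1, 0, 1]})
    num_iterations = 3

    # One row of indices per iteration, the same length as the split and sampled
    # with replacement, mirroring the rng.integers call in load_data above.
    indices = rng.integers(0, len(toy), size=(num_iterations, len(toy)))
    bootstrapped = [
        DatasetDict({"test": toy.select(indices[idx])})
        for idx in range(num_iterations)
    ]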


def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
    """Load the raw dataset.

    Args:
        dataset_config:
            The configuration for the dataset.
        cache_dir:
            The directory to cache the dataset.

    Returns:
        The dataset.

    Raises:
        InvalidBenchmark:
            If the dataset cannot be loaded, or if the local data files are invalid or
            missing a required split.
        HuggingFaceHubDown:
            If the Hugging Face Hub is down.
    """
    # Case where the dataset source is a Hugging Face ID
    if isinstance(dataset_config.source, str):
        num_attempts = 5
        for _ in range(num_attempts):
            try:
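                # The source can be of the form "repo_id" or "repo_id::config_name";
                # the part after "::" selects a dataset configuration on the Hub.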
                with no_terminal_output():
                    dataset = load_dataset(
                        path=dataset_config.source.split("::")[0],
                        name=(
                            dataset_config.source.split("::")[1]
                            if "::" in dataset_config.source
                            else None
                        ),
                        cache_dir=cache_dir,
                        token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_"),
                    )
                break
            except (
                FileNotFoundError,
                ConnectionError,
                DatasetsError,
                requests.ConnectionError,
                requests.ReadTimeout,
            ) as e:
                log(
                    f"Failed to load dataset {dataset_config.source!r}, due to "
                    f"the following error: {e}. Retrying...",
                    level=logging.DEBUG,
                )
                time.sleep(1)
                continue
            except HfHubHTTPError:
                raise HuggingFaceHubDown()
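        # This `else` belongs to the for-loop: it only runs if every attempt failed,
        # i.e. if the loop finished without reaching `break`.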
        else:
            raise InvalidBenchmark(
                f"Failed to load dataset {dataset_config.source!r} after "
                f"{num_attempts} attempts. Run with verbose mode to see the individual "
                "errors."
            )

    # Case where the dataset source is a dictionary mapping split names (e.g. "train",
    # "val" and "test") to local data files
    else:
        data_files = {
            split: dataset_config.source[split]
            for split in dataset_config.splits
            if split in dataset_config.source
        }

        # Get the file extension and ensure that all files have the same extension
        file_extensions = {
            split: dataset_config.source[split].split(".")[-1]
            for split in dataset_config.splits
            if split in dataset_config.source
        }
        if len(set(file_extensions.values())) != 1:
            raise InvalidBenchmark(
                "All data files in a custom dataset must have the same file extension. "
                f"Got the extensions {', '.join(file_extensions.values())} for the "
                f"dataset {dataset_config.name!r}."
            )
        file_extension = list(file_extensions.values())[0]

        # Check that the file extension is supported
        if file_extension not in SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS:
            raise InvalidBenchmark(
                "Unsupported file extension for custom dataset. Supported file "
                "extensions are "
                f"{', '.join(SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS)}, but got "
                f"{file_extension!r}."
            )

        # Load the dataset
        with no_terminal_output():
            dataset = load_dataset(
                path=file_extension, data_files=data_files, cache_dir=cache_dir
            )

    assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
    missing_keys = [key for key in dataset_config.splits if key not in dataset]
    if missing_keys:
        raise InvalidBenchmark(
            "The dataset is missing the following required splits: "
            f"{', '.join(missing_keys)}"
        )
    return DatasetDict({key: dataset[key] for key in dataset_config.splits})
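

As an illustrative sketch (not part of the module), loading a custom dataset from local
files follows the same pattern as the local-file branch above, where the file extension
doubles as the builder name passed to load_dataset. The file paths and cache directory
below are hypothetical:

    from datasets import load_dataset

    # "csv" is both the file extension and the name of the datasets builder.
    dataset = load_dataset(
        path="csv",
        data_files={"train": "train.csv", "val": "val.csv", "test": "test.csv"},
        cache_dir=".cache",
    )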