# Module: euroeval.custom_dataset_configs
"""Load custom dataset configs.

This module provides the main entry point for loading dataset configurations from
Hugging Face repositories, including Python-based configs. YAML-specific loading
logic lives in the `yaml_config` module.
"""

import importlib.util
import logging
import sys
from pathlib import Path
from types import ModuleType

from huggingface_hub import HfApi

from .data_models import DatasetConfig
from .logging_utils import log_once
from .split_utils import get_repo_splits
from .utils import get_hf_token
from .yaml_config import load_yaml_config


def load_custom_datasets_module(custom_datasets_file: Path) -> ModuleType | None:
    """Load the custom datasets module if it exists.

    Args:
        custom_datasets_file:
            The path to the custom datasets module.

    Returns:
        The custom datasets module, or None if it does not exist.
    """
    if custom_datasets_file.exists():
        spec = importlib.util.spec_from_file_location(
            name="custom_datasets_module", location=str(custom_datasets_file.resolve())
        )
        if spec is None:
            log_once(
                message=(
                    "Could not load the spec for the custom datasets file from "
                    f"{custom_datasets_file.resolve()}."
                ),
                level=logging.ERROR,
            )
            return None
        module = importlib.util.module_from_spec(spec=spec)
        if spec.loader is None:
            log_once(
                message=(
                    "Could not load the module for the custom datasets file from "
                    f"{custom_datasets_file.resolve()}."
                ),
                level=logging.ERROR,
            )
            return None
        spec.loader.exec_module(module)
        return module
    return None


def try_get_dataset_config_from_repo(
    dataset_id: str,
    api_key: str | None,
    cache_dir: Path,
    trust_remote_code: bool,
    run_with_cli: bool,
) -> DatasetConfig | None:
    """Try to get a dataset config from a Hugging Face dataset repository.

    A YAML config file (`eval.yaml`) takes priority, since it can be loaded
    without executing any remote code. When no YAML file is found we fall back
    to `euroeval_config.py`, which requires `trust_remote_code=True`.

    Args:
        dataset_id:
            The ID of the dataset to get the config for.
        api_key:
            The Hugging Face API key to use to check if the repositories have
            custom dataset configs.
        cache_dir:
            The directory to store the cache in.
        trust_remote_code:
            Whether to trust remote code. Only required when loading a Python
            config (`euroeval_config.py`). YAML configs never require this flag.
        run_with_cli:
            Whether the code is being run with the CLI.

    Returns:
        The dataset config if it exists, otherwise None.
    """
    hub = HfApi(token=get_hf_token(api_key=api_key))

    # Bail out early if there is no such dataset repository at all.
    if not hub.repo_exists(repo_id=dataset_id, repo_type="dataset"):
        return None

    files_in_repo = list(
        hub.list_repo_files(repo_id=dataset_id, repo_type="dataset", revision="main")
    )

    # Prefer the safe, declarative YAML config when available.
    if "eval.yaml" in files_in_repo:
        return load_yaml_config(hf_api=hub, dataset_id=dataset_id, cache_dir=cache_dir)

    # Otherwise fall back to the Python config, which needs remote-code trust.
    return load_python_config(
        hf_api=hub,
        dataset_id=dataset_id,
        cache_dir=cache_dir,
        trust_remote_code=trust_remote_code,
        run_with_cli=run_with_cli,
    )


def load_python_config(
    hf_api: HfApi,
    dataset_id: str,
    cache_dir: Path,
    trust_remote_code: bool,
    run_with_cli: bool,
) -> DatasetConfig | None:
    """Load a dataset config from a euroeval_config.py file in a Hugging Face repo.

    Args:
        hf_api:
            The Hugging Face API object.
        dataset_id:
            The ID of the dataset to get the config for.
        cache_dir:
            The directory to store the cache in.
        trust_remote_code:
            Whether to trust remote code.
        run_with_cli:
            Whether the code is being run with the CLI.

    Returns:
        The dataset config if it exists, otherwise None.
    """
    files_in_repo = list(
        hf_api.list_repo_files(repo_id=dataset_id, repo_type="dataset", revision="main")
    )

    # Without a Python config file there is nothing left to load, since the
    # YAML path has already been ruled out by the caller.
    if "euroeval_config.py" not in files_in_repo:
        log_once(
            message=(
                f"Dataset {dataset_id} does not have a euroeval_config.py or a YAML "
                "config file (eval.yaml), so we cannot load it. Skipping."
            ),
            level=logging.WARNING,
        )
        return None

    # Executing a repo-provided Python file is remote code execution, so we
    # refuse outright unless the user has opted in.
    if not trust_remote_code:
        if run_with_cli:
            hint = "the --trust-remote-code flag"
        else:
            hint = "`trust_remote_code=True`"
        log_once(
            message=(
                f"The dataset {dataset_id} exists on the Hugging Face Hub and has a "
                "euroeval_config.py file, but remote code is not allowed. Please "
                f"rerun this with {hint} if you trust the code in this "
                "repository."
            ),
            level=logging.ERROR,
        )
        sys.exit(1)

    # Download the config file into a per-dataset cache directory.
    config_dir = cache_dir / "external_dataset_configs" / dataset_id
    config_dir.mkdir(parents=True, exist_ok=True)
    hf_api.hf_hub_download(
        repo_id=dataset_id,
        repo_type="dataset",
        filename="euroeval_config.py",
        local_dir=config_dir,
        local_dir_use_symlinks=False,
    )

    config_module = load_custom_datasets_module(
        custom_datasets_file=config_dir / "euroeval_config.py"
    )
    if config_module is None:
        return None

    # Collect every DatasetConfig instance defined at module level; exactly
    # one is expected.
    found_configs = [
        obj for obj in config_module.__dict__.values() if isinstance(obj, DatasetConfig)
    ]
    if not found_configs:
        return None
    if len(found_configs) > 1:
        log_once(
            message=(
                f"Dataset {dataset_id} has multiple dataset configurations. Please "
                "ensure that only a single DatasetConfig is defined in the "
                "`euroeval_config.py` file."
            ),
            level=logging.WARNING,
        )
        return None

    train_split, val_split, test_split = get_repo_splits(
        hf_api=hf_api, dataset_id=dataset_id
    )

    # A test split is mandatory for evaluation.
    if test_split is None:
        log_once(
            message=(
                f"Dataset {dataset_id} does not have a test split, so we cannot load "
                "it. Please ensure that the dataset has a test split."
            ),
            level=logging.ERROR,
        )
        return None

    # If only a validation split exists, promote it to the training split.
    if train_split is None and val_split is not None:
        log_once(
            message=(
                f"Dataset {dataset_id!r} has no training split. Using the validation "
                f"split {val_split!r} as the training split instead."
            ),
            level=logging.DEBUG,
        )
        train_split, val_split = val_split, None

    config = found_configs[0]
    config.name = dataset_id
    config.pretty_name = dataset_id
    config.source = dataset_id
    config.train_split = train_split
    config.val_split = val_split
    config.test_split = test_split
    return config