"""Load custom dataset configs."""

import importlib.util
import logging
from pathlib import Path
from types import ModuleType

from huggingface_hub import HfApi

from .data_models import DatasetConfig
from .logging_utils import log_once
from .utils import get_hf_token


def load_custom_datasets_module(custom_datasets_file: Path) -> ModuleType | None:
    """Load the custom datasets module if it exists.

    Args:
        custom_datasets_file:
            The path to the custom datasets module.

    Returns:
        The loaded module, or None if the file does not exist or could not be
        loaded (an error is logged in the latter case).
    """
    if not custom_datasets_file.exists():
        return None
    spec = importlib.util.spec_from_file_location(
        name="custom_datasets_module", location=str(custom_datasets_file.resolve())
    )
    if spec is None:
        log_once(
            "Could not load the spec for the custom datasets file from "
            f"{custom_datasets_file.resolve()}.",
            level=logging.ERROR,
        )
        return None
    module = importlib.util.module_from_spec(spec=spec)
    if spec.loader is None:
        log_once(
            "Could not load the module for the custom datasets file from "
            f"{custom_datasets_file.resolve()}.",
            level=logging.ERROR,
        )
        return None
    spec.loader.exec_module(module)
    return module
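

# Example usage (hypothetical path; the custom datasets file just needs to
# define a `DatasetConfig` instance at module level):
#
#     module = load_custom_datasets_module(Path("custom_datasets.py"))
#     if module is not None:
#         configs = [
#             obj for obj in vars(module).values()
#             if isinstance(obj, DatasetConfig)
#         ]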


def try_get_dataset_config_from_repo(
    dataset_id: str, api_key: str | None, cache_dir: Path
) -> DatasetConfig | None:
    """Try to get a dataset config from a Hugging Face dataset repository.

    Args:
        dataset_id:
            The ID of the dataset to get the config for.
        api_key:
            The Hugging Face API key to use when checking whether the repository
            has a custom dataset config.
        cache_dir:
            The directory to store the cache in.

    Returns:
        The dataset config if it exists, otherwise None.
    """
    # Check that the dataset ID refers to an existing Hugging Face dataset
    # repository; abort if it doesn't
    token = get_hf_token(api_key=api_key)
    hf_api = HfApi(token=token)
    if not hf_api.repo_exists(repo_id=dataset_id, repo_type="dataset"):
        return None
    # Check if the repository has a euroeval_config.py file; abort if it doesn't
    repo_files = hf_api.list_repo_files(
        repo_id=dataset_id, repo_type="dataset", revision="main"
    )
    if "euroeval_config.py" not in repo_files:
        log_once(
            f"Dataset {dataset_id} does not have a euroeval_config.py file, so we "
            "cannot load it. Skipping.",
            level=logging.WARNING,
        )
        return None
    # Fetch the euroeval_config.py file; abort if loading failed
    external_config_path = cache_dir / "external_dataset_configs" / dataset_id
    external_config_path.mkdir(parents=True, exist_ok=True)
    hf_api.hf_hub_download(
        repo_id=dataset_id,
        repo_type="dataset",
        filename="euroeval_config.py",
        local_dir=external_config_path,
        local_dir_use_symlinks=False,
    )
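    # NB: `local_dir_use_symlinks` is deprecated in recent huggingface_hub
    # releases, where it is ignored and files are always copied into
    # `local_dir`; it is kept above for compatibility with older versions.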
    module = load_custom_datasets_module(
        custom_datasets_file=external_config_path / "euroeval_config.py"
    )
    if module is None:
        return None
    # Check that there is exactly one dataset config; abort if there isn't
    repo_dataset_configs = [
        cfg for cfg in vars(module).values() if isinstance(cfg, DatasetConfig)
    ]
    if not repo_dataset_configs:
        return None  # Already warned the user in this case, so we just skip
    elif len(repo_dataset_configs) > 1:
        log_once(
            f"Dataset {dataset_id} has multiple dataset configurations. Please ensure "
            "that only a single DatasetConfig is defined in the `euroeval_config.py` "
            "file.",
            level=logging.WARNING,
        )
        return None
    # Get the dataset split names
    splits = [
        split["name"]
        for split in hf_api.dataset_info(repo_id=dataset_id).card_data.dataset_info[
            "splits"
        ]
    ]
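    # For each of the train/val/test splits, pick the shortest split name that
    # contains the corresponding substring; e.g., for the splits ["train",
    # "train_unlabelled", "validation", "test"] this yields "train",
    # "validation" and "test".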
    train_split_candidates = sorted(
        [split for split in splits if "train" in split.lower()], key=len
    )
    val_split_candidates = sorted(
        [split for split in splits if "val" in split.lower()], key=len
    )
    test_split_candidates = sorted(
        [split for split in splits if "test" in split.lower()], key=len
    )
    train_split = train_split_candidates[0] if train_split_candidates else None
    val_split = val_split_candidates[0] if val_split_candidates else None
    test_split = test_split_candidates[0] if test_split_candidates else None
    if test_split is None:
        log_once(
            f"Dataset {dataset_id} does not have a test split, so we cannot load it. "
            "Please ensure that the dataset has a test split.",
            level=logging.ERROR,
        )
        return None

    # Set up the config with the repo information
    repo_dataset_config = repo_dataset_configs[0]
    repo_dataset_config.name = dataset_id
    repo_dataset_config.pretty_name = dataset_id
    repo_dataset_config.source = dataset_id
    repo_dataset_config.train_split = train_split
    repo_dataset_config.val_split = val_split
    repo_dataset_config.test_split = test_split
    return repo_dataset_config
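

# Example usage (hypothetical dataset ID and cache directory; passing
# `api_key=None` falls back to a locally stored Hugging Face token, if any):
#
#     config = try_get_dataset_config_from_repo(
#         dataset_id="my-org/my-dataset",
#         api_key=None,
#         cache_dir=Path(".euroeval_cache"),
#     )
#     if config is not None:
#         print(config.name, config.test_split)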