Skip to content

euroeval.custom_dataset_configs

[docs] module euroeval.custom_dataset_configs

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""Load custom dataset configs."""

import importlib.util
import logging
from pathlib import Path
from types import ModuleType

from huggingface_hub import HfApi

from .data_models import DatasetConfig
from .logging_utils import log_once
from .utils import get_hf_token


def load_custom_datasets_module(custom_datasets_file: Path) -> ModuleType | None:
    """Load the custom datasets module if it exists.

    Args:
        custom_datasets_file:
            The path to the custom datasets module.

    Raises:
        RuntimeError:
            If the custom datasets module cannot be loaded.
    """
    if custom_datasets_file.exists():
        spec = importlib.util.spec_from_file_location(
            name="custom_datasets_module", location=str(custom_datasets_file.resolve())
        )
        if spec is None:
            log_once(
                "Could not load the spec for the custom datasets file from "
                f"{custom_datasets_file.resolve()}.",
                level=logging.ERROR,
            )
            return None
        module = importlib.util.module_from_spec(spec=spec)
        if spec.loader is None:
            log_once(
                "Could not load the module for the custom datasets file from "
                f"{custom_datasets_file.resolve()}.",
                level=logging.ERROR,
            )
            return None
        spec.loader.exec_module(module)
        return module
    return None


def try_get_dataset_config_from_repo(
    dataset_id: str, api_key: str | None, cache_dir: Path
) -> DatasetConfig | None:
    """Try to get a dataset config from a Hugging Face dataset repository.

    The repository is expected to contain a ``euroeval_config.py`` file that
    defines exactly one ``DatasetConfig`` instance at module level. The file is
    downloaded into ``cache_dir`` and executed to extract that config, which is
    then populated with the repository's split names.

    Args:
        dataset_id:
            The ID of the dataset to get the config for.
        api_key:
            The Hugging Face API key to use to check if the repositories have custom
            dataset configs.
        cache_dir:
            The directory to store the cache in.

    Returns:
        The dataset config if it exists, otherwise None.
    """
    # Check if the dataset ID is a Hugging Face dataset ID, abort if it isn't
    token = get_hf_token(api_key=api_key)
    hf_api = HfApi(token=token)
    if not hf_api.repo_exists(repo_id=dataset_id, repo_type="dataset"):
        return None

    # Check if the repository has a euroeval_config.py file, abort if it doesn't
    # NOTE: only the `main` revision is inspected, so a config present only on
    # another branch will not be found.
    repo_files = hf_api.list_repo_files(
        repo_id=dataset_id, repo_type="dataset", revision="main"
    )
    if "euroeval_config.py" not in repo_files:
        log_once(
            f"Dataset {dataset_id} does not have a euroeval_config.py file, so we "
            "cannot load it. Skipping.",
            level=logging.WARNING,
        )
        return None

    # Fetch the euroeval_config.py file, abort if loading failed
    external_config_path = cache_dir / "external_dataset_configs" / dataset_id
    external_config_path.mkdir(parents=True, exist_ok=True)
    # NOTE(review): `local_dir_use_symlinks` is deprecated/ignored in recent
    # versions of huggingface_hub — confirm the pinned version still accepts it.
    hf_api.hf_hub_download(
        repo_id=dataset_id,
        repo_type="dataset",
        filename="euroeval_config.py",
        local_dir=external_config_path,
        local_dir_use_symlinks=False,
    )
    # Execute the downloaded file; failures are logged inside the loader and
    # surface here as None.
    module = load_custom_datasets_module(
        custom_datasets_file=external_config_path / "euroeval_config.py"
    )
    if module is None:
        return None

    # Check that there is exactly one dataset config, abort if there isn't
    repo_dataset_configs = [
        cfg for cfg in vars(module).values() if isinstance(cfg, DatasetConfig)
    ]
    if not repo_dataset_configs:
        return None  # Already warned the user in this case, so we just skip
    elif len(repo_dataset_configs) > 1:
        log_once(
            f"Dataset {dataset_id} has multiple dataset configurations. Please ensure "
            "that only a single DatasetConfig is defined in the `euroeval_config.py` "
            "file.",
            level=logging.WARNING,
        )
        return None

    # Get the dataset split names
    # NOTE(review): `card_data` (and its `dataset_info`) can be None or a list
    # when the repo has no/multiple configs in its card — this would raise here
    # rather than return None; confirm against callers' error handling.
    splits = [
        split["name"]
        for split in hf_api.dataset_info(repo_id=dataset_id).card_data.dataset_info[
            "splits"
        ]
    ]
    # For each split kind, pick the SHORTEST split name containing the keyword,
    # so e.g. "train" beats "train_extra" and "val" beats "validation_full".
    train_split_candidates = sorted(
        [split for split in splits if "train" in split.lower()], key=len
    )
    val_split_candidates = sorted(
        [split for split in splits if "val" in split.lower()], key=len
    )
    test_split_candidates = sorted(
        [split for split in splits if "test" in split.lower()], key=len
    )
    train_split = train_split_candidates[0] if train_split_candidates else None
    val_split = val_split_candidates[0] if val_split_candidates else None
    test_split = test_split_candidates[0] if test_split_candidates else None
    # A test split is mandatory; train/val splits may legitimately be absent.
    if test_split is None:
        log_once(
            f"Dataset {dataset_id} does not have a test split, so we cannot load it. "
            "Please ensure that the dataset has a test split.",
            level=logging.ERROR,
        )
        return None

    # Set up the config with the repo information
    # The repo-derived identity and splits override whatever the config file set.
    repo_dataset_config = repo_dataset_configs[0]
    repo_dataset_config.name = dataset_id
    repo_dataset_config.pretty_name = dataset_id
    repo_dataset_config.source = dataset_id
    repo_dataset_config.train_split = train_split
    repo_dataset_config.val_split = val_split
    repo_dataset_config.test_split = test_split

    return repo_dataset_config