"""Constants used throughout the project."""

import re
import typing as t

from .enums import TaskGroup

# Type variable used for generic typing
T = t.TypeVar("T", bound=object)
# This is used as input to generative models; it cannot be a special token
DUMMY_FILL_VALUE = 100
# This is the maximum allowed context length for models for the purpose of this
# benchmark. We will still report the models' true maximum context length in the
# metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
# all tokens in the context.
MAX_CONTEXT_LENGTH = 8_192
# We need to raise the amount of tokens generated for reasoning models, to give them
# time to think
REASONING_MAX_TOKENS = 8_192
# The Hugging Face Hub pipeline tags used to classify models as generative
GENERATIVE_PIPELINE_TAGS = [
    "text-generation",
    "text2text-generation",
    "image-text-to-text",
    "audio-text-to-text",
    "video-text-to-text",
    "any-to-any",
]
# Used to disallow non-generative models to be evaluated on these task groups
GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
# Local models are required to have one of these files in their directory
LOCAL_MODELS_REQUIRED_FILES = ["config.json", "adapter_config.json"]
# The number of top log probabilities to return for generative models. For several APIs
# this is the maximum number of log probabilities that can be returned
MAX_VLLM_LOGPROBS = 20
MAX_LITELLM_LOGPROBS = 8
# We make sure to remove these metric attributes after each iteration, to avoid memory
# leaks
METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
# Hugging Face Hub tags used to classify models as merge models
MERGE_TAGS = ["merge", "mergekit"]
# The minimum required CUDA compute capability for using bfloat16 in vLLM
VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
# Threshold for language confidence scores.
# When a sample's language confidence score is greater than or equal to this value,
# its evaluation score is kept as is. Otherwise, the score is set to 0.
MIN_LANG_CONFIDENCE_SCORE = 0.75
# The candidates for end-of-sequence, beginning-of-sequence and padding tokens
EOS_TOKENS = ["</s>", "<|end_of_text|>", "<|endoftext|>", "[SEP]", "<|return|>"]
BOS_TOKENS = ["<s>", "<|begin_of_text|>", "<|startoftext|>", "[CLS]"]
PAD_TOKENS = [
    "<pad>",
    "<PAD>",
    "[pad]",
    "[PAD]",
    "<|endoftext|>",
    "<|end▁of▁sentence|>",
    "<|im_end|>",
]
# Used to detect whether a model is a reasoning model. Each entry is a
# (start-of-reasoning, end-of-reasoning) pair; either side may be a plain string or a
# compiled regular expression.
REASONING_TOKENS: list[tuple[str | re.Pattern, str | re.Pattern]] = [
    ("<think>", "</think>"),
    ("<reason>", "</reason>"),
    ("<reasoning>", "</reasoning>"),
    (
        re.compile(pattern=r"<\|channel\|>(analysis|commentary)<\|message\|>"),
        "<|channel|>final<|message|>",
    ),
    ("<|START_THINKING|>", "<|END_THINKING|>"),
]
# These tokens are sometimes used by models to indicate the end of a generated
# response, but they do not use them as a proper EOS token, so we have to deal with them
# manually. We only use them as stop tokens if they actually appear in the model's
# output
CUSTOM_STOP_TOKENS = ["<sep>"]
# For classification tasks we force LiteLLM models to output a JSON dictionary with a
# single key and the values being restricted to the allowed labels. This is the key we
# use
LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
# These characters are stripped from JSON output when trying to identify the label
JSON_STRIP_CHARACTERS = ' {}\n\r":'
# The number of tokens we generate when evaluating generative models on classification
# tasks. We also use this to determine whether we should store logprobs in the model
# outputs (and cache).
NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
# We only allow loading local datasets in these file formats
SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS = ["csv"]
# These are default generation parameters, and can be overridden if a generative model
# has a `generation_config.json` file in its repository
GENERATION_KWARGS = {
    "temperature": 0.0,
    "top_p": 1.0,
    "top_k": 0,
    "repetition_penalty": 1.0,
}
# This is a mirror of `AttentionBackendEnum` in vLLM, but since we don't have access to
# this when running on CPU/MacOS (as we can only run an old vLLM version), we have to
# define it here
ATTENTION_BACKENDS: list[str] = [
    "FLASH_ATTN",
    "FLASH_ATTN_DIFFKV",
    "TRITON_ATTN",
    "ROCM_ATTN",
    "ROCM_AITER_MLA",
    "ROCM_AITER_TRITON_MLA",
    "ROCM_AITER_FA",
    "ROCM_AITER_MLA_SPARSE",
    "TORCH_SDPA",
    "FLASHINFER",
    "FLASHINFER_MLA",
    "TRITON_MLA",
    "CUTLASS_MLA",
    "FLASHMLA",
    "FLASHMLA_SPARSE",
    "FLASH_ATTN_MLA",
    "IPEX",
    "NO_ATTENTION",
    "FLEX_ATTENTION",
    "TREE_ATTN",
    "ROCM_AITER_UNIFIED_ATTN",
    "CPU_ATTN",
    "CUSTOM",
]
# If a dataset configuration has more than this number of languages, we won't log any of
# the languages. This is for instance the case for the speed benchmark, which has all
# the languages. The threshold of 5 is somewhat arbitrary.
MAX_NUMBER_OF_LOGGING_LANGUAGES = 5
# Language-specific label for the choices section in multiple-choice datasets. The
# keys are ISO 639-1 language codes.
CHOICES_MAPPING: dict[str, str] = {
    "ab": "Choices",
    "bg": "Възможности",
    "be": "Варыянты",
    "ca": "Opcions",
    "cs": "Výběr",
    "da": "Svarmuligheder",
    "de": "Antwortmöglichkeiten",
    "el": "Επιλογές",
    "en": "Choices",
    "es": "Opciones",
    "et": "Vastusevariandid",
    "fi": "Vastausvaihtoehdot",
    "fo": "Svarmøguleikar",
    "fr": "Choix",
    "hr": "Izbori",
    "hu": "Válaszlehetőségek",
    "is": "Svarmöguleikar",
    "it": "Scelte",
    "lt": "Pasirinkimai",
    "lv": "Izvēles",
    "nl": "Antwoordopties",
    "no": "Svaralternativer",
    "pl": "Opcje",
    "pt": "Opções",
    "ro": "Opțiuni",
    "sk": "Možnosti",
    "sl": "Možnosti",
    "sq": "Opsione",
    "sr": "Opcije",
    "sv": "Svarsalternativ",
    "uk": "Варіанти",
}
# Constants for the tool-calling task
TOOL_CALLING_CALLS_KEY = "tool_calls"
TOOL_CALLING_FUNCTION_KEY = "function"
TOOL_CALLING_ARGUMENTS_KEY = "arguments"
TOOL_CALLING_KEYS = [TOOL_CALLING_FUNCTION_KEY, TOOL_CALLING_ARGUMENTS_KEY]
# Every Eval Ever (EEE) schema version used when serialising benchmark results
EEE_SCHEMA_VERSION = "0.2.1"