141 | """Constants used throughout the project."""
import re
from typing import TypeVar
from .enums import TaskGroup
# Type variable used for generic typing
T = TypeVar("T", bound=object)
# This is used as input to generative models; it cannot be a special token
DUMMY_FILL_VALUE = 100
# This is the maximum allowed context length for models for the purpose of this
# benchmark. We will still report the models' true maximum context length in the
# metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
# all tokens in the context.
MAX_CONTEXT_LENGTH = 8_192
# We need to raise the number of tokens generated for reasoning models, to give them
# time to think
REASONING_MAX_TOKENS = 8_192
# The Hugging Face Hub pipeline tags used to classify models as generative
GENERATIVE_PIPELINE_TAGS = [
"text-generation",
"text2text-generation",
"image-text-to-text",
"audio-text-to-text",
"video-text-to-text",
]
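# A hedged sketch (not part of this module) of how these tags could be used to classify
# a Hub model as generative, via the `huggingface_hub.model_info` API; the model ID is
# illustrative:
#
#     from huggingface_hub import model_info
#     is_generative = (
#         model_info("some-org/some-model").pipeline_tag in GENERATIVE_PIPELINE_TAGS
#     )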
# Used to disallow non-generative models from being evaluated on these task groups
GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
# Local models are required to have one of these files in their directory
LOCAL_MODELS_REQUIRED_FILES = ["config.json", "adapter_config.json"]
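# A minimal, hypothetical sketch of how this requirement could be checked for a local
# model directory (`model_dir` and the check itself are illustrative, not the project's
# actual logic):
#
#     from pathlib import Path
#     is_local_model = any(
#         (Path(model_dir) / filename).exists()
#         for filename in LOCAL_MODELS_REQUIRED_FILES
#     )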
# The number of top log probabilities to return for generative models. For several APIs
# this is the maximum number of log probabilities that can be returned
MAX_VLLM_LOGPROBS = 20
MAX_LITELLM_LOGPROBS = 8
# We make sure to remove these metric attributes after each iteration, to avoid memory
# leaks
METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
# Hugging Face Hub tags used to classify models as merge models
MERGE_TAGS = ["merge", "mergekit"]
# The minimum required CUDA compute capability for using bfloat16 in vLLM
VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
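# A hedged sketch of how this threshold could be checked with PyTorch (assumes a CUDA
# device is available; illustrative only):
#
#     major, minor = torch.cuda.get_device_capability()
#     bf16_supported = major + minor / 10 >= VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY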
# The candidates for end-of-sequence, beginning-of-sequence and padding tokens
EOS_TOKENS = ["</s>", "<|end_of_text|>", "<|endoftext|>", "[SEP]", "<|return|>"]
BOS_TOKENS = ["<s>", "<|begin_of_text|>", "<|startoftext|>", "[CLS]"]
PAD_TOKENS = [
"<pad>",
"<PAD>",
"[pad]",
"[PAD]",
"<|endoftext|>",
"<|end▁of▁sentence|>",
"<|im_end|>",
]
# Used to detect whether a model is a reasoning model
REASONING_TOKENS: list[tuple[str | re.Pattern, str | re.Pattern]] = [
("<think>", "</think>"),
("<reason>", "</reason>"),
("<reasoning>", "</reasoning>"),
(
re.compile(pattern=r"<\|channel\|>(analysis|commentary)<\|message\|>"),
"<|channel|>final<|message|>",
),
]
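# A minimal sketch of how these start/end pairs might be used to detect reasoning
# output (the helper below is hypothetical and only illustrates the intent of the
# constant):
#
#     def _looks_like_reasoning(text: str) -> bool:
#         for start_token, _ in REASONING_TOKENS:
#             pattern = (
#                 start_token
#                 if isinstance(start_token, re.Pattern)
#                 else re.compile(re.escape(start_token))
#             )
#             if pattern.search(text):
#                 return True
#         return False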
# These tokens are sometimes used by models to indicate the end of a generated
# response, but are not registered as proper EOS tokens, so we have to handle them
# manually. We only use them as stop tokens if they actually appear in the model's
# output
CUSTOM_STOP_TOKENS = ["<sep>"]
# For classification tasks we force LiteLLM models to output a JSON dictionary with a
# single key, whose value is restricted to the allowed labels. This is the name of that
# key
LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
# These characters are stripped from JSON output when trying to identify the label
JSON_STRIP_CHARACTERS = ' {}\n\r":'
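# A hedged example of the intended clean-up (the raw output below is illustrative):
# given a model response like '{"label": "positive"}', the substring following the key
# can be cleaned with
#
#     ' "positive"}'.strip(JSON_STRIP_CHARACTERS)  # -> 'positive'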
# The number of tokens we generate when evaluating generative models on classification
# tasks. We also use this to determine whether we should store logprobs in the model
# outputs (and cache).
NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
# We only allow loading local datasets in these file formats
SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS = ["csv"]
# These are default generation parameters, and can be overridden if a generative model
# has a `generation_config.json` file in its repository
GENERATION_KWARGS = {
"temperature": 0.0,
"top_p": 1.0,
"top_k": 0,
"repetition_penalty": 1.0,
}
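# A hedged sketch of how a model-specific `generation_config.json` could override these
# defaults (`model_generation_config` and the merge below are illustrative, not the
# project's actual loading logic):
#
#     generation_kwargs = GENERATION_KWARGS | model_generation_config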
# This is a mirror of `AttentionBackendEnum` in vLLM, but since we don't have access to
# it when running on CPU/macOS (as we can only run an old vLLM version there), we have
# to define it here
ATTENTION_BACKENDS: list[str] = [
"FLASH_ATTN",
"FLASH_ATTN_DIFFKV",
"TRITON_ATTN",
"ROCM_ATTN",
"ROCM_AITER_MLA",
"ROCM_AITER_TRITON_MLA",
"ROCM_AITER_FA",
"ROCM_AITER_MLA_SPARSE",
"TORCH_SDPA",
"FLASHINFER",
"FLASHINFER_MLA",
"TRITON_MLA",
"CUTLASS_MLA",
"FLASHMLA",
"FLASHMLA_SPARSE",
"FLASH_ATTN_MLA",
"IPEX",
"NO_ATTENTION",
"FLEX_ATTENTION",
"TREE_ATTN",
"ROCM_AITER_UNIFIED_ATTN",
"CPU_ATTN",
"CUSTOM",
]
# If a dataset configuration has more than this number of languages, we won't log any
# of them. This is for instance the case for the speed benchmark, which includes all
# languages. The threshold of 5 is somewhat arbitrary.
MAX_NUMBER_OF_LOGGING_LANGUAGES = 5