Skip to content

Commit 7a7129d

Browse files
jlamypoirier and claude authored
Switch testing tokenizer from santacoder to gpt2 (#482)
Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 229fd3f commit 7a7129d

File tree

15 files changed

+229
-228
lines changed

15 files changed

+229
-228
lines changed

fast_llm/data/preparation/gpt_memmap/prepare.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def _load_dataset(self) -> datasets.Dataset:
8080
return dataset
8181

8282
def _get_croissant_metadata(self):
83-
token = huggingface_hub.HfFolder.get_token()
83+
token = huggingface_hub.get_token()
8484
try:
8585
# Retrieve the dataset metadata in croissant format
8686
url = f"https://huggingface.co/api/datasets/{self._config.dataset.path}/croissant"

tests/data/test_blending.py

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -31,25 +31,25 @@ def _get_blending_alt(probs: list[float], num_samples: int) -> tuple[np.ndarray,
3131

3232

3333
GPT_BLENDED_SAMPLES = [
34-
[49152, 46, 10, 819, 19, 45],
35-
[45, 69, 17, 86, 38826, 15],
36-
[49152, 83, 80, 20452, 45, 93],
37-
[15, 25, 51, 31, 32348, 64],
38-
[64, 17, 93, 78, 40, 1793],
39-
[1793, 1, 1746, 38, 27, 58],
40-
[93, 90, 39, 6, 75, 9],
41-
[58, 22885, 93, 37, 92, 76],
34+
[50256, 46, 10, 721, 19, 45],
35+
[45, 69, 17, 86, 92, 0],
36+
[50256, 83, 80, 29, 2, 45],
37+
[0, 15, 25, 51, 31, 27],
38+
[27, 0, 64, 17, 93, 78],
39+
[78, 3955, 43, 1, 1395, 38],
40+
[45, 93, 90, 39, 6, 75],
41+
[38, 27, 58, 40692, 93, 37],
4242
]
4343

4444
GPT_BLENDED_MIXED_SAMPLES = [
45-
[49152, 46, 10, 819, 19, 45],
45+
[50256, 46, 10, 721, 19, 45],
4646
[25492, 15877, 37874, 8570, 31649, 15521],
47-
[45, 69, 17, 86, 38826, 15],
47+
[45, 69, 17, 86, 92, 0],
4848
[3359, 20945, 33437, 32454, 42084, 45942],
49-
[15, 25, 51, 31, 32348, 64],
50-
[64, 17, 93, 78, 40, 1793],
49+
[0, 15, 25, 51, 31, 27],
50+
[27, 0, 64, 17, 93, 78],
5151
[15112, 36731, 47864, 35586, 33356, 37537],
52-
[1793, 1, 1746, 38, 27, 58],
52+
[78, 3955, 43, 1, 1395, 38],
5353
]
5454

5555

tests/data/test_concatenate.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,14 @@
1212
from tests.utils.dataset import get_common_test_dataset
1313

1414
GPT_CONCATENATED_SAMPLES = [
15-
[49152, 46, 10, 819, 19, 45],
16-
[45, 69, 17, 86, 38826, 15],
17-
[15, 25, 51, 31, 32348, 64],
18-
[64, 17, 93, 78, 40, 1793],
19-
[1793, 1, 1746, 38, 27, 58],
20-
[58, 22885, 93, 37, 92, 76],
21-
[76, 29, 19, 17365, 93, 46],
22-
[46, 83, 17211, 1, 785, 1023],
15+
[50256, 46, 10, 721, 19, 45],
16+
[45, 69, 17, 86, 92, 0],
17+
[0, 15, 25, 51, 31, 27],
18+
[27, 0, 64, 17, 93, 78],
19+
[78, 3955, 43, 1, 1395, 38],
20+
[38, 27, 58, 40692, 93, 37],
21+
[37, 92, 76, 29, 19, 29499],
22+
[29499, 93, 46, 83, 27159, 1],
2323
]
2424

2525

tests/data/test_dataset_discovery.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
{"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
2626
{"type": "memmap", "path": "dataset_1.fast_llm_dataset"},
2727
],
28-
"weights": [44883, 43910],
28+
"weights": [47178, 46208],
2929
},
3030
),
3131
(
@@ -39,7 +39,7 @@
3939
{"type": "memmap", "path": "dataset0/dataset_0.fast_llm_dataset"},
4040
{"type": "memmap", "path": "dataset1/dataset_1.fast_llm_dataset"},
4141
],
42-
"weights": [44883, 43910],
42+
"weights": [47178, 46208],
4343
},
4444
),
4545
(
@@ -59,7 +59,7 @@
5959
{"type": "memmap", "path": "dataset/dataset_1.fast_llm_dataset"},
6060
{"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
6161
],
62-
"weights": [43910, 44883],
62+
"weights": [46208, 47178],
6363
},
6464
),
6565
(
@@ -78,10 +78,10 @@
7878
{"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
7979
{"type": "memmap", "path": "dataset_1.fast_llm_dataset"},
8080
],
81-
"weights": [44883, 43910],
81+
"weights": [47178, 46208],
8282
},
8383
],
84-
"weights": [44883, 88793],
84+
"weights": [47178, 93386],
8585
},
8686
),
8787
(
@@ -99,11 +99,11 @@
9999
{"type": "memmap", "path": "dataset/dataset_1.fast_llm_dataset"},
100100
{"type": "memmap", "path": "dataset/dataset_2.fast_llm_dataset"},
101101
],
102-
"weights": [43910, 44883],
102+
"weights": [46208, 47178],
103103
},
104104
{"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
105105
],
106-
"weights": [88793, 44883],
106+
"weights": [93386, 47178],
107107
},
108108
),
109109
(
@@ -130,12 +130,12 @@
130130
{"type": "memmap", "path": "dataset1/dataset3/dataset_2.fast_llm_dataset"},
131131
{"type": "memmap", "path": "dataset1/dataset_1.fast_llm_dataset"},
132132
],
133-
"weights": [44883, 43910],
133+
"weights": [47178, 46208],
134134
},
135135
{"type": "memmap", "path": "dataset2/dataset_3.fast_llm_dataset"},
136136
{"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
137137
],
138-
"weights": [88793, 43910, 44883],
138+
"weights": [93386, 46208, 47178],
139139
},
140140
),
141141
),

tests/data/test_fim.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@
99
from tests.utils.global_variables import TOKENIZER_PATH
1010

1111
GPT_FIM_SAMPLES = [
12-
[46, 10, 819, 19, 45, 88],
13-
[45, 69, 17, 86, 38826, 15],
14-
[86, 89, 32348, 64, 49152, 87],
15-
[64, 17, 93, 78, 40, 1793],
16-
[1793, 1, 1746, 38, 27, 58],
17-
[86, 89, 37, 92, 76, 49152],
18-
[86, 49152, 76, 29, 19, 89],
19-
[86, 49152, 46, 83, 17211, 1],
12+
[46, 10, 721, 19, 45, 88],
13+
[45, 69, 17, 86, 92, 0],
14+
[86, 89, 31, 27, 50256, 87],
15+
[27, 0, 64, 17, 93, 78],
16+
[78, 3955, 43, 1, 1395, 38],
17+
[86, 89, 55, 93, 37, 50256],
18+
[86, 50256, 37, 92, 76, 89],
19+
[86, 89, 1, 50256, 87, 50256],
2020
]
2121

2222

tests/data/test_image_patch.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from tests.data.test_preparator import COMMON_DATASET_LENGTH, COMMON_DATASET_TEXT
1515
from tests.utils.dataset import get_test_dataset_with_image_patches
1616

17-
DATASET_WITH_IMAGE_PATCHES_TOKENS = [55750, 56809, 59145, 59145]
17+
DATASET_WITH_IMAGE_PATCHES_TOKENS = [58021, 59080, 61416, 61416]
1818
DATASET_WITH_IMAGE_PATCHES_IMAGE_MD5 = {
1919
27: [],
2020
30: ["a2c34e404506fe664efcdb520642f260"],
@@ -37,11 +37,11 @@
3737
87: [(17, 4), (15, 12)],
3838
}
3939
DATASET_WITH_IMAGE_PATCHES_SAMPLES = {
40-
27: [49152, 63, 82, 11, 27799, 49152],
41-
30: [49152, 31, 2327, (4, 1), 27, 1448, 62, 43, 49152],
42-
31: [49152, 60, 55, (2, 4), 80, 30, (3, 4), 85, 22, 18, 49152],
43-
77: [49152, 13736, 85, 52, 22, 46, 5, 11807, 49152],
44-
87: [49152, 52, (4, 1), 89, (4, 3), 75, 11, 71, 49152],
40+
27: [50256, 63, 82, 11, 7456, 50256],
41+
30: [50256, 31, 13038, (4, 1), 27, 8220, 62, 43, 50256],
42+
31: [50256, 60, 55, (2, 4), 80, 30, (3, 4), 85, 4790, 50256],
43+
77: [50256, 73, 44179, 52, 22, 46, 5, 8226, 50256],
44+
87: [50256, 52, (4, 1), 89, (4, 3), 75, 11, 71, 50256],
4545
}
4646

4747

tests/data/test_loss_masking_spans.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@
1111
from tests.utils.dataset import get_test_dataset_with_loss_masking_spans
1212
from tests.utils.global_variables import TOKENIZER_NAME
1313

14-
DATASET_WITH_SPAN_TOKENS = 45577
14+
DATASET_WITH_SPAN_TOKENS = 47782
1515
DATASET_WITH_SPAN_SAMPLES = {
16-
27: [49152, 63, 82, 11, 27799, 49152],
17-
30: [49152, 31, 85, 78, 27, 1448, 62, 43, 49152],
18-
31: [49152, 60, 55, 80, 30, 85, 22, 18, 49152],
19-
77: [49152, 73, 80, 85, 52, 22, 46, 5, 88, 78, 49152],
20-
87: [49152, 52, 42536, 11, 71, 49152],
16+
27: [50256, 63, 82, 11, 7456, 50256],
17+
30: [50256, 31, 85, 78, 27, 8220, 62, 43, 50256],
18+
31: [50256, 60, 55, 80, 30, 85, 22, 18, 50256],
19+
77: [50256, 73, 80, 85, 52, 22, 46, 5, 88, 78, 50256],
20+
87: [50256, 52, 48274, 11, 71, 50256],
2121
}
2222
HF_LOSS_MASKING_SPANS = {
2323
27: [[0, 1]],

tests/data/test_preference_spans.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,17 +22,17 @@
2222
87: ["Uz", "l", ",h"],
2323
}
2424
DATASET_WITH_PREFERENCE_SPAN_SAMPLES = {
25-
27: [49152, 63, 82, 11, 49152, 49152, 63, 27799, 49152],
26-
30: [49152, 31, 85, 78, 27, 34, 49152, 49152, 31, 85, 46, 62, 43, 49152],
27-
31: [49152, 60, 55, 80, 30, 85, 49152, 49152, 60, 55, 80, 30, 22, 18, 49152],
28-
77: [49152, 73, 80, 85, 52, 22, 46, 49152, 49152, 73, 5, 11807, 49152],
29-
87: [49152, 52, 89, 75, 49152, 49152, 52, 89, 11, 71, 49152],
25+
27: [50256, 63, 82, 11, 50256, 50256, 63, 7456, 50256],
26+
30: [50256, 31, 85, 78, 27, 34, 50256, 50256, 31, 85, 46, 62, 43, 50256],
27+
31: [50256, 60, 55, 80, 30, 85, 50256, 50256, 60, 55, 80, 30, 4790, 50256],
28+
77: [50256, 73, 44179, 52, 22, 46, 50256, 50256, 73, 5, 8226, 50256],
29+
87: [50256, 52, 89, 75, 50256, 50256, 52, 89, 11, 71, 50256],
3030
}
3131
TOKEN_PREFERENCE_SPANS = {
3232
27: [(2, 5), (7, 9)],
3333
30: [(3, 7), (10, 14)],
34-
31: [(5, 7), (12, 15)],
35-
77: [(2, 8), (10, 13)],
34+
31: [(5, 7), (12, 14)],
35+
77: [(2, 7), (9, 12)],
3636
87: [(3, 5), (8, 11)],
3737
}
3838

tests/data/test_preparator.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from tests.utils.global_variables import DATASET_CACHE, TOKENIZER_NAME
2222

2323
COMMON_DATASET_LENGTH = 1000
24-
COMMON_DATASET_TOKENS = 44883
24+
COMMON_DATASET_TOKENS = 47178
2525
COMMON_DATASET_TEXT = {
2626
27: "`s,uh",
2727
30: "@vo<CO_L",
@@ -30,11 +30,11 @@
3030
87: "Uzl,h",
3131
}
3232
COMMON_DATASET_SAMPLES = {
33-
27: [49152, 63, 82, 11, 27799, 49152],
34-
30: [49152, 31, 2327, 27, 1448, 62, 43, 49152],
35-
31: [49152, 60, 55, 80, 30, 85, 22, 18, 49152],
36-
77: [49152, 13736, 85, 52, 22, 46, 5, 11807, 49152],
37-
87: [49152, 52, 42536, 11, 71, 49152],
33+
27: [50256, 63, 82, 11, 7456, 50256],
34+
30: [50256, 31, 13038, 27, 8220, 62, 43, 50256],
35+
31: [50256, 60, 55, 80, 30, 85, 4790, 50256],
36+
77: [50256, 73, 44179, 52, 22, 46, 5, 8226, 50256],
37+
87: [50256, 52, 48274, 11, 71, 50256],
3838
}
3939

4040

@@ -87,10 +87,10 @@ def test_preparator_sharded():
8787

8888
dataset_config = get_dataset_config(config, GPTDatasetFromFileConfig)._load_config()
8989
Assert.custom(isinstance, dataset_config, BlendedDatasetConfig)
90-
Assert.eq(dataset_config.weights, [0.33003587104248827, 0.3455874161709333, 0.3243767127865784])
90+
Assert.eq(dataset_config.weights, [0.32985713680105133, 0.34579676968078343, 0.32434609351816523])
9191
datasets_ = [dataset_config_.build() for dataset_config_ in dataset_config.datasets]
9292
Assert.eq([len(dataset) for dataset in datasets_], lengths := [334, 333, 333])
93-
Assert.eq([dataset.num_tokens for dataset in datasets_], [14813, 15511, 14559])
93+
Assert.eq([dataset.num_tokens for dataset in datasets_], [15562, 16314, 15302])
9494

9595
hf_dataset = datasets.load_from_disk(hf_path)["train"]
9696
tokenizer = TokenizerConfig(path=TOKENIZER_NAME).get_tokenizer()
@@ -112,14 +112,14 @@ def test_preparator_split():
112112
"training": {
113113
"type": "slice",
114114
"dataset": {"type": "memmap", "path": str(path / "shard_0_0.fast_llm_dataset")},
115-
"begin": 0,
115+
"begin": 0.0,
116116
"end": 0.501,
117117
},
118118
"validation": {
119119
"type": "slice",
120120
"dataset": {"type": "memmap", "path": str(path / "shard_0_0.fast_llm_dataset")},
121121
"begin": 0.501,
122-
"end": 1,
122+
"end": 1.0,
123123
},
124124
}
125125
Assert.eq(dataset_config, expected_config)
@@ -140,11 +140,11 @@ def test_preparator_split_sharded():
140140
{
141141
"type": "slice",
142142
"dataset": {"type": "memmap", "path": str(path / "shard_0_1.fast_llm_dataset")},
143-
"begin": 0,
143+
"begin": 0.0,
144144
"end": 0.5015015015015015,
145145
},
146146
],
147-
"weights": [0.6602629819478494, 0.3397370180521507],
147+
"weights": [0.6596583442838371, 0.3403416557161629],
148148
},
149149
"validation": {
150150
"type": "blended",
@@ -153,11 +153,11 @@ def test_preparator_split_sharded():
153153
"type": "slice",
154154
"dataset": {"type": "memmap", "path": str(path / "shard_0_1.fast_llm_dataset")},
155155
"begin": 0.5015015015015015,
156-
"end": 1,
156+
"end": 1.0,
157157
},
158158
{"type": "memmap", "path": str(path / "shard_0_2.fast_llm_dataset")},
159159
],
160-
"weights": [0.3514344262295082, 0.6485655737704918],
160+
"weights": [0.35125280875058296, 0.648747191249417],
161161
},
162162
}
163163
Assert.eq(dataset_config, expected_config)
@@ -191,7 +191,7 @@ def test_dataset_preparator_from_hub():
191191
tokenizer = preparator_config.tokenizer.get_tokenizer()
192192

193193
Assert.eq(len(dataset), len(hf_dataset), 1319)
194-
Assert.eq(dataset.num_tokens, 179248)
194+
Assert.eq(dataset.num_tokens, 131610)
195195
for index in range(0, 200, 8):
196196
Assert.eq(
197197
tokenizer.detokenize(dataset.get_document(index).tokens),

tests/data/test_sampling.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,14 @@
2828

2929

3030
GPT_MEMMAP_SAMPLES = [
31-
[49152, 46, 10, 819, 19, 45],
32-
[45, 69, 17, 86, 38826, 15],
33-
[15, 25, 51, 31, 32348, 64],
34-
[64, 17, 93, 78, 40, 1793],
35-
[1793, 1, 1746, 38, 27, 58],
36-
[58, 22885, 93, 37, 92, 76],
37-
[76, 29, 19, 17365, 93, 46],
38-
[46, 83, 17211, 1, 785, 1023],
31+
[50256, 46, 10, 721, 19, 45],
32+
[45, 69, 17, 86, 92, 0],
33+
[0, 15, 25, 51, 31, 27],
34+
[27, 0, 64, 17, 93, 78],
35+
[78, 3955, 43, 1, 1395, 38],
36+
[38, 27, 58, 40692, 93, 37],
37+
[37, 92, 76, 29, 19, 29499],
38+
[29499, 93, 46, 83, 27159, 1],
3939
]
4040

4141

0 commit comments

Comments (0)