2121from tests .utils .global_variables import DATASET_CACHE , TOKENIZER_NAME
2222
2323COMMON_DATASET_LENGTH = 1000
24- COMMON_DATASET_TOKENS = 44883
24+ COMMON_DATASET_TOKENS = 47178
2525COMMON_DATASET_TEXT = {
2626 27 : "`s,uh" ,
2727 30 : "@vo<CO_L" ,
3030 87 : "Uzl,h" ,
3131}
3232COMMON_DATASET_SAMPLES = {
33- 27 : [49152 , 63 , 82 , 11 , 27799 , 49152 ],
34- 30 : [49152 , 31 , 2327 , 27 , 1448 , 62 , 43 , 49152 ],
35- 31 : [49152 , 60 , 55 , 80 , 30 , 85 , 22 , 18 , 49152 ],
36- 77 : [49152 , 13736 , 85 , 52 , 22 , 46 , 5 , 11807 , 49152 ],
37- 87 : [49152 , 52 , 42536 , 11 , 71 , 49152 ],
33+ 27 : [50256 , 63 , 82 , 11 , 7456 , 50256 ],
34+ 30 : [50256 , 31 , 13038 , 27 , 8220 , 62 , 43 , 50256 ],
35+ 31 : [50256 , 60 , 55 , 80 , 30 , 85 , 4790 , 50256 ],
36+ 77 : [50256 , 73 , 44179 , 52 , 22 , 46 , 5 , 8226 , 50256 ],
37+ 87 : [50256 , 52 , 48274 , 11 , 71 , 50256 ],
3838}
3939
4040
@@ -87,10 +87,10 @@ def test_preparator_sharded():
8787
8888 dataset_config = get_dataset_config (config , GPTDatasetFromFileConfig )._load_config ()
8989 Assert .custom (isinstance , dataset_config , BlendedDatasetConfig )
90- Assert .eq (dataset_config .weights , [0.33003587104248827 , 0.3455874161709333 , 0.3243767127865784 ])
90+ Assert .eq (dataset_config .weights , [0.32985713680105133 , 0.34579676968078343 , 0.32434609351816523 ])
9191 datasets_ = [dataset_config_ .build () for dataset_config_ in dataset_config .datasets ]
9292 Assert .eq ([len (dataset ) for dataset in datasets_ ], lengths := [334 , 333 , 333 ])
93- Assert .eq ([dataset .num_tokens for dataset in datasets_ ], [14813 , 15511 , 14559 ])
93+ Assert .eq ([dataset .num_tokens for dataset in datasets_ ], [15562 , 16314 , 15302 ])
9494
9595 hf_dataset = datasets .load_from_disk (hf_path )["train" ]
9696 tokenizer = TokenizerConfig (path = TOKENIZER_NAME ).get_tokenizer ()
@@ -112,14 +112,14 @@ def test_preparator_split():
112112 "training" : {
113113 "type" : "slice" ,
114114 "dataset" : {"type" : "memmap" , "path" : str (path / "shard_0_0.fast_llm_dataset" )},
115- "begin" : 0 ,
115+ "begin" : 0.0 ,
116116 "end" : 0.501 ,
117117 },
118118 "validation" : {
119119 "type" : "slice" ,
120120 "dataset" : {"type" : "memmap" , "path" : str (path / "shard_0_0.fast_llm_dataset" )},
121121 "begin" : 0.501 ,
122- "end" : 1 ,
122+ "end" : 1.0 ,
123123 },
124124 }
125125 Assert .eq (dataset_config , expected_config )
@@ -140,11 +140,11 @@ def test_preparator_split_sharded():
140140 {
141141 "type" : "slice" ,
142142 "dataset" : {"type" : "memmap" , "path" : str (path / "shard_0_1.fast_llm_dataset" )},
143- "begin" : 0 ,
143+ "begin" : 0.0 ,
144144 "end" : 0.5015015015015015 ,
145145 },
146146 ],
147- "weights" : [0.6602629819478494 , 0.3397370180521507 ],
147+ "weights" : [0.6596583442838371 , 0.3403416557161629 ],
148148 },
149149 "validation" : {
150150 "type" : "blended" ,
@@ -153,11 +153,11 @@ def test_preparator_split_sharded():
153153 "type" : "slice" ,
154154 "dataset" : {"type" : "memmap" , "path" : str (path / "shard_0_1.fast_llm_dataset" )},
155155 "begin" : 0.5015015015015015 ,
156- "end" : 1 ,
156+ "end" : 1.0 ,
157157 },
158158 {"type" : "memmap" , "path" : str (path / "shard_0_2.fast_llm_dataset" )},
159159 ],
160- "weights" : [0.3514344262295082 , 0.6485655737704918 ],
160+ "weights" : [0.35125280875058296 , 0.648747191249417 ],
161161 },
162162 }
163163 Assert .eq (dataset_config , expected_config )
@@ -191,7 +191,7 @@ def test_dataset_preparator_from_hub():
191191 tokenizer = preparator_config .tokenizer .get_tokenizer ()
192192
193193 Assert .eq (len (dataset ), len (hf_dataset ), 1319 )
194- Assert .eq (dataset .num_tokens , 179248 )
194+ Assert .eq (dataset .num_tokens , 131610 )
195195 for index in range (0 , 200 , 8 ):
196196 Assert .eq (
197197 tokenizer .detokenize (dataset .get_document (index ).tokens ),
0 commit comments