fast_llm/data/preparation/gpt_memmap/prepare.py (1 addition & 1 deletion)

```diff
@@ -80,7 +80,7 @@ def _load_dataset(self) -> datasets.Dataset:
         return dataset
 
     def _get_croissant_metadata(self):
-        token = huggingface_hub.HfFolder.get_token()
+        token = huggingface_hub.get_token()
        try:
             # Retrieve the dataset metadata in croissant format
             url = f"https://huggingface.co/api/datasets/{self._config.dataset.path}/croissant"
```
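Context for the one-line change above: `huggingface_hub` deprecated the class-based `HfFolder.get_token()` accessor in favor of the module-level `get_token()` helper, which reads the `HF_TOKEN` environment variable and falls back to the locally saved token. A minimal sketch of the call pattern, using an illustrative standalone function rather than the repository's method:

```python
# Standalone sketch of the updated token lookup; fetch_croissant_metadata is
# a hypothetical name, not a function from this repository.
import requests
from huggingface_hub import get_token  # replaces the deprecated HfFolder.get_token()


def fetch_croissant_metadata(dataset_path: str) -> dict:
    # get_token() returns the saved Hub token, or None when not logged in.
    token = get_token()
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    url = f"https://huggingface.co/api/datasets/{dataset_path}/croissant"
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return response.json()
```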
tests/data/test_blending.py (13 additions & 13 deletions)

```diff
@@ -31,25 +31,25 @@ def _get_blending_alt(probs: list[float], num_samples: int) -> tuple[np.ndarray,
 
 
 GPT_BLENDED_SAMPLES = [
-    [49152, 46, 10, 819, 19, 45],
-    [45, 69, 17, 86, 38826, 15],
-    [49152, 83, 80, 20452, 45, 93],
-    [15, 25, 51, 31, 32348, 64],
-    [64, 17, 93, 78, 40, 1793],
-    [1793, 1, 1746, 38, 27, 58],
-    [93, 90, 39, 6, 75, 9],
-    [58, 22885, 93, 37, 92, 76],
+    [50256, 46, 10, 721, 19, 45],
+    [45, 69, 17, 86, 92, 0],
+    [50256, 83, 80, 29, 2, 45],
+    [0, 15, 25, 51, 31, 27],
+    [27, 0, 64, 17, 93, 78],
+    [78, 3955, 43, 1, 1395, 38],
+    [45, 93, 90, 39, 6, 75],
+    [38, 27, 58, 40692, 93, 37],
 ]
 
 GPT_BLENDED_MIXED_SAMPLES = [
-    [49152, 46, 10, 819, 19, 45],
+    [50256, 46, 10, 721, 19, 45],
     [25492, 15877, 37874, 8570, 31649, 15521],
-    [45, 69, 17, 86, 38826, 15],
+    [45, 69, 17, 86, 92, 0],
     [3359, 20945, 33437, 32454, 42084, 45942],
-    [15, 25, 51, 31, 32348, 64],
-    [64, 17, 93, 78, 40, 1793],
+    [0, 15, 25, 51, 31, 27],
+    [27, 0, 64, 17, 93, 78],
     [15112, 36731, 47864, 35586, 33356, 37537],
-    [1793, 1, 1746, 38, 27, 58],
+    [78, 3955, 43, 1, 1395, 38],
 ]
 
 
```
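The fixture updates in this and the files below all follow one pattern: the sentinel id 49152, the end-of-document marker of the previous test tokenizer (whose vocabulary evidently held 49152 tokens), becomes 50256, the id of GPT-2's `<|endoftext|>` token, and the surrounding ids shift to the new vocabulary. A sketch of how such expected-token fixtures could be regenerated, under the assumption that the new test tokenizer is stock GPT-2 (the repository may configure it differently):

```python
# Hypothetical fixture-regeneration helper, assuming the new test tokenizer
# is GPT-2 (eos_token_id == 50256, i.e. "<|endoftext|>"). Bracketing each
# document with the sentinel is inferred from the fixtures' shape.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
assert tokenizer.eos_token_id == 50256


def expected_tokens(text: str) -> list[int]:
    # One end-of-document id on each side, matching the [50256, ..., 50256]
    # shape of the sample fixtures in these tests.
    return [tokenizer.eos_token_id, *tokenizer.encode(text), tokenizer.eos_token_id]


# e.g. expected_tokens("`s,uh") should reproduce the test_preparator.py
# fixture [50256, 63, 82, 11, 7456, 50256] if these assumptions hold.
```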
tests/data/test_concatenate.py (8 additions & 8 deletions)

```diff
@@ -12,14 +12,14 @@
 from tests.utils.dataset import get_common_test_dataset
 
 GPT_CONCATENATED_SAMPLES = [
-    [49152, 46, 10, 819, 19, 45],
-    [45, 69, 17, 86, 38826, 15],
-    [15, 25, 51, 31, 32348, 64],
-    [64, 17, 93, 78, 40, 1793],
-    [1793, 1, 1746, 38, 27, 58],
-    [58, 22885, 93, 37, 92, 76],
-    [76, 29, 19, 17365, 93, 46],
-    [46, 83, 17211, 1, 785, 1023],
+    [50256, 46, 10, 721, 19, 45],
+    [45, 69, 17, 86, 92, 0],
+    [0, 15, 25, 51, 31, 27],
+    [27, 0, 64, 17, 93, 78],
+    [78, 3955, 43, 1, 1395, 38],
+    [38, 27, 58, 40692, 93, 37],
+    [37, 92, 76, 29, 19, 29499],
+    [29499, 93, 46, 83, 27159, 1],
 ]
 
 
```
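One property of these fixtures worth noting (the same lists reappear as GPT_MEMMAP_SAMPLES in tests/data/test_sampling.py below): with `sequence_length=5`, each expected sample carries six token ids, and consecutive samples overlap by one id (`[..., 19, 45]` is followed by `[45, 69, ...]`) because a sample's last input token is the next sample's first. A minimal sketch of that slicing; this is the standard GPT sampling layout, not code lifted from the repository:

```python
# Sketch: fixed-length sampling over a concatenated token stream, where each
# sample holds sequence_length + 1 ids (inputs plus the one-step-shifted
# targets), so adjacent samples share a boundary token.
def split_into_samples(tokens: list[int], sequence_length: int) -> list[list[int]]:
    samples = []
    for start in range(0, len(tokens) - sequence_length, sequence_length):
        samples.append(tokens[start : start + sequence_length + 1])
    return samples


# The first 16 ids implied by the fixtures reproduce the first three samples:
stream = [50256, 46, 10, 721, 19, 45, 69, 17, 86, 92, 0, 15, 25, 51, 31, 27]
print(split_into_samples(stream, 5))
# [[50256, 46, 10, 721, 19, 45], [45, 69, 17, 86, 92, 0], [0, 15, 25, 51, 31, 27]]
```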
tests/data/test_dataset_discovery.py (9 additions & 9 deletions)

```diff
@@ -25,7 +25,7 @@
 {"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
 {"type": "memmap", "path": "dataset_1.fast_llm_dataset"},
 ],
-"weights": [44883, 43910],
+"weights": [47178, 46208],
 },
 ),
 (
@@ -39,7 +39,7 @@
 {"type": "memmap", "path": "dataset0/dataset_0.fast_llm_dataset"},
 {"type": "memmap", "path": "dataset1/dataset_1.fast_llm_dataset"},
 ],
-"weights": [44883, 43910],
+"weights": [47178, 46208],
 },
 ),
 (
@@ -59,7 +59,7 @@
 {"type": "memmap", "path": "dataset/dataset_1.fast_llm_dataset"},
 {"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
 ],
-"weights": [43910, 44883],
+"weights": [46208, 47178],
 },
 ),
 (
@@ -78,10 +78,10 @@
 {"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
 {"type": "memmap", "path": "dataset_1.fast_llm_dataset"},
 ],
-"weights": [44883, 43910],
+"weights": [47178, 46208],
 },
 ],
-"weights": [44883, 88793],
+"weights": [47178, 93386],
 },
 ),
 (
@@ -99,11 +99,11 @@
 {"type": "memmap", "path": "dataset/dataset_1.fast_llm_dataset"},
 {"type": "memmap", "path": "dataset/dataset_2.fast_llm_dataset"},
 ],
-"weights": [43910, 44883],
+"weights": [46208, 47178],
 },
 {"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
 ],
-"weights": [88793, 44883],
+"weights": [93386, 47178],
 },
 ),
 (
@@ -130,12 +130,12 @@
 {"type": "memmap", "path": "dataset1/dataset3/dataset_2.fast_llm_dataset"},
 {"type": "memmap", "path": "dataset1/dataset_1.fast_llm_dataset"},
 ],
-"weights": [44883, 43910],
+"weights": [47178, 46208],
 },
 {"type": "memmap", "path": "dataset2/dataset_3.fast_llm_dataset"},
 {"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
 ],
-"weights": [88793, 43910, 44883],
+"weights": [93386, 46208, 47178],
 },
 ),
 ),
```
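A consistency check on the new weights: they are unnormalized token counts (dataset_0 carries 47178 tokens, matching COMMON_DATASET_TOKENS in test_preparator.py; dataset_1 carries 46208), and a nested blend's weight is the sum of its children's, hence 47178 + 46208 = 93386. A small illustrative check; the helper below is not part of the repository:

```python
# Illustrative check that a blend's weight is the sum of its datasets' token
# counts; the counts come from the fixtures above.
DATASET_TOKENS = {"dataset_0": 47178, "dataset_1": 46208}


def blend_weight(node: dict) -> int:
    if node["type"] == "memmap":
        # Strip directories and the .fast_llm_dataset suffix to get the name.
        name = node["path"].rsplit("/", 1)[-1].removesuffix(".fast_llm_dataset")
        return DATASET_TOKENS[name]
    return sum(blend_weight(child) for child in node["datasets"])


blend = {
    "type": "blended",
    "datasets": [
        {"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
        {"type": "memmap", "path": "dataset_1.fast_llm_dataset"},
    ],
}
assert blend_weight(blend) == 93386  # the parent weight in the nested cases
```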
tests/data/test_fim.py (8 additions & 8 deletions)

```diff
@@ -9,14 +9,14 @@
 from tests.utils.global_variables import TOKENIZER_PATH
 
 GPT_FIM_SAMPLES = [
-    [46, 10, 819, 19, 45, 88],
-    [45, 69, 17, 86, 38826, 15],
-    [86, 89, 32348, 64, 49152, 87],
-    [64, 17, 93, 78, 40, 1793],
-    [1793, 1, 1746, 38, 27, 58],
-    [86, 89, 37, 92, 76, 49152],
-    [86, 49152, 76, 29, 19, 89],
-    [86, 49152, 46, 83, 17211, 1],
+    [46, 10, 721, 19, 45, 88],
+    [45, 69, 17, 86, 92, 0],
+    [86, 89, 31, 27, 50256, 87],
+    [27, 0, 64, 17, 93, 78],
+    [78, 3955, 43, 1, 1395, 38],
+    [86, 89, 55, 93, 37, 50256],
+    [86, 50256, 37, 92, 76, 89],
+    [86, 89, 1, 50256, 87, 50256],
 ]
 
 
```
tests/data/test_image_patch.py (6 additions & 6 deletions)

```diff
@@ -14,7 +14,7 @@
 from tests.data.test_preparator import COMMON_DATASET_LENGTH, COMMON_DATASET_TEXT
 from tests.utils.dataset import get_test_dataset_with_image_patches
 
-DATASET_WITH_IMAGE_PATCHES_TOKENS = [55750, 56809, 59145, 59145]
+DATASET_WITH_IMAGE_PATCHES_TOKENS = [58021, 59080, 61416, 61416]
 DATASET_WITH_IMAGE_PATCHES_IMAGE_MD5 = {
     27: [],
     30: ["a2c34e404506fe664efcdb520642f260"],
@@ -37,11 +37,11 @@
     87: [(17, 4), (15, 12)],
 }
 DATASET_WITH_IMAGE_PATCHES_SAMPLES = {
-    27: [49152, 63, 82, 11, 27799, 49152],
-    30: [49152, 31, 2327, (4, 1), 27, 1448, 62, 43, 49152],
-    31: [49152, 60, 55, (2, 4), 80, 30, (3, 4), 85, 22, 18, 49152],
-    77: [49152, 13736, 85, 52, 22, 46, 5, 11807, 49152],
-    87: [49152, 52, (4, 1), 89, (4, 3), 75, 11, 71, 49152],
+    27: [50256, 63, 82, 11, 7456, 50256],
+    30: [50256, 31, 13038, (4, 1), 27, 8220, 62, 43, 50256],
+    31: [50256, 60, 55, (2, 4), 80, 30, (3, 4), 85, 4790, 50256],
+    77: [50256, 73, 44179, 52, 22, 46, 5, 8226, 50256],
+    87: [50256, 52, (4, 1), 89, (4, 3), 75, 11, 71, 50256],
 }
 
 
```
tests/data/test_loss_masking_spans.py (6 additions & 6 deletions)

```diff
@@ -11,13 +11,13 @@
 from tests.utils.dataset import get_test_dataset_with_loss_masking_spans
 from tests.utils.global_variables import TOKENIZER_NAME
 
-DATASET_WITH_SPAN_TOKENS = 45577
+DATASET_WITH_SPAN_TOKENS = 47782
 DATASET_WITH_SPAN_SAMPLES = {
-    27: [49152, 63, 82, 11, 27799, 49152],
-    30: [49152, 31, 85, 78, 27, 1448, 62, 43, 49152],
-    31: [49152, 60, 55, 80, 30, 85, 22, 18, 49152],
-    77: [49152, 73, 80, 85, 52, 22, 46, 5, 88, 78, 49152],
-    87: [49152, 52, 42536, 11, 71, 49152],
+    27: [50256, 63, 82, 11, 7456, 50256],
+    30: [50256, 31, 85, 78, 27, 8220, 62, 43, 50256],
+    31: [50256, 60, 55, 80, 30, 85, 22, 18, 50256],
+    77: [50256, 73, 80, 85, 52, 22, 46, 5, 88, 78, 50256],
+    87: [50256, 52, 48274, 11, 71, 50256],
 }
 HF_LOSS_MASKING_SPANS = {
     27: [[0, 1]],
```
tests/data/test_preference_spans.py (7 additions & 7 deletions)

```diff
@@ -22,17 +22,17 @@
     87: ["Uz", "l", ",h"],
 }
 DATASET_WITH_PREFERENCE_SPAN_SAMPLES = {
-    27: [49152, 63, 82, 11, 49152, 49152, 63, 27799, 49152],
-    30: [49152, 31, 85, 78, 27, 34, 49152, 49152, 31, 85, 46, 62, 43, 49152],
-    31: [49152, 60, 55, 80, 30, 85, 49152, 49152, 60, 55, 80, 30, 22, 18, 49152],
-    77: [49152, 73, 80, 85, 52, 22, 46, 49152, 49152, 73, 5, 11807, 49152],
-    87: [49152, 52, 89, 75, 49152, 49152, 52, 89, 11, 71, 49152],
+    27: [50256, 63, 82, 11, 50256, 50256, 63, 7456, 50256],
+    30: [50256, 31, 85, 78, 27, 34, 50256, 50256, 31, 85, 46, 62, 43, 50256],
+    31: [50256, 60, 55, 80, 30, 85, 50256, 50256, 60, 55, 80, 30, 4790, 50256],
+    77: [50256, 73, 44179, 52, 22, 46, 50256, 50256, 73, 5, 8226, 50256],
+    87: [50256, 52, 89, 75, 50256, 50256, 52, 89, 11, 71, 50256],
 }
 TOKEN_PREFERENCE_SPANS = {
     27: [(2, 5), (7, 9)],
     30: [(3, 7), (10, 14)],
-    31: [(5, 7), (12, 15)],
-    77: [(2, 8), (10, 13)],
+    31: [(5, 7), (12, 14)],
+    77: [(2, 7), (9, 12)],
     87: [(3, 5), (8, 11)],
 }
 
```
tests/data/test_preparator.py (15 additions & 15 deletions)

```diff
@@ -21,7 +21,7 @@
 from tests.utils.global_variables import DATASET_CACHE, TOKENIZER_NAME
 
 COMMON_DATASET_LENGTH = 1000
-COMMON_DATASET_TOKENS = 44883
+COMMON_DATASET_TOKENS = 47178
 COMMON_DATASET_TEXT = {
     27: "`s,uh",
     30: "@vo<CO_L",
@@ -30,11 +30,11 @@
     87: "Uzl,h",
 }
 COMMON_DATASET_SAMPLES = {
-    27: [49152, 63, 82, 11, 27799, 49152],
-    30: [49152, 31, 2327, 27, 1448, 62, 43, 49152],
-    31: [49152, 60, 55, 80, 30, 85, 22, 18, 49152],
-    77: [49152, 13736, 85, 52, 22, 46, 5, 11807, 49152],
-    87: [49152, 52, 42536, 11, 71, 49152],
+    27: [50256, 63, 82, 11, 7456, 50256],
+    30: [50256, 31, 13038, 27, 8220, 62, 43, 50256],
+    31: [50256, 60, 55, 80, 30, 85, 4790, 50256],
+    77: [50256, 73, 44179, 52, 22, 46, 5, 8226, 50256],
+    87: [50256, 52, 48274, 11, 71, 50256],
 }
 
 
@@ -87,10 +87,10 @@ def test_preparator_sharded():
 
     dataset_config = get_dataset_config(config, GPTDatasetFromFileConfig)._load_config()
     Assert.custom(isinstance, dataset_config, BlendedDatasetConfig)
-    Assert.eq(dataset_config.weights, [0.33003587104248827, 0.3455874161709333, 0.3243767127865784])
+    Assert.eq(dataset_config.weights, [0.32985713680105133, 0.34579676968078343, 0.32434609351816523])
     datasets_ = [dataset_config_.build() for dataset_config_ in dataset_config.datasets]
     Assert.eq([len(dataset) for dataset in datasets_], lengths := [334, 333, 333])
-    Assert.eq([dataset.num_tokens for dataset in datasets_], [14813, 15511, 14559])
+    Assert.eq([dataset.num_tokens for dataset in datasets_], [15562, 16314, 15302])
 
     hf_dataset = datasets.load_from_disk(hf_path)["train"]
     tokenizer = TokenizerConfig(path=TOKENIZER_NAME).get_tokenizer()
@@ -112,14 +112,14 @@ def test_preparator_split():
         "training": {
             "type": "slice",
             "dataset": {"type": "memmap", "path": str(path / "shard_0_0.fast_llm_dataset")},
-            "begin": 0,
+            "begin": 0.0,
             "end": 0.501,
         },
         "validation": {
             "type": "slice",
             "dataset": {"type": "memmap", "path": str(path / "shard_0_0.fast_llm_dataset")},
             "begin": 0.501,
-            "end": 1,
+            "end": 1.0,
         },
     }
     Assert.eq(dataset_config, expected_config)
@@ -140,11 +140,11 @@ def test_preparator_split_sharded():
                 {
                     "type": "slice",
                     "dataset": {"type": "memmap", "path": str(path / "shard_0_1.fast_llm_dataset")},
-                    "begin": 0,
+                    "begin": 0.0,
                     "end": 0.5015015015015015,
                 },
             ],
-            "weights": [0.6602629819478494, 0.3397370180521507],
+            "weights": [0.6596583442838371, 0.3403416557161629],
         },
         "validation": {
             "type": "blended",
@@ -153,11 +153,11 @@ def test_preparator_split_sharded():
                     "type": "slice",
                     "dataset": {"type": "memmap", "path": str(path / "shard_0_1.fast_llm_dataset")},
                     "begin": 0.5015015015015015,
-                    "end": 1,
+                    "end": 1.0,
                 },
                 {"type": "memmap", "path": str(path / "shard_0_2.fast_llm_dataset")},
             ],
-            "weights": [0.3514344262295082, 0.6485655737704918],
+            "weights": [0.35125280875058296, 0.648747191249417],
         },
     }
     Assert.eq(dataset_config, expected_config)
@@ -191,7 +191,7 @@ def test_dataset_preparator_from_hub():
     tokenizer = preparator_config.tokenizer.get_tokenizer()
 
     Assert.eq(len(dataset), len(hf_dataset), 1319)
-    Assert.eq(dataset.num_tokens, 179248)
+    Assert.eq(dataset.num_tokens, 131610)
     for index in range(0, 200, 8):
         Assert.eq(
             tokenizer.detokenize(dataset.get_document(index).tokens),
```
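The updated `test_preparator_sharded` expectations are internally consistent: the per-shard token counts sum to COMMON_DATASET_TOKENS, and each blend weight is a shard's count divided by that total. A quick verification using only numbers taken from the fixtures above:

```python
# Verify the sharded-blend weights against the per-shard token counts.
shard_tokens = [15562, 16314, 15302]
total = sum(shard_tokens)
assert total == 47178  # == COMMON_DATASET_TOKENS

weights = [count / total for count in shard_tokens]
print(weights)
# Expected to match the Assert.eq above:
# [0.32985713680105133, 0.34579676968078343, 0.32434609351816523]
```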
tests/data/test_sampling.py (8 additions & 8 deletions)

```diff
@@ -28,14 +28,14 @@
 
 
 GPT_MEMMAP_SAMPLES = [
-    [49152, 46, 10, 819, 19, 45],
-    [45, 69, 17, 86, 38826, 15],
-    [15, 25, 51, 31, 32348, 64],
-    [64, 17, 93, 78, 40, 1793],
-    [1793, 1, 1746, 38, 27, 58],
-    [58, 22885, 93, 37, 92, 76],
-    [76, 29, 19, 17365, 93, 46],
-    [46, 83, 17211, 1, 785, 1023],
+    [50256, 46, 10, 721, 19, 45],
+    [45, 69, 17, 86, 92, 0],
+    [0, 15, 25, 51, 31, 27],
+    [27, 0, 64, 17, 93, 78],
+    [78, 3955, 43, 1, 1395, 38],
+    [38, 27, 58, 40692, 93, 37],
+    [37, 92, 76, 29, 19, 29499],
+    [29499, 93, 46, 83, 27159, 1],
 ]
 
 
```
tests/data/test_slice.py (13 additions & 13 deletions)

```diff
@@ -12,20 +12,20 @@
 from tests.utils.dataset import get_common_test_dataset
 
 GPT_SLICE_TRAINING_SAMPLES = [
-    [49152, 20, 59, 81, 15, 54],
-    [54, 76, 7909, 44, 41, 1],
-    [1, 71, 28, 10, 42, 15963],
-    [15963, 80, 59, 86, 4, 74],
+    [50256, 20, 59, 81, 15, 54],
+    [54, 76, 1026, 43421, 1, 71],
+    [71, 28, 10, 42, 21016, 80],
+    [80, 59, 86, 4, 74, 45],
 ]
 GPT_SLICE_VALIDATION_SAMPLES = [
-    [49152, 3, 5621, 27, 7859, 13009],
-    [13009, 73, 32, 29, 32, 3],
-    [3, 89, 15, 45, 25, 75],
-    [75, 52, 13366, 88, 54, 19],
-    [19, 2, 74, 23, 92, 24747],
-    [24747, 42, 6, 477, 21, 47],
-    [47, 92, 31, 30, 463, 64],
-    [64, 23, 11, 56, 23555, 85],
+    [50256, 3, 381, 27, 62, 8],
+    [8, 10503, 73, 32, 29, 32],
+    [32, 3, 89, 15, 45, 25],
+    [25, 75, 7340, 40, 88, 54],
+    [54, 19, 2, 74, 23, 92],
+    [92, 65, 85, 42, 6, 304],
+    [304, 21, 47, 92, 31, 30],
+    [30, 8455, 23, 11, 56, 12805],
 ]
 
 
@@ -38,7 +38,7 @@ def test_gpt_slice(data_result_path):
         {"type": "slice", "dataset": memmap_config, "begin": 0.025, "end": 0.1},
         DatasetSliceConfig[LanguageModelDocument],
     ).build()
-    compare_indexed_dataset_tokens(dataset, 75, 3399, {i - 25: sample for i, sample in COMMON_DATASET_SAMPLES.items()})
+    compare_indexed_dataset_tokens(dataset, 75, 3575, {i - 25: sample for i, sample in COMMON_DATASET_SAMPLES.items()})
     sampled = dataset.sample(*get_sampling_config(8, sequence_length=5, preprocessing=preprocessing))
     validate_indexed_dataset_sampling(sampled, GPT_SLICE_VALIDATION_SAMPLES)
```