diff --git a/fast_llm/data/preparation/gpt_memmap/prepare.py b/fast_llm/data/preparation/gpt_memmap/prepare.py
index 70a1e13e8..71aa1199b 100644
--- a/fast_llm/data/preparation/gpt_memmap/prepare.py
+++ b/fast_llm/data/preparation/gpt_memmap/prepare.py
@@ -80,7 +80,7 @@ def _load_dataset(self) -> datasets.Dataset:
         return dataset
 
     def _get_croissant_metadata(self):
-        token = huggingface_hub.HfFolder.get_token()
+        token = huggingface_hub.get_token()
         try:
             # Retrieve the dataset metadata in croissant format
             url = f"https://huggingface.co/api/datasets/{self._config.dataset.path}/croissant"
diff --git a/tests/data/test_blending.py b/tests/data/test_blending.py
index edbe479cc..1407397f6 100644
--- a/tests/data/test_blending.py
+++ b/tests/data/test_blending.py
@@ -31,25 +31,25 @@ def _get_blending_alt(probs: list[float], num_samples: int) -> tuple[np.ndarray,
 
 GPT_BLENDED_SAMPLES = [
-    [49152, 46, 10, 819, 19, 45],
-    [45, 69, 17, 86, 38826, 15],
-    [49152, 83, 80, 20452, 45, 93],
-    [15, 25, 51, 31, 32348, 64],
-    [64, 17, 93, 78, 40, 1793],
-    [1793, 1, 1746, 38, 27, 58],
-    [93, 90, 39, 6, 75, 9],
-    [58, 22885, 93, 37, 92, 76],
+    [50256, 46, 10, 721, 19, 45],
+    [45, 69, 17, 86, 92, 0],
+    [50256, 83, 80, 29, 2, 45],
+    [0, 15, 25, 51, 31, 27],
+    [27, 0, 64, 17, 93, 78],
+    [78, 3955, 43, 1, 1395, 38],
+    [45, 93, 90, 39, 6, 75],
+    [38, 27, 58, 40692, 93, 37],
 ]
 
 GPT_BLENDED_MIXED_SAMPLES = [
-    [49152, 46, 10, 819, 19, 45],
+    [50256, 46, 10, 721, 19, 45],
     [25492, 15877, 37874, 8570, 31649, 15521],
-    [45, 69, 17, 86, 38826, 15],
+    [45, 69, 17, 86, 92, 0],
     [3359, 20945, 33437, 32454, 42084, 45942],
-    [15, 25, 51, 31, 32348, 64],
-    [64, 17, 93, 78, 40, 1793],
+    [0, 15, 25, 51, 31, 27],
+    [27, 0, 64, 17, 93, 78],
     [15112, 36731, 47864, 35586, 33356, 37537],
-    [1793, 1, 1746, 38, 27, 58],
+    [78, 3955, 43, 1, 1395, 38],
 ]
diff --git a/tests/data/test_concatenate.py b/tests/data/test_concatenate.py
index 6774374bb..6a232f028 100644
--- a/tests/data/test_concatenate.py
+++ b/tests/data/test_concatenate.py
@@ -12,14 +12,14 @@
 from tests.utils.dataset import get_common_test_dataset
 
 GPT_CONCATENATED_SAMPLES = [
-    [49152, 46, 10, 819, 19, 45],
-    [45, 69, 17, 86, 38826, 15],
-    [15, 25, 51, 31, 32348, 64],
-    [64, 17, 93, 78, 40, 1793],
-    [1793, 1, 1746, 38, 27, 58],
-    [58, 22885, 93, 37, 92, 76],
-    [76, 29, 19, 17365, 93, 46],
-    [46, 83, 17211, 1, 785, 1023],
+    [50256, 46, 10, 721, 19, 45],
+    [45, 69, 17, 86, 92, 0],
+    [0, 15, 25, 51, 31, 27],
+    [27, 0, 64, 17, 93, 78],
+    [78, 3955, 43, 1, 1395, 38],
+    [38, 27, 58, 40692, 93, 37],
+    [37, 92, 76, 29, 19, 29499],
+    [29499, 93, 46, 83, 27159, 1],
 ]
diff --git a/tests/data/test_dataset_discovery.py b/tests/data/test_dataset_discovery.py
index 0dd9c31a4..cbe635163 100644
--- a/tests/data/test_dataset_discovery.py
+++ b/tests/data/test_dataset_discovery.py
@@ -25,7 +25,7 @@
                 {"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
                 {"type": "memmap", "path": "dataset_1.fast_llm_dataset"},
             ],
-            "weights": [44883, 43910],
+            "weights": [47178, 46208],
         },
     ),
     (
@@ -39,7 +39,7 @@
                 {"type": "memmap", "path": "dataset0/dataset_0.fast_llm_dataset"},
                 {"type": "memmap", "path": "dataset1/dataset_1.fast_llm_dataset"},
             ],
-            "weights": [44883, 43910],
+            "weights": [47178, 46208],
         },
     ),
     (
@@ -59,7 +59,7 @@
                 {"type": "memmap", "path": "dataset/dataset_1.fast_llm_dataset"},
                 {"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
             ],
-            "weights": [43910, 44883],
+            "weights": [46208, 47178],
         },
     ),
     (
@@ -78,10 +78,10 @@
                     {"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
                     {"type": "memmap", "path": "dataset_1.fast_llm_dataset"},
                 ],
-                "weights": [44883, 43910],
+                "weights": [47178, 46208],
             },
         ],
-        "weights": [44883, 88793],
+        "weights": [47178, 93386],
     },
 ),
 (
@@ -99,11 +99,11 @@
                     {"type": "memmap", "path": "dataset/dataset_1.fast_llm_dataset"},
                     {"type": "memmap", "path": "dataset/dataset_2.fast_llm_dataset"},
                 ],
-                "weights": [43910, 44883],
+                "weights": [46208, 47178],
             },
             {"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
         ],
-        "weights": [88793, 44883],
+        "weights": [93386, 47178],
     },
 ),
 (
@@ -130,12 +130,12 @@
                     {"type": "memmap", "path": "dataset1/dataset3/dataset_2.fast_llm_dataset"},
                     {"type": "memmap", "path": "dataset1/dataset_1.fast_llm_dataset"},
                 ],
-                "weights": [44883, 43910],
+                "weights": [47178, 46208],
             },
             {"type": "memmap", "path": "dataset2/dataset_3.fast_llm_dataset"},
             {"type": "memmap", "path": "dataset_0.fast_llm_dataset"},
         ],
-        "weights": [88793, 43910, 44883],
+        "weights": [93386, 46208, 47178],
     },
 ),
 ),
diff --git a/tests/data/test_fim.py b/tests/data/test_fim.py
index 25e42fb97..884be4554 100644
--- a/tests/data/test_fim.py
+++ b/tests/data/test_fim.py
@@ -9,14 +9,14 @@
 from tests.utils.global_variables import TOKENIZER_PATH
 
 GPT_FIM_SAMPLES = [
-    [46, 10, 819, 19, 45, 88],
-    [45, 69, 17, 86, 38826, 15],
-    [86, 89, 32348, 64, 49152, 87],
-    [64, 17, 93, 78, 40, 1793],
-    [1793, 1, 1746, 38, 27, 58],
-    [86, 89, 37, 92, 76, 49152],
-    [86, 49152, 76, 29, 19, 89],
-    [86, 49152, 46, 83, 17211, 1],
+    [46, 10, 721, 19, 45, 88],
+    [45, 69, 17, 86, 92, 0],
+    [86, 89, 31, 27, 50256, 87],
+    [27, 0, 64, 17, 93, 78],
+    [78, 3955, 43, 1, 1395, 38],
+    [86, 89, 55, 93, 37, 50256],
+    [86, 50256, 37, 92, 76, 89],
+    [86, 89, 1, 50256, 87, 50256],
 ]
diff --git a/tests/data/test_image_patch.py b/tests/data/test_image_patch.py
index 9d613c2ec..34cb4f32f 100644
--- a/tests/data/test_image_patch.py
+++ b/tests/data/test_image_patch.py
@@ -14,7 +14,7 @@
 from tests.data.test_preparator import COMMON_DATASET_LENGTH, COMMON_DATASET_TEXT
 from tests.utils.dataset import get_test_dataset_with_image_patches
 
-DATASET_WITH_IMAGE_PATCHES_TOKENS = [55750, 56809, 59145, 59145]
+DATASET_WITH_IMAGE_PATCHES_TOKENS = [58021, 59080, 61416, 61416]
 DATASET_WITH_IMAGE_PATCHES_IMAGE_MD5 = {
     27: [],
     30: ["a2c34e404506fe664efcdb520642f260"],
@@ -37,11 +37,11 @@
     87: [(17, 4), (15, 12)],
 }
 DATASET_WITH_IMAGE_PATCHES_SAMPLES = {
-    27: [49152, 63, 82, 11, 27799, 49152],
-    30: [49152, 31, 2327, (4, 1), 27, 1448, 62, 43, 49152],
-    31: [49152, 60, 55, (2, 4), 80, 30, (3, 4), 85, 22, 18, 49152],
-    77: [49152, 13736, 85, 52, 22, 46, 5, 11807, 49152],
-    87: [49152, 52, (4, 1), 89, (4, 3), 75, 11, 71, 49152],
+    27: [50256, 63, 82, 11, 7456, 50256],
+    30: [50256, 31, 13038, (4, 1), 27, 8220, 62, 43, 50256],
+    31: [50256, 60, 55, (2, 4), 80, 30, (3, 4), 85, 4790, 50256],
+    77: [50256, 73, 44179, 52, 22, 46, 5, 8226, 50256],
+    87: [50256, 52, (4, 1), 89, (4, 3), 75, 11, 71, 50256],
 }
diff --git a/tests/data/test_loss_masking_spans.py b/tests/data/test_loss_masking_spans.py
index f0a35e9b8..efec8395c 100644
--- a/tests/data/test_loss_masking_spans.py
+++ b/tests/data/test_loss_masking_spans.py
@@ -11,13 +11,13 @@
 from tests.utils.dataset import get_test_dataset_with_loss_masking_spans
 from tests.utils.global_variables import TOKENIZER_NAME
 
-DATASET_WITH_SPAN_TOKENS = 45577
+DATASET_WITH_SPAN_TOKENS = 47782
 DATASET_WITH_SPAN_SAMPLES = {
-    27: [49152, 63, 82, 11, 27799, 49152],
-    30: [49152, 31, 85, 78, 27, 1448, 62, 43, 49152],
-    31: [49152, 60, 55, 80, 30, 85, 22, 18, 49152],
-    77: [49152, 73, 80, 85, 52, 22, 46, 5, 88, 78, 49152],
-    87: [49152, 52, 42536, 11, 71, 49152],
+    27: [50256, 63, 82, 11, 7456, 50256],
+    30: [50256, 31, 85, 78, 27, 8220, 62, 43, 50256],
+    31: [50256, 60, 55, 80, 30, 85, 22, 18, 50256],
+    77: [50256, 73, 80, 85, 52, 22, 46, 5, 88, 78, 50256],
+    87: [50256, 52, 48274, 11, 71, 50256],
 }
 HF_LOSS_MASKING_SPANS = {
     27: [[0, 1]],
diff --git a/tests/data/test_preference_spans.py b/tests/data/test_preference_spans.py
index 36f8f77af..d3d46a1de 100644
--- a/tests/data/test_preference_spans.py
+++ b/tests/data/test_preference_spans.py
@@ -22,17 +22,17 @@
     87: ["Uz", "l", ",h"],
 }
 DATASET_WITH_PREFERENCE_SPAN_SAMPLES = {
-    27: [49152, 63, 82, 11, 49152, 49152, 63, 27799, 49152],
-    30: [49152, 31, 85, 78, 27, 34, 49152, 49152, 31, 85, 46, 62, 43, 49152],
-    31: [49152, 60, 55, 80, 30, 85, 49152, 49152, 60, 55, 80, 30, 22, 18, 49152],
-    77: [49152, 73, 80, 85, 52, 22, 46, 49152, 49152, 73, 5, 11807, 49152],
-    87: [49152, 52, 89, 75, 49152, 49152, 52, 89, 11, 71, 49152],
+    27: [50256, 63, 82, 11, 50256, 50256, 63, 7456, 50256],
+    30: [50256, 31, 85, 78, 27, 34, 50256, 50256, 31, 85, 46, 62, 43, 50256],
+    31: [50256, 60, 55, 80, 30, 85, 50256, 50256, 60, 55, 80, 30, 4790, 50256],
+    77: [50256, 73, 44179, 52, 22, 46, 50256, 50256, 73, 5, 8226, 50256],
+    87: [50256, 52, 89, 75, 50256, 50256, 52, 89, 11, 71, 50256],
 }
 TOKEN_PREFERENCE_SPANS = {
     27: [(2, 5), (7, 9)],
     30: [(3, 7), (10, 14)],
-    31: [(5, 7), (12, 15)],
-    77: [(2, 8), (10, 13)],
+    31: [(5, 7), (12, 14)],
+    77: [(2, 7), (9, 12)],
     87: [(3, 5), (8, 11)],
 }
diff --git a/tests/data/test_preparator.py b/tests/data/test_preparator.py
index 763517cde..79db01b55 100644
--- a/tests/data/test_preparator.py
+++ b/tests/data/test_preparator.py
@@ -21,7 +21,7 @@
 from tests.utils.global_variables import DATASET_CACHE, TOKENIZER_NAME
 
 COMMON_DATASET_LENGTH = 1000
-COMMON_DATASET_TOKENS = 44883
+COMMON_DATASET_TOKENS = 47178
 COMMON_DATASET_TEXT = {
     27: "`s,uh",
     30: "@vo
Tokenizer:
 @pytest.mark.parametrize(
     ("spans", "expected_token_spans", "expected_tokens"),
     (
-        ([], [], [7196, 5297]),  # No span
-        ([(1, 3)], [(1, 2)], [71, 325, 303, 5297]),  # Simple span
-        ([(2, 2)], [(1, 1)], [284, 47443, 5297]),  # Empty span
-        ([(0, 11)], [(0, 2)], [7196, 5297]),  # Full span
-        ([(1, 4), (6, 7)], [(1, 2), (4, 5)], [71, 1498, 78, 207, 86, 2231]),  # Two spans
-        ([(1, 6), (4, 7)], [(1, 4), (2, 5)], [71, 1498, 78, 207, 86, 2231]),  # Overlapping spans
-        ([(1, 7), (4, 6)], [(1, 5), (2, 4)], [71, 1498, 78, 207, 86, 2231]),  # Nested spans
-        ([(1, 5), (5, 7)], [(1, 3), (3, 4)], [71, 325, 303, 365, 2231]),  # Consecutive spans
-        ([(2, 4), (2, 4)], [(1, 2), (1, 2)], [284, 683, 78, 5297]),  # Duplicate spans
-        ([(2, 3), (5, 8), (9, 11)], [(1, 2), (3, 4), (5, 6)], [284, 75, 303, 48485, 81, 1382]),  # Three spans
+        ([], [], [31373, 995]),  # No span
+        ([(1, 3)], [(1, 2)], [71, 417, 5439, 995]),  # Simple span
+        ([(2, 2)], [(1, 1)], [258, 18798, 995]),  # Empty span
+        ([(0, 11)], [(0, 2)], [31373, 995]),  # Full span
+        ([(1, 4), (6, 7)], [(1, 2), (4, 5)], [71, 695, 78, 220, 86, 1764]),  # Two spans
+        ([(1, 6), (4, 7)], [(1, 4), (2, 5)], [71, 695, 78, 220, 86, 1764]),  # Overlapping spans
+        ([(1, 7), (4, 6)], [(1, 5), (2, 4)], [71, 695, 78, 220, 86, 1764]),  # Nested spans
+        ([(1, 5), (5, 7)], [(1, 2), (2, 3)], [71, 11109, 266, 1764]),  # Consecutive spans
+        ([(2, 4), (2, 4)], [(1, 2), (1, 2)], [258, 297, 78, 995]),  # Duplicate spans
+        ([(2, 3), (5, 8), (9, 11)], [(1, 2), (3, 4), (5, 6)], [258, 75, 5439, 24486, 81, 335]),  # Three spans
     ),
 )
 def test_tokenize_with_spans(common_tokenizer, spans, expected_token_spans, expected_tokens, extra_tokens):
@@ -79,14 +79,13 @@ def test_validate_chat_template_with_markers(common_tokenizer):
     ("messages", "expected_tokens", "expected_loss_masking_spans"),
     (
         # Single turn: full assistant turn (Hello) is trainable
-        # 15 tokens, trainable indices 7-13, loss mask spans cover 0-6 and 14
+        # 17 tokens, loss mask spans cover 0-7 and 16
         (
             [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello"}],
-            [49152, 27, 789, 29, 16946, 750, 789, 2293, 17822, 29, 7371, 750, 17822, 29, 49152],
-            [(0, 7), (14, 15)],
+            [50256, 27, 7220, 29, 17250, 3556, 7220, 6927, 562, 10167, 29, 15496, 3556, 562, 10167, 29, 50256],
+            [(0, 7), (16, 17)],
         ),
         # Multi-turn: both assistant turns are fully trainable
-        # 27 tokens, trainable indices 7-13 and 19-25
         (
             [
                 {"role": "user", "content": "A"},
@@ -95,38 +94,41 @@
                 {"role": "assistant", "content": "D"},
             ],
             [
-                49152,
+                50256,
                 27,
-                789,
+                7220,
                 29,
                 32,
-                750,
-                789,
-                2293,
-                17822,
+                3556,
+                7220,
+                6927,
+                562,
+                10167,
                 29,
                 33,
-                750,
-                17822,
-                2293,
-                789,
+                3556,
+                562,
+                10167,
+                6927,
+                7220,
                 29,
                 34,
-                750,
-                789,
-                2293,
-                17822,
+                3556,
+                7220,
+                6927,
+                562,
+                10167,
                 29,
                 35,
-                750,
-                17822,
+                3556,
+                562,
+                10167,
                 29,
-                49152,
+                50256,
             ],
-            [(0, 7), (14, 19), (26, 27)],
+            [(0, 7), (16, 21), (30, 31)],
         ),
         # System + user + assistant: full assistant turn trainable
-        # 23 tokens, trainable indices 15-21
         (
             [
                 {"role": "system", "content": "You are helpful."},
                 {"role": "user", "content": "Hi"},
                 {"role": "assistant", "content": "Hello"},
             ],
             [
-                49152,
+                50256,
                 27,
-                3144,
+                10057,
                 29,
-                5815,
-                1139,
-                44569,
-                6928,
-                3144,
-                2293,
-                789,
+                1639,
+                389,
+                7613,
+                25970,
+                10057,
+                6927,
+                7220,
                 29,
-                16946,
-                750,
-                789,
-                2293,
-                17822,
+                17250,
+                3556,
+                7220,
+                6927,
+                562,
+                10167,
                 29,
-                7371,
-                750,
-                17822,
+                15496,
+                3556,
+                562,
+                10167,
                 29,
-                49152,
+                50256,
             ],
-            [(0, 15), (22, 23)],
+            [(0, 15), (24, 25)],
         ),
         # User only: no trainable tokens
-        # 9 tokens, no trainable indices
         (
             [{"role": "user", "content": "Hi"}],
-            [49152, 27, 789, 29, 16946, 750, 789, 29, 49152],
+            [50256, 27, 7220, 29, 17250, 3556, 7220, 29, 50256],
             [(0, 9)],
         ),
-        # Long multi-turn (85 tokens, 3 assistant responses with tags, tests span machinery)
-        # Trainable: indices 27-40, 49-62, 70-83
+        # Long multi-turn (3 assistant responses with tags, tests span machinery)
         (
             [
                 {"role": "system", "content": "You are a helpful assistant that answers questions."},
@@ -180,93 +182,92 @@
                 {"role": "user", "content": "What is the capital of France?"},
                 {"role": "assistant", "content": "The capital of France is Paris."},
                 {"role": "user", "content": "What about Germany?"},
                 {"role": "assistant", "content": "The capital of Germany is Berlin."},
                 {"role": "user", "content": "And Italy?"},
                 {"role": "assistant", "content": "The capital of Italy is Rome."},
             ],
             [
-                49152,
+                50256,
                 27,
-                3144,
+                10057,
                 29,
-                5815,
-                1139,
-                373,
-                44569,
-                2424,
-                11886,
-                954,
-                15737,
-                14516,
-                6928,
-                3144,
-                2293,
-                789,
+                1639,
+                389,
+                257,
+                7613,
+                8796,
+                326,
+                7429,
+                2683,
+                25970,
+                10057,
+                6927,
+                7220,
                 29,
-                13938,
-                438,
-                331,
-                25016,
-                457,
-                12409,
+                2061,
+                318,
+                262,
+                3139,
+                286,
+                4881,
+                30,
+                3556,
+                7220,
+                6927,
                 562,
-                35838,
-                789,
-                2293,
-                17822,
+                10167,
                 29,
-                2111,
-                25016,
-                457,
-                12409,
+                464,
+                3139,
+                286,
+                4881,
+                318,
+                6342,
+                25970,
                 562,
-                438,
-                4235,
-                280,
-                6928,
-                17822,
-                2293,
-                789,
+                10167,
+                6927,
+                7220,
                 29,
-                13938,
-                5028,
-                759,
-                42226,
-                35838,
-                789,
-                2293,
-                17822,
-                29,
-                2111,
-                25016,
-                457,
-                759,
-                42226,
-                438,
-                29784,
+                2061,
+                546,
+                4486,
+                30,
                 3556,
-                6928,
-                17822,
-                2293,
-                789,
+                7220,
+                6927,
+                562,
+                10167,
                 29,
-                1996,
-                4413,
-                3326,
-                35838,
-                789,
-                2293,
-                17822,
+                464,
+                3139,
+                286,
+                4486,
+                318,
+                11307,
+                25970,
+                562,
+                10167,
+                6927,
+                7220,
                 29,
-                2111,
-                25016,
-                457,
-                4413,
-                3326,
-                438,
-                613,
-                1361,
-                6928,
-                17822,
+                1870,
+                8031,
+                30,
+                3556,
+                7220,
+                6927,
+                562,
+                10167,
+                29,
+                464,
+                3139,
+                286,
+                8031,
+                318,
+                10598,
+                25970,
+                562,
+                10167,
                 29,
-                49152,
+                50256,
             ],
-            [(0, 27), (41, 49), (63, 70), (84, 85)],
+            [(0, 26), (40, 48), (62, 69), (83, 84)],
         ),
     ),
 )
diff --git a/tests/layers/test_ssm.py b/tests/layers/test_ssm.py
index 9c31ec80f..fba0b4265 100644
--- a/tests/layers/test_ssm.py
+++ b/tests/layers/test_ssm.py
@@ -146,7 +146,7 @@ def test_gdn(testing_device, use_backup, monkeypatch):
     "use_backup",
     [
         pytest.param(False, marks=pytest.mark.skipif(not _kda_available, reason="KDA fused kernels not available")),
-        True,
+        pytest.param(True, marks=pytest.mark.skipif(not _kda_available, reason="KDA fla package not available")),
     ],
     ids=["fast", "backup"],
 )
diff --git a/tests/utils/dataset.py b/tests/utils/dataset.py
index ce68e3f98..a2ea2f46e 100644
--- a/tests/utils/dataset.py
+++ b/tests/utils/dataset.py
@@ -18,7 +18,7 @@ def download_santacoder_tokenizer():
     if not TOKENIZER_FILE.is_file():
         import transformers
 
-        transformers.AutoTokenizer.from_pretrained("bigcode/santacoder").save_pretrained(TOKENIZER_PATH)
+        transformers.AutoTokenizer.from_pretrained("gpt2").save_pretrained(TOKENIZER_PATH)
 
 
 def get_random_text(
diff --git a/tests/utils/global_variables.py b/tests/utils/global_variables.py
index 20a0c7219..25de18072 100644
--- a/tests/utils/global_variables.py
+++ b/tests/utils/global_variables.py
@@ -35,7 +35,7 @@ def set_testing_global_variables():
 
 # TODO: Fixtures
 TOKENIZER_PATH = SHARED_RESULT_PATH / "tokenizer"
 TOKENIZER_FILE = TOKENIZER_PATH / "tokenizer.json"
-TOKENIZER_NAME = "bigcode/santacoder"
+TOKENIZER_NAME = "gpt2"
 DATASET_CACHE = SHARED_RESULT_PATH / "dataset"
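
Note on the prepare.py hunk: huggingface_hub.HfFolder.get_token() is deprecated, and the module-level huggingface_hub.get_token() is its replacement. A minimal sketch of the new call in isolation (the surrounding croissant-metadata request is only partially shown in the diff and is not reproduced here):

import huggingface_hub

# Replacement for the deprecated huggingface_hub.HfFolder.get_token():
# returns the token from the HF_TOKEN environment variable or the local
# token cache, or None when the user is not logged in.
token = huggingface_hub.get_token()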
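
The remaining hunks are all fixture updates that follow from swapping the test tokenizer from bigcode/santacoder to gpt2 (see tests/utils/dataset.py and tests/utils/global_variables.py above): every expected token id, token count, weight, and span offset is re-derived under the GPT-2 vocabulary, whose <|endoftext|> id is 50256 rather than the 49152 marker used previously. A rough sketch of how such values can be spot-checked; the string "hello world" is inferred from the no-span fixture ([31373, 995]), and this snippet is an illustration, not the fixture-generation code used by the repository:

import transformers

# Assumes the "gpt2" tokenizer can be fetched from the Hugging Face Hub.
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")

# 50256 is the <|endoftext|> id that appears as the BOS/EOS marker in the fixtures above.
print(tokenizer.eos_token_id)  # 50256

# Matches the "No span" entry in the tokenizer span tests.
print(tokenizer("hello world")["input_ids"])  # [31373, 995]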