From 53b077a4d81cfe38ed02d6e1116bc4e4cc6da486 Mon Sep 17 00:00:00 2001 From: Ronay Ak Date: Wed, 13 May 2026 19:33:29 +0000 Subject: [PATCH 01/25] add support for data_dir_list in [num_samples, path] form Signed-off-by: Ronay Ak Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- .../datasets/llm/retrieval_dataset.py | 145 +++++++++++++++--- nemo_automodel/recipes/base_recipe.py | 34 +++- .../datasets/llm/test_retrieval_dataset.py | 96 ++++++++++++ tests/unit_tests/recipes/test_base_recipe.py | 19 +++ 4 files changed, 266 insertions(+), 28 deletions(-) diff --git a/nemo_automodel/components/datasets/llm/retrieval_dataset.py b/nemo_automodel/components/datasets/llm/retrieval_dataset.py index 060cdbf9b5..cea5f78b03 100644 --- a/nemo_automodel/components/datasets/llm/retrieval_dataset.py +++ b/nemo_automodel/components/datasets/llm/retrieval_dataset.py @@ -15,16 +15,19 @@ import json import logging import os +import random from abc import ABC, abstractmethod from copy import deepcopy from dataclasses import dataclass -from typing import List, Optional, Union +from typing import Any, List, Optional, Tuple, Union from datasets import Dataset, concatenate_datasets, load_dataset from huggingface_hub import HfApi, hf_hub_download EXAMPLE_TEMPLATE = {"text": "", "image": "", "nr_ocr": ""} +_OVERSAMPLING_WARNED_CORPORA: set[str] = set() + class AbstractDataset(ABC): @abstractmethod @@ -134,7 +137,8 @@ def load_corpus_metadata(path: str): if not os.path.isfile(path_metadata): raise ValueError("Metadata File for Corpus does not exist: " + path_metadata) - metadata = json.load(open(path_metadata, "r")) + with open(path_metadata, "r") as f: + metadata = json.load(f) return metadata @@ -170,21 +174,89 @@ def add_corpus(qa_corpus_paths: Union[dict, list], corpus_dict: dict): corpus_dict[corpus_id] = CorpusInfo(corpus_metadata, corpus) -def load_datasets(data_dir_list: Union[List[str], str], concatenate: bool = True): +def _parse_data_entry(entry: 
Union[str, Tuple[Optional[int], str], List[Any]]) -> Tuple[Optional[int], str]: + """ + Parse a data entry. + + Supported forms: + - "path_or_hf_uri": use all samples + - [num_samples, "path_or_hf_uri"]: sample num_samples once from that source + """ + if isinstance(entry, str): + return None, entry + + if isinstance(entry, (list, tuple)) and len(entry) == 2: + num_samples, path = entry + if num_samples is not None: + if isinstance(num_samples, bool) or not isinstance(num_samples, int): + raise ValueError(f"num_samples must be an integer or None, got {type(num_samples)}") + if num_samples < 0: + raise ValueError(f"num_samples must be non-negative, got {num_samples}") + if not isinstance(path, str): + raise ValueError(f"path must be a string, got {type(path)}") + return num_samples, path + + raise ValueError(f"Invalid data entry format: {entry}. Expected a string path or [num_samples, path]") + + +def _normalize_data_entries( + data_dir_list: Union[List[Union[str, Tuple[Optional[int], str], List[Any]]], Tuple[Optional[int], str], str], +) -> List[Tuple[Optional[int], str]]: + """Normalize a single source or list of sources into parsed entries.""" + if isinstance(data_dir_list, str): + entries = [data_dir_list] + elif isinstance(data_dir_list, tuple): + entries = [data_dir_list] + elif isinstance(data_dir_list, list): + if len(data_dir_list) == 2 and (data_dir_list[0] is None or isinstance(data_dir_list[0], int)): + entries = [data_dir_list] + else: + entries = data_dir_list + else: + raise ValueError( + f"Invalid data_dir_list format: {data_dir_list}. 
Expected a string path, [num_samples, path], " + "or a list of those entries" + ) + + return [_parse_data_entry(entry) for entry in entries] + + +def _sample_data_items(data_items: List[dict], num_samples: Optional[int], source: str, seed: int) -> List[dict]: + if num_samples is None: + return data_items + if num_samples >= len(data_items): + logging.warning( + f"Requested {num_samples} samples but source {source} only has {len(data_items)} examples. Using all." + ) + return data_items + + rng = random.Random(seed) + sampled_items = rng.sample(data_items, num_samples) + logging.info(f"Randomly sampled {num_samples} examples from {source} (total: {len(data_items)})") + return sampled_items + + +def load_datasets( + data_dir_list: Union[List[Union[str, Tuple[Optional[int], str], List[Any]]], Tuple[Optional[int], str], str], + concatenate: bool = True, + seed: int = 42, +): """ Load datasets from JSON files. - Copied from nemo-retriever-research/src/data/datasets.py + Entries can be strings (use all samples) or [num_samples, path] pairs + (sample a fixed subset once while loading). 
Returns: Tuple of (dataset, corpus_dict) """ REQUIRED_FIELDS = ["question_id", "question", "corpus_id", "pos_doc", "neg_doc"] - if not isinstance(data_dir_list, list): - data_dir_list = [data_dir_list] + data_entries = _normalize_data_entries(data_dir_list) + if not data_entries: + raise ValueError("data_dir_list must contain at least one source") corpus_dict = {} datasets = [] - for data_dir in data_dir_list: + for num_samples, data_dir in data_entries: with open(data_dir, "r") as f: train_data = json.load(f) qa_corpus_paths = train_data["corpus"] @@ -201,9 +273,11 @@ def load_datasets(data_dir_list: Union[List[str], str], concatenate: bool = True add_corpus(qa_corpus_paths, corpus_dict) + data_items = _sample_data_items(train_data["data"], num_samples, data_dir, seed) + # Extract only the required fields for training, ignoring extra fields normalized_data = [] - for item in train_data["data"]: + for item in data_items: # Extract only the essential fields we need missing = [f for f in REQUIRED_FIELDS if f not in item] if missing: @@ -386,19 +460,20 @@ def _load_hf_subset(repo_id: str, subset: str): return normalized_data, corpus_info -def _load_hf_sources(hf_uris: List[str]): +def _load_hf_sources(hf_entries: List[Tuple[Optional[int], str]], seed: int = 42): """Load one or more ``hf://`` URIs and return ``(Dataset, corpus_dict)``.""" hf_data: List[dict] = [] corpus_dict: dict = {} - for uri in hf_uris: + for num_samples, uri in hf_entries: repo_id, subset = _parse_hf_uri(uri) subsets = [subset] if subset is not None else _list_hf_subsets(repo_id) + source_data: List[dict] = [] for sub in subsets: logging.info(f"Loading HF subset: {repo_id}/{sub}") data_list, corpus_info = _load_hf_subset(repo_id, sub) - hf_data.extend(data_list) + source_data.extend(data_list) if corpus_info.corpus_id in corpus_dict: existing = corpus_dict[corpus_info.corpus_id] if existing.path != corpus_info.path: @@ -409,6 +484,8 @@ def _load_hf_sources(hf_uris: List[str]): else: 
corpus_dict[corpus_info.corpus_id] = corpus_info + hf_data.extend(_sample_data_items(source_data, num_samples, uri, seed)) + return Dataset.from_list(hf_data), corpus_dict @@ -454,6 +531,18 @@ def _transform_func(examples, num_neg_docs, corpus_dict, use_dataset_instruction f"neg_doc is empty for example {i_example} but {num_neg_docs} negative(s) requested " f"(n_passages > 1). Provide negatives." ) + cur_corpus_id = corpus_ids[i_example] + if ( + num_neg_docs > 0 + and len(negatives) < num_neg_docs + and cur_corpus_id not in _OVERSAMPLING_WARNED_CORPORA + ): + _OVERSAMPLING_WARNED_CORPORA.add(cur_corpus_id) + logging.warning( + f"corpus_id={cur_corpus_id}: a sample has only {len(negatives)} negatives " + f"(< num_neg_docs={num_neg_docs}). Oversampling will repeat negatives. " + "This warning is logged once per corpus." + ) if num_neg_docs > 0: neg_ids = [i for i in range(len(negatives))] cur_neg_ids = [neg_ids[idx % len(neg_ids)] for idx in range(num_neg_docs)] @@ -472,6 +561,11 @@ def _transform_func(examples, num_neg_docs, corpus_dict, use_dataset_instruction cur_pos_neg_text = [] cur_pos_neg_image = [] cur_corpus_id = corpus_ids[idx_doc] + if cur_corpus_id not in corpus_dict: + raise ValueError( + f"Unknown corpus_id '{cur_corpus_id}' in retrieval example. " + f"Available corpus ids: {sorted(corpus_dict.keys())}" + ) for doc in docs: cur_id = doc["id"] @@ -558,7 +652,7 @@ def transform(examples): def make_retrieval_dataset( - data_dir_list: Union[List[str], str] = None, + data_dir_list: Union[List[Union[str, Tuple[Optional[int], str], List[Any]]], Tuple[Optional[int], str], str] = None, model_type: str = "bi_encoder", data_type: str = "train", n_passages: int = 5, @@ -574,11 +668,13 @@ def make_retrieval_dataset( Entries in *data_dir_list* can be local JSON file paths **or** ``hf://`` URIs pointing to a HuggingFace dataset repository (e.g. - ``hf://nvidia/embed-nemotron-dataset-v1/SciFact``). 
Uses ``set_transform()`` - for lazy evaluation — tokenization is handled by the collator. + ``hf://nvidia/embed-nemotron-dataset-v1/SciFact``). A source can also be + provided as ``[num_samples, path_or_uri]`` to sample a fixed subset once + while loading. Uses ``set_transform()`` for lazy evaluation — tokenization + is handled by the collator. Args: - data_dir_list: Path(s) to JSON file(s) or ``hf://`` URIs. + data_dir_list: Path(s) to JSON file(s), ``hf://`` URIs, or [num_samples, source] entries. model_type: "bi_encoder" (default) or "cross_encoder" data_type: Type of data ("train" or "eval") n_passages: Number of passages (1 positive + n-1 negatives) @@ -612,24 +708,25 @@ def make_retrieval_dataset( if data_dir_list is None: raise ValueError("data_dir_list is required") - if not isinstance(data_dir_list, list): - data_dir_list = [data_dir_list] + data_entries = _normalize_data_entries(data_dir_list) + if not data_entries: + raise ValueError("data_dir_list must contain at least one source") - hf_uris = [p for p in data_dir_list if p.startswith(_HF_PREFIX)] - local_paths = [p for p in data_dir_list if not p.startswith(_HF_PREFIX)] + hf_entries = [(num_samples, path) for num_samples, path in data_entries if path.startswith(_HF_PREFIX)] + local_entries = [(num_samples, path) for num_samples, path in data_entries if not path.startswith(_HF_PREFIX)] - logging.info(f"Loading data from {len(data_dir_list)} source(s) ({len(hf_uris)} HF, {len(local_paths)} local)") + logging.info(f"Loading data from {len(data_entries)} source(s) ({len(hf_entries)} HF, {len(local_entries)} local)") datasets_list = [] corpus_dict: dict = {} - if hf_uris: - hf_dataset, hf_corpus = _load_hf_sources(hf_uris) + if hf_entries: + hf_dataset, hf_corpus = _load_hf_sources(hf_entries, seed=seed) datasets_list.append(hf_dataset) corpus_dict.update(hf_corpus) - if local_paths: - local_dataset, local_corpus = load_datasets(local_paths, concatenate=True) + if local_entries: + local_dataset, 
local_corpus = load_datasets(local_entries, concatenate=True, seed=seed) datasets_list.append(local_dataset) for cid, cinfo in local_corpus.items(): if cid in corpus_dict and corpus_dict[cid].path != cinfo.path: diff --git a/nemo_automodel/recipes/base_recipe.py b/nemo_automodel/recipes/base_recipe.py index 71e85cb18e..bd13768797 100644 --- a/nemo_automodel/recipes/base_recipe.py +++ b/nemo_automodel/recipes/base_recipe.py @@ -182,6 +182,21 @@ def _format_missing_checkpoint_dir_error(checkpoint_dir: str, restore_from: str, return "\n".join(error_msg) +def _format_checkpoint_load_error( + checkpoint_dir: str, ckpt_dir: str, restore_from: str | None, original_error: Exception +) -> str: + """Format a helpful message when a checkpoint exists but cannot be loaded.""" + return "\n".join( + [ + "Failed to load an auto-detected checkpoint from the current checkpoint.checkpoint_dir.", + f"Checkpoint: {ckpt_dir}", + "To start a fresh run, use a different checkpoint.checkpoint_dir or remove the existing checkpoint.", + "To resume, make sure the current command matches the saved run.", + f"Original error: {type(original_error).__name__}: {original_error}", + ] + ) + + def _is_rank_0() -> bool: """True if distributed is not initialized or this process is rank 0. 
TODO(@akoumpa): deprecate in favor of deviemesh api @@ -561,10 +576,21 @@ def load_checkpoint(self, restore_from: str | None = None): if is_rank_0: print(f"Loading checkpoint from {ckpt_dir}", flush=True) - model, optimizer, scheduler = self._load_checkpoint_tracked_state(ckpt_dir) - - self.checkpointer.load_model(model, os.path.join(ckpt_dir, "model")) - self.checkpointer.load_optimizer(optimizer, model, ckpt_dir, scheduler) + try: + model, optimizer, scheduler = self._load_checkpoint_tracked_state(ckpt_dir) + self.checkpointer.load_model(model, os.path.join(ckpt_dir, "model")) + self.checkpointer.load_optimizer(optimizer, model, ckpt_dir, scheduler) + except Exception as e: + if restore_from: + raise + raise RuntimeError( + _format_checkpoint_load_error( + checkpoint_dir=self.checkpointer.config.checkpoint_dir, + ckpt_dir=ckpt_dir, + restore_from=restore_from, + original_error=e, + ) + ) from e def _log_experiment_details(self): """Log metadata and config on main rank using YAML markers.""" diff --git a/tests/unit_tests/datasets/llm/test_retrieval_dataset.py b/tests/unit_tests/datasets/llm/test_retrieval_dataset.py index d4425ad477..49a9e95795 100644 --- a/tests/unit_tests/datasets/llm/test_retrieval_dataset.py +++ b/tests/unit_tests/datasets/llm/test_retrieval_dataset.py @@ -387,6 +387,102 @@ def test_load_datasets_type_coercion_and_concatenate_false(tmp_path, monkeypatch assert "C" in corpus_dict +def test_parse_data_entry(): + assert rd._parse_data_entry("/tmp/data.json") == (None, "/tmp/data.json") + assert rd._parse_data_entry([3, "/tmp/data.json"]) == (3, "/tmp/data.json") + assert rd._parse_data_entry((3, "/tmp/data.json")) == (3, "/tmp/data.json") + + with pytest.raises(ValueError, match="num_samples must be non-negative"): + rd._parse_data_entry([-1, "/tmp/data.json"]) + with pytest.raises(ValueError, match="num_samples must be an integer"): + rd._parse_data_entry(["3", "/tmp/data.json"]) + with pytest.raises(ValueError, match="path must be a string"): + 
rd._parse_data_entry([3, 4]) + + +def test_load_datasets_samples_single_top_level_entry_once(tmp_path, monkeypatch): + corpus_dir = tmp_path / "corpus_sample_single" + corpus_dir.mkdir() + (corpus_dir / "merlin_metadata.json").write_text(json.dumps({"class": "TextQADataset", "corpus_id": "S"})) + monkeypatch.setattr( + rd, + "load_dataset", + _mock_hf_load_dataset_returning( + [{"id": "p", "text": "P"}, {"id": "n1", "text": "N1"}, {"id": "n2", "text": "N2"}] + ), + ) + + train_file = _make_train_file(tmp_path, corpus_dir, data_len=5, corpus_id="S") + + dataset_a, _ = rd.load_datasets([2, str(train_file)], seed=7) + dataset_b, _ = rd.load_datasets([2, str(train_file)], seed=7) + dataset_c, _ = rd.load_datasets([2, str(train_file)], seed=8) + + assert len(dataset_a) == 2 + assert dataset_a["question_id"] == dataset_b["question_id"] + assert dataset_a["question_id"] != dataset_c["question_id"] + + +def test_make_retrieval_dataset_mixed_sampled_and_full_entries(tmp_path, monkeypatch): + corpus_dir = tmp_path / "corpus_mixed" + corpus_dir.mkdir() + (corpus_dir / "merlin_metadata.json").write_text(json.dumps({"class": "TextQADataset", "corpus_id": "M"})) + monkeypatch.setattr( + rd, + "load_dataset", + _mock_hf_load_dataset_returning( + [{"id": "p", "text": "P"}, {"id": "n1", "text": "N1"}, {"id": "n2", "text": "N2"}] + ), + ) + + sampled_file = tmp_path / "sampled.json" + sampled_file.write_text( + json.dumps( + { + "corpus": [{"path": str(corpus_dir)}], + "data": [ + { + "question_id": f"s{i}", + "question": f"S{i}", + "corpus_id": "M", + "pos_doc": [{"id": "p"}], + "neg_doc": [{"id": "n1"}], + } + for i in range(5) + ], + } + ) + ) + full_file = tmp_path / "full.json" + full_file.write_text( + json.dumps( + { + "corpus": [{"path": str(corpus_dir)}], + "data": [ + { + "question_id": f"f{i}", + "question": f"F{i}", + "corpus_id": "M", + "pos_doc": [{"id": "p"}], + "neg_doc": [{"id": "n2"}], + } + for i in range(3) + ], + } + ) + ) + + ds = rd.make_retrieval_dataset( + 
data_dir_list=[[2, str(sampled_file)], str(full_file)], + data_type="train", + n_passages=2, + seed=123, + ) + + assert len(ds) == 5 + assert len(ds[0]["doc_text"]) == 2 + + def test_transform_func_positive_else_and_text_empty_branch(): # Covers line 198 (positives not list) and 228 (text empty and no image) corpus = DummyCorpus({"p": {"text": "", "image": "", "nr_ocr": ""}, "n": {"text": "n", "image": "", "nr_ocr": ""}}) diff --git a/tests/unit_tests/recipes/test_base_recipe.py b/tests/unit_tests/recipes/test_base_recipe.py index 3b11ac8a45..619a535f2e 100644 --- a/tests/unit_tests/recipes/test_base_recipe.py +++ b/tests/unit_tests/recipes/test_base_recipe.py @@ -326,6 +326,25 @@ def test_load_checkpoint_auto_detect_restores_latest(tmp_path): assert torch.allclose(recipe_inst.model.weight, weight_after_step) +def test_load_checkpoint_autodetect_failure_has_helpful_message(tmp_path, monkeypatch): + """Auto-resume failures should point users at checkpoint_dir reuse instead of surfacing only low-level errors.""" + (tmp_path / "epoch_0_step_100").mkdir() + recipe_inst = _ToyRecipe(tmp_path) + + def fail_tracked_state(_ckpt_dir): + raise IndexError("tuple index out of range") + + monkeypatch.setattr(recipe_inst, "_load_checkpoint_tracked_state", fail_tracked_state) + + with pytest.raises(RuntimeError) as exc: + recipe_inst.load_checkpoint(restore_from=None) + + msg = str(exc.value) + assert "Failed to load an auto-detected checkpoint" in msg + assert "use a different checkpoint.checkpoint_dir or remove the existing checkpoint" in msg + assert "To resume, make sure the current command matches the saved run." in msg + + def test_load_checkpoint_with_latest_keyword(tmp_path): """ Test that restore_from='LATEST' loads the latest checkpoint. 
From 05c7253c1bd8e89453b5825c7beeab87027936fd Mon Sep 17 00:00:00 2001 From: Ronay Ak Date: Mon, 18 May 2026 15:51:39 +0000 Subject: [PATCH 02/25] fix checkpoint_dir error msg Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- nemo_automodel/recipes/base_recipe.py | 9 +++------ tests/unit_tests/recipes/test_base_recipe.py | 1 + 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/nemo_automodel/recipes/base_recipe.py b/nemo_automodel/recipes/base_recipe.py index bd13768797..a80b96d42a 100644 --- a/nemo_automodel/recipes/base_recipe.py +++ b/nemo_automodel/recipes/base_recipe.py @@ -182,13 +182,11 @@ def _format_missing_checkpoint_dir_error(checkpoint_dir: str, restore_from: str, return "\n".join(error_msg) -def _format_checkpoint_load_error( - checkpoint_dir: str, ckpt_dir: str, restore_from: str | None, original_error: Exception -) -> str: - """Format a helpful message when a checkpoint exists but cannot be loaded.""" +def _format_checkpoint_load_error(checkpoint_dir: str, ckpt_dir: str, original_error: Exception) -> str: + """Format a helpful message when an auto-detected checkpoint exists but cannot be loaded.""" return "\n".join( [ - "Failed to load an auto-detected checkpoint from the current checkpoint.checkpoint_dir.", + f"Failed to load an auto-detected checkpoint from checkpoint_dir={checkpoint_dir!r}.", f"Checkpoint: {ckpt_dir}", "To start a fresh run, use a different checkpoint.checkpoint_dir or remove the existing checkpoint.", "To resume, make sure the current command matches the saved run.", @@ -587,7 +585,6 @@ def load_checkpoint(self, restore_from: str | None = None): _format_checkpoint_load_error( checkpoint_dir=self.checkpointer.config.checkpoint_dir, ckpt_dir=ckpt_dir, - restore_from=restore_from, original_error=e, ) ) from e diff --git a/tests/unit_tests/recipes/test_base_recipe.py b/tests/unit_tests/recipes/test_base_recipe.py index 619a535f2e..68c1b08aa1 100644 --- 
a/tests/unit_tests/recipes/test_base_recipe.py +++ b/tests/unit_tests/recipes/test_base_recipe.py @@ -341,6 +341,7 @@ def fail_tracked_state(_ckpt_dir): msg = str(exc.value) assert "Failed to load an auto-detected checkpoint" in msg + assert f"checkpoint_dir={str(tmp_path)!r}" in msg assert "use a different checkpoint.checkpoint_dir or remove the existing checkpoint" in msg assert "To resume, make sure the current command matches the saved run." in msg From 78030f6e51a32553801138e7041b4f23b9035a86 Mon Sep 17 00:00:00 2001 From: Ronay Ak Date: Mon, 18 May 2026 17:26:20 +0000 Subject: [PATCH 03/25] revert back the checkpoint_dir error msg Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- .../datasets/llm/retrieval_dataset.py | 6 +--- nemo_automodel/recipes/base_recipe.py | 31 +++---------------- tests/unit_tests/recipes/test_base_recipe.py | 20 ------------ 3 files changed, 5 insertions(+), 52 deletions(-) diff --git a/nemo_automodel/components/datasets/llm/retrieval_dataset.py b/nemo_automodel/components/datasets/llm/retrieval_dataset.py index cea5f78b03..cba48538cf 100644 --- a/nemo_automodel/components/datasets/llm/retrieval_dataset.py +++ b/nemo_automodel/components/datasets/llm/retrieval_dataset.py @@ -532,11 +532,7 @@ def _transform_func(examples, num_neg_docs, corpus_dict, use_dataset_instruction f"(n_passages > 1). Provide negatives." 
) cur_corpus_id = corpus_ids[i_example] - if ( - num_neg_docs > 0 - and len(negatives) < num_neg_docs - and cur_corpus_id not in _OVERSAMPLING_WARNED_CORPORA - ): + if num_neg_docs > 0 and len(negatives) < num_neg_docs and cur_corpus_id not in _OVERSAMPLING_WARNED_CORPORA: _OVERSAMPLING_WARNED_CORPORA.add(cur_corpus_id) logging.warning( f"corpus_id={cur_corpus_id}: a sample has only {len(negatives)} negatives " diff --git a/nemo_automodel/recipes/base_recipe.py b/nemo_automodel/recipes/base_recipe.py index a80b96d42a..71e85cb18e 100644 --- a/nemo_automodel/recipes/base_recipe.py +++ b/nemo_automodel/recipes/base_recipe.py @@ -182,19 +182,6 @@ def _format_missing_checkpoint_dir_error(checkpoint_dir: str, restore_from: str, return "\n".join(error_msg) -def _format_checkpoint_load_error(checkpoint_dir: str, ckpt_dir: str, original_error: Exception) -> str: - """Format a helpful message when an auto-detected checkpoint exists but cannot be loaded.""" - return "\n".join( - [ - f"Failed to load an auto-detected checkpoint from checkpoint_dir={checkpoint_dir!r}.", - f"Checkpoint: {ckpt_dir}", - "To start a fresh run, use a different checkpoint.checkpoint_dir or remove the existing checkpoint.", - "To resume, make sure the current command matches the saved run.", - f"Original error: {type(original_error).__name__}: {original_error}", - ] - ) - - def _is_rank_0() -> bool: """True if distributed is not initialized or this process is rank 0. 
TODO(@akoumpa): deprecate in favor of deviemesh api @@ -574,20 +561,10 @@ def load_checkpoint(self, restore_from: str | None = None): if is_rank_0: print(f"Loading checkpoint from {ckpt_dir}", flush=True) - try: - model, optimizer, scheduler = self._load_checkpoint_tracked_state(ckpt_dir) - self.checkpointer.load_model(model, os.path.join(ckpt_dir, "model")) - self.checkpointer.load_optimizer(optimizer, model, ckpt_dir, scheduler) - except Exception as e: - if restore_from: - raise - raise RuntimeError( - _format_checkpoint_load_error( - checkpoint_dir=self.checkpointer.config.checkpoint_dir, - ckpt_dir=ckpt_dir, - original_error=e, - ) - ) from e + model, optimizer, scheduler = self._load_checkpoint_tracked_state(ckpt_dir) + + self.checkpointer.load_model(model, os.path.join(ckpt_dir, "model")) + self.checkpointer.load_optimizer(optimizer, model, ckpt_dir, scheduler) def _log_experiment_details(self): """Log metadata and config on main rank using YAML markers.""" diff --git a/tests/unit_tests/recipes/test_base_recipe.py b/tests/unit_tests/recipes/test_base_recipe.py index 68c1b08aa1..3b11ac8a45 100644 --- a/tests/unit_tests/recipes/test_base_recipe.py +++ b/tests/unit_tests/recipes/test_base_recipe.py @@ -326,26 +326,6 @@ def test_load_checkpoint_auto_detect_restores_latest(tmp_path): assert torch.allclose(recipe_inst.model.weight, weight_after_step) -def test_load_checkpoint_autodetect_failure_has_helpful_message(tmp_path, monkeypatch): - """Auto-resume failures should point users at checkpoint_dir reuse instead of surfacing only low-level errors.""" - (tmp_path / "epoch_0_step_100").mkdir() - recipe_inst = _ToyRecipe(tmp_path) - - def fail_tracked_state(_ckpt_dir): - raise IndexError("tuple index out of range") - - monkeypatch.setattr(recipe_inst, "_load_checkpoint_tracked_state", fail_tracked_state) - - with pytest.raises(RuntimeError) as exc: - recipe_inst.load_checkpoint(restore_from=None) - - msg = str(exc.value) - assert "Failed to load an auto-detected 
checkpoint" in msg - assert f"checkpoint_dir={str(tmp_path)!r}" in msg - assert "use a different checkpoint.checkpoint_dir or remove the existing checkpoint" in msg - assert "To resume, make sure the current command matches the saved run." in msg - - def test_load_checkpoint_with_latest_keyword(tmp_path): """ Test that restore_from='LATEST' loads the latest checkpoint. From 4672f873b64223cb8641d85235186e24a53406c4 Mon Sep 17 00:00:00 2001 From: Ronay Ak Date: Mon, 18 May 2026 20:18:39 +0000 Subject: [PATCH 04/25] use dict sampled-source form Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- .../datasets/llm/retrieval_dataset.py | 67 +++++++++++-------- .../datasets/llm/test_retrieval_dataset.py | 45 ++++++++++--- 2 files changed, 72 insertions(+), 40 deletions(-) diff --git a/nemo_automodel/components/datasets/llm/retrieval_dataset.py b/nemo_automodel/components/datasets/llm/retrieval_dataset.py index cba48538cf..0823a7abdd 100644 --- a/nemo_automodel/components/datasets/llm/retrieval_dataset.py +++ b/nemo_automodel/components/datasets/llm/retrieval_dataset.py @@ -174,19 +174,30 @@ def add_corpus(qa_corpus_paths: Union[dict, list], corpus_dict: dict): corpus_dict[corpus_id] = CorpusInfo(corpus_metadata, corpus) -def _parse_data_entry(entry: Union[str, Tuple[Optional[int], str], List[Any]]) -> Tuple[Optional[int], str]: +DataEntry = Union[str, dict[str, Any]] + + +def _parse_data_entry(entry: DataEntry) -> Tuple[Optional[int], str]: """ Parse a data entry. 
Supported forms: - "path_or_hf_uri": use all samples - - [num_samples, "path_or_hf_uri"]: sample num_samples once from that source + - {"path": "path_or_hf_uri", "num_samples": N}: sample N examples once from that source """ if isinstance(entry, str): return None, entry - if isinstance(entry, (list, tuple)) and len(entry) == 2: - num_samples, path = entry + if isinstance(entry, dict): + allowed_keys = {"path", "num_samples"} + unknown_keys = set(entry) - allowed_keys + if unknown_keys: + raise ValueError(f"Unsupported data entry field(s): {sorted(unknown_keys)}") + if "path" not in entry: + raise ValueError("data entry dictionary must contain a 'path' field") + + path = entry["path"] + num_samples = entry.get("num_samples") if num_samples is not None: if isinstance(num_samples, bool) or not isinstance(num_samples, int): raise ValueError(f"num_samples must be an integer or None, got {type(num_samples)}") @@ -196,29 +207,22 @@ def _parse_data_entry(entry: Union[str, Tuple[Optional[int], str], List[Any]]) - raise ValueError(f"path must be a string, got {type(path)}") return num_samples, path - raise ValueError(f"Invalid data entry format: {entry}. Expected a string path or [num_samples, path]") + raise ValueError(f"Invalid data entry format: {entry}. 
Expected a string path or a dictionary with 'path'") -def _normalize_data_entries( - data_dir_list: Union[List[Union[str, Tuple[Optional[int], str], List[Any]]], Tuple[Optional[int], str], str], -) -> List[Tuple[Optional[int], str]]: +def _normalize_data_entries(data_dir_list: Union[List[DataEntry], DataEntry]) -> List[Tuple[Optional[int], str]]: """Normalize a single source or list of sources into parsed entries.""" - if isinstance(data_dir_list, str): - entries = [data_dir_list] - elif isinstance(data_dir_list, tuple): + if isinstance(data_dir_list, (str, dict)): entries = [data_dir_list] elif isinstance(data_dir_list, list): - if len(data_dir_list) == 2 and (data_dir_list[0] is None or isinstance(data_dir_list[0], int)): - entries = [data_dir_list] - else: - entries = data_dir_list + entries = data_dir_list else: raise ValueError( - f"Invalid data_dir_list format: {data_dir_list}. Expected a string path, [num_samples, path], " - "or a list of those entries" + f"Invalid data_dir_list format: {data_dir_list}. Expected a string path, a dictionary entry, " + "or a list of those entries." ) - return [_parse_data_entry(entry) for entry in entries] + return [entry if isinstance(entry, tuple) else _parse_data_entry(entry) for entry in entries] def _sample_data_items(data_items: List[dict], num_samples: Optional[int], source: str, seed: int) -> List[dict]: @@ -237,15 +241,15 @@ def _sample_data_items(data_items: List[dict], num_samples: Optional[int], sourc def load_datasets( - data_dir_list: Union[List[Union[str, Tuple[Optional[int], str], List[Any]]], Tuple[Optional[int], str], str], + data_dir_list: Union[List[DataEntry], DataEntry], concatenate: bool = True, seed: int = 42, ): """ Load datasets from JSON files. - Entries can be strings (use all samples) or [num_samples, path] pairs - (sample a fixed subset once while loading). 
+ Entries can be strings (use all samples) or dictionaries with path and optional + num_samples fields (sample a fixed subset once while loading). Returns: Tuple of (dataset, corpus_dict) @@ -288,10 +292,12 @@ def load_datasets( "corpus_id": item["corpus_id"], } # Extract pos_doc with only id field + if not item["pos_doc"]: + raise ValueError(f"pos_doc cannot be empty in train_data item: {item}") normalized_item["pos_doc"] = [] for doc in item["pos_doc"]: if isinstance(doc, dict) and "id" in doc: - normalized_item["pos_doc"].append({"id": doc["id"]}) + normalized_item["pos_doc"].append({"id": str(doc["id"])}) else: # Handle case where doc might be just a string ID doc_id = doc if isinstance(doc, str) else str(doc) @@ -300,7 +306,7 @@ def load_datasets( normalized_item["neg_doc"] = [] for doc in item["neg_doc"]: if isinstance(doc, dict) and "id" in doc: - normalized_item["neg_doc"].append({"id": doc["id"]}) + normalized_item["neg_doc"].append({"id": str(doc["id"])}) else: # Handle case where doc might be just a string ID doc_id = doc if isinstance(doc, str) else str(doc) @@ -648,7 +654,7 @@ def transform(examples): def make_retrieval_dataset( - data_dir_list: Union[List[Union[str, Tuple[Optional[int], str], List[Any]]], Tuple[Optional[int], str], str] = None, + data_dir_list: Union[List[DataEntry], DataEntry] = None, model_type: str = "bi_encoder", data_type: str = "train", n_passages: int = 5, @@ -665,18 +671,21 @@ def make_retrieval_dataset( Entries in *data_dir_list* can be local JSON file paths **or** ``hf://`` URIs pointing to a HuggingFace dataset repository (e.g. ``hf://nvidia/embed-nemotron-dataset-v1/SciFact``). A source can also be - provided as ``[num_samples, path_or_uri]`` to sample a fixed subset once - while loading. Uses ``set_transform()`` for lazy evaluation — tokenization - is handled by the collator. + provided as ``{"path": path_or_uri, "num_samples": N}`` to sample a fixed + subset once while loading. 
Uses ``set_transform()`` for lazy evaluation — + tokenization is handled by the collator. Args: - data_dir_list: Path(s) to JSON file(s), ``hf://`` URIs, or [num_samples, source] entries. + data_dir_list: Path(s) to JSON file(s), ``hf://`` URIs, or dictionary entries with path and + num_samples. model_type: "bi_encoder" (default) or "cross_encoder" data_type: Type of data ("train" or "eval") n_passages: Number of passages (1 positive + n-1 negatives) eval_negative_size: Number of negative documents for evaluation seed: Random seed for reproducibility (for shuffling if needed) - do_shuffle: Whether to shuffle the dataset + do_shuffle: Shuffle dataset rows before subset selection. Only applied when + ``max_train_samples`` is set; otherwise iteration order is controlled by + the dataloader's sampler (e.g. ``StatefulDistributedSampler``). max_train_samples: Maximum number of training samples to use train_data_select_offset: Offset for selecting training samples use_dataset_instruction: Whether to use instruction from dataset's metadata diff --git a/tests/unit_tests/datasets/llm/test_retrieval_dataset.py b/tests/unit_tests/datasets/llm/test_retrieval_dataset.py index 49a9e95795..b7f71dc02c 100644 --- a/tests/unit_tests/datasets/llm/test_retrieval_dataset.py +++ b/tests/unit_tests/datasets/llm/test_retrieval_dataset.py @@ -205,6 +205,23 @@ def test_load_datasets_normalizes_and_errors(tmp_path, monkeypatch): with pytest.raises(ValueError): rd.load_datasets(str(f_bad)) + empty_pos = { + "corpus": [{"path": str(corpus_dir)}], + "data": [ + { + "question_id": "q-empty", + "question": "What?", + "corpus_id": "corpusA", + "pos_doc": [], + "neg_doc": [{"id": "n1"}], + } + ], + } + f_empty_pos = tmp_path / "empty_pos.json" + f_empty_pos.write_text(json.dumps(empty_pos)) + with pytest.raises(ValueError, match="pos_doc cannot be empty"): + rd.load_datasets(str(f_empty_pos)) + def test_transform_func_single_batched(): corpus_dict = { @@ -372,8 +389,8 @@ def 
test_load_datasets_type_coercion_and_concatenate_false(tmp_path, monkeypatch "question_id": "q", "question": "Q", "corpus_id": "C", - "pos_doc": [101], # int -> coerced to "101" via lines 140-141 - "neg_doc": [202, "x"], # 202 -> "202" via lines 149-150; "x" unchanged + "pos_doc": [{"id": 101}], + "neg_doc": [{"id": 202}, "x"], } ], } @@ -389,15 +406,21 @@ def test_load_datasets_type_coercion_and_concatenate_false(tmp_path, monkeypatch def test_parse_data_entry(): assert rd._parse_data_entry("/tmp/data.json") == (None, "/tmp/data.json") - assert rd._parse_data_entry([3, "/tmp/data.json"]) == (3, "/tmp/data.json") - assert rd._parse_data_entry((3, "/tmp/data.json")) == (3, "/tmp/data.json") + assert rd._parse_data_entry({"path": "/tmp/data.json", "num_samples": 3}) == (3, "/tmp/data.json") + assert rd._parse_data_entry({"path": "/tmp/data.json"}) == (None, "/tmp/data.json") with pytest.raises(ValueError, match="num_samples must be non-negative"): - rd._parse_data_entry([-1, "/tmp/data.json"]) + rd._parse_data_entry({"path": "/tmp/data.json", "num_samples": -1}) with pytest.raises(ValueError, match="num_samples must be an integer"): - rd._parse_data_entry(["3", "/tmp/data.json"]) + rd._parse_data_entry({"path": "/tmp/data.json", "num_samples": "3"}) with pytest.raises(ValueError, match="path must be a string"): - rd._parse_data_entry([3, 4]) + rd._parse_data_entry({"path": 4, "num_samples": 3}) + with pytest.raises(ValueError, match="must contain a 'path' field"): + rd._parse_data_entry({"num_samples": 3}) + with pytest.raises(ValueError, match="Unsupported data entry field"): + rd._parse_data_entry({"path": "/tmp/data.json", "sample_fraction": 0.5}) + with pytest.raises(ValueError, match="Invalid data entry format"): + rd._parse_data_entry([3, "/tmp/data.json"]) def test_load_datasets_samples_single_top_level_entry_once(tmp_path, monkeypatch): @@ -414,9 +437,9 @@ def test_load_datasets_samples_single_top_level_entry_once(tmp_path, monkeypatch train_file = 
_make_train_file(tmp_path, corpus_dir, data_len=5, corpus_id="S") - dataset_a, _ = rd.load_datasets([2, str(train_file)], seed=7) - dataset_b, _ = rd.load_datasets([2, str(train_file)], seed=7) - dataset_c, _ = rd.load_datasets([2, str(train_file)], seed=8) + dataset_a, _ = rd.load_datasets({"path": str(train_file), "num_samples": 2}, seed=7) + dataset_b, _ = rd.load_datasets({"path": str(train_file), "num_samples": 2}, seed=7) + dataset_c, _ = rd.load_datasets({"path": str(train_file), "num_samples": 2}, seed=8) assert len(dataset_a) == 2 assert dataset_a["question_id"] == dataset_b["question_id"] @@ -473,7 +496,7 @@ def test_make_retrieval_dataset_mixed_sampled_and_full_entries(tmp_path, monkeyp ) ds = rd.make_retrieval_dataset( - data_dir_list=[[2, str(sampled_file)], str(full_file)], + data_dir_list=[{"path": str(sampled_file), "num_samples": 2}, str(full_file)], data_type="train", n_passages=2, seed=123, From 853b404ce25f681f54d95799ec6c149cf62618ae Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 15:13:34 +0100 Subject: [PATCH 05/25] docs(retrieval): add runnable fine-tuning guide Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/llm/retrieval-dataset.md | 33 +- docs/guides/llm/retrieval-finetuning.md | 302 ++++++++++++++++++ docs/guides/overview.md | 12 +- docs/index.md | 3 +- .../retrieval/cross_encoder/llama3_2_1b.yaml | 7 +- 5 files changed, 347 insertions(+), 10 deletions(-) create mode 100644 docs/guides/llm/retrieval-finetuning.md diff --git a/docs/guides/llm/retrieval-dataset.md b/docs/guides/llm/retrieval-dataset.md index cafd6ab950..626a64c789 100644 --- a/docs/guides/llm/retrieval-dataset.md +++ b/docs/guides/llm/retrieval-dataset.md @@ -15,7 +15,11 @@ The dataset factory `nemo_automodel.components.datasets.llm.make_retrieval_datas ## Supported Input Formats -NeMo Automodel supports **two** input schemas: +NeMo Automodel 
supports **two** input schemas. They use different dataset factories: + +- Use `nemo_automodel.components.datasets.llm.make_retrieval_dataset` for corpus ID-based JSON and `hf://` sources. +- Use `nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset` for inline JSONL where + document text is stored directly in each record. ### Corpus ID-Based JSON (Merlin/NeMo-Retriever Style) @@ -78,7 +82,7 @@ This is convenient for custom fine-tuning pipelines where the documents are incl ## YAML Usage (Dataset + Collator) -Use the dataset factory plus the bi-encoder collator: +Use the corpus/HF dataset factory plus the bi-encoder collator for corpus ID-based JSON or `hf://` sources: ```yaml dataloader: @@ -86,7 +90,7 @@ dataloader: dataset: _target_: nemo_automodel.components.datasets.llm.make_retrieval_dataset data_dir_list: - - /abs/path/to/train.jsonl # or train.json (corpus-id format) + - /abs/path/to/train.json # or hf://nvidia/embed-nemotron-dataset-v1/FEVER data_type: train n_passages: 5 # 1 positive + 4 negatives do_shuffle: true @@ -100,7 +104,28 @@ dataloader: pad_to_multiple_of: 8 ``` +Use the inline dataset factory for inline JSONL: + +```yaml +dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + dataset: + _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset + data_dir_list: + - /abs/path/to/train.jsonl + data_type: train + n_passages: 5 # 1 positive + 4 negatives + do_shuffle: true + collate_fn: + _target_: nemo_automodel.components.datasets.llm.BiEncoderCollator + q_max_len: 512 + p_max_len: 512 + query_prefix: "query:" + passage_prefix: "passage:" + pad_to_multiple_of: 8 +``` + ## Requirements - `pos_doc` must be **non-empty**. -- If training requests negatives (e.g., `n_passages > 1`), `neg_doc` must contain **at least one** document. 
\ No newline at end of file +- If training requests negatives (e.g., `n_passages > 1`), `neg_doc` must contain **at least one** document. diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md new file mode 100644 index 0000000000..3992a1eb2b --- /dev/null +++ b/docs/guides/llm/retrieval-finetuning.md @@ -0,0 +1,302 @@ +# Retrieval Fine-Tuning (Bi-Encoder and Cross-Encoder) + +## Introduction + +Retrieval fine-tuning optimizes a model for search, retrieval-augmented generation (RAG), semantic similarity, and reranking. +NeMo AutoModel provides two retrieval fine-tuning recipes: + +- **Bi-encoder fine-tuning** trains one encoder to produce query and passage embeddings. Use it when you need fast + nearest-neighbor search over a document index. +- **Cross-encoder fine-tuning** trains a reranker that scores a query and passage together. Use it after a retriever has + produced a shortlist and you want stronger ranking quality. + +Both recipes use retrieval examples where the first passage is positive and the remaining passages are negatives. A +common workflow is to train a bi-encoder, use it to mine harder negatives, then train either a stronger bi-encoder or a +cross-encoder reranker. + +## Quickstart + +Run the Llama 3.2 1B bi-encoder example: + +```bash +automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 8 +``` + +Run the matching cross-encoder example: + +```bash +automodel examples/retrieval/cross_encoder/llama3_2_1b.yaml --nproc-per-node 8 +``` + +Adjust `--nproc-per-node` to the number of GPUs on your machine. The examples use FSDP2 and bfloat16 by default.
+ +:::{tip} +For a small smoke test, override the schedule and sample count from the command line: + +```bash +automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 1 \ + --step_scheduler.max_steps 10 \ + --dataloader.dataset.max_train_samples 128 +``` +::: + +## Choose a Recipe + +| Use case | Recipe | Model target | Collator | Loss | +|----------|--------|--------------|----------|------| +| Dense retrieval, embedding search, RAG candidate generation | `TrainBiEncoderRecipe` | `nemo_automodel.NeMoAutoModelBiEncoder.from_pretrained` | `BiEncoderCollator` | Cross entropy over one positive plus negatives | +| Reranking a retrieved candidate set | `TrainCrossEncoderRecipe` | `nemo_automodel.NeMoAutoModelCrossEncoder.from_pretrained` | `CrossEncoderCollator` | Cross entropy over one positive plus negatives | + +The bi-encoder computes a query embedding and passage embeddings independently. The cross-encoder formats each +query-passage pair into one sequence and predicts a score for each candidate passage. + +## Prepare Data + +Use the retrieval dataset format described in [Retrieval Dataset](retrieval-dataset.md). Each record needs: + +- a query, stored as `question` or `query` +- at least one positive passage in `pos_doc` +- negatives in `neg_doc` when `n_passages > 1` + +For quick custom experiments, inline JSONL is the simplest format. Use the inline dataset factory for these files: + +```json +{"query":"What does NVLink do?","pos_doc":"NVLink is a high-bandwidth GPU interconnect.","neg_doc":["CUDA is a programming model.","Tensor Cores accelerate matrix math."]} +{"query":"What is retrieval augmented generation?","pos_doc":"RAG grounds generation by retrieving relevant context.","neg_doc":["Beam search expands candidate tokens.","Dropout regularizes training."]} +``` + +For larger corpora, use the corpus ID-based JSON format from the dataset guide. 
Use +`nemo_automodel.components.datasets.llm.make_retrieval_dataset` for corpus ID-based JSON and for `hf://` sources that +already follow the AutoModel retrieval schema, such as: + +```yaml +data_dir_list: + - hf://nvidia/embed-nemotron-dataset-v1/FEVER + - hf://nvidia/embed-nemotron-dataset-v1/SyntheticClassificationData +``` + +`n_passages` controls how many passages are sampled for each query. For example, `n_passages: 5` means one positive +and four negatives. If a record has fewer negatives than requested, negatives are repeated to fill the group. + +## Configure a Bi-Encoder + +A bi-encoder config has four important parts: the model, tokenizer, retrieval dataset, and `BiEncoderCollator`. + +```yaml +recipe: TrainBiEncoderRecipe + +temperature: 0.02 + +model: + _target_: nemo_automodel.NeMoAutoModelBiEncoder.from_pretrained + pretrained_model_name_or_path: meta-llama/Llama-3.2-1B + pooling: avg + l2_normalize: true + torch_dtype: bfloat16 + +tokenizer: + _target_: nemo_automodel.NeMoAutoTokenizer.from_pretrained + pretrained_model_name_or_path: meta-llama/Llama-3.2-1B + add_eos_token: false + +dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + dataset: + _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset + model_type: bi_encoder + data_dir_list: + - /path/to/train.jsonl + data_type: train + n_passages: 5 + seed: 42 + do_shuffle: true + collate_fn: + _target_: nemo_automodel.components.datasets.llm.BiEncoderCollator + q_max_len: 512 + p_max_len: 512 + query_prefix: "query:" + passage_prefix: "passage:" + pad_to_multiple_of: 8 + shuffle: true + num_workers: 0 +``` + +Important knobs: + +- `pooling`: controls how token hidden states become one embedding. Common choices are `avg`, `cls`, and `last`. +- `l2_normalize`: normalizes embeddings before scoring. When enabled, the recipe divides scores by `temperature`. 
+- `q_max_len` and `p_max_len`: set separate truncation lengths for queries and passages. +- `query_prefix` and `passage_prefix`: add task-specific text before tokenization. Keep these aligned between training, + hard-negative mining, and inference. +- `do_distributed_inbatch_negative`: optional model setting that treats passages from other data-parallel ranks as + additional negatives. It is useful for larger distributed runs and uses document IDs from corpus-backed datasets to + avoid treating the same document as a negative for itself. + +The complete example is +[`examples/retrieval/bi_encoder/llama3_2_1b.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/bi_encoder/llama3_2_1b.yaml). + +## Configure a Cross-Encoder + +A cross-encoder config uses the same retrieval dataset factory, but sets `model_type: cross_encoder` and uses +`CrossEncoderCollator`. The dataset transform flattens each query with its positive and negative passages so the model +scores each query-passage pair. 
+ +```yaml +recipe: TrainCrossEncoderRecipe + +model: + _target_: nemo_automodel.NeMoAutoModelCrossEncoder.from_pretrained + pretrained_model_name_or_path: meta-llama/Llama-3.2-1B + num_labels: 1 + pooling: avg + temperature: 1.0 + torch_dtype: bfloat16 + +tokenizer: + _target_: nemo_automodel.NeMoAutoTokenizer.from_pretrained + pretrained_model_name_or_path: meta-llama/Llama-3.2-1B + add_eos_token: false + +dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + dataset: + _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset + model_type: cross_encoder + data_dir_list: + - /path/to/train.jsonl + data_type: train + n_passages: 5 + seed: 42 + do_shuffle: true + collate_fn: + _target_: nemo_automodel.components.datasets.llm.CrossEncoderCollator + rerank_max_length: 512 + prompt_template: "question:{query} \n \n passage:{passage}" + pad_to_multiple_of: 8 + shuffle: true + num_workers: 0 +``` + +Important knobs: + +- `rerank_max_length`: maximum combined query-passage sequence length. +- `prompt_template`: controls how the pair is serialized before tokenization. It must include `{query}` and `{passage}`. +- `n_passages`: number of candidates per query. The positive passage must remain first in each group because labels point + to index `0`. + +The complete example is +[`examples/retrieval/cross_encoder/llama3_2_1b.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/cross_encoder/llama3_2_1b.yaml). + +## Add Validation + +Both examples include a commented `validation_dataloader` block. 
Enable it when you have a held-out retrieval file: + +```yaml +validation_dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + dataset: + _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset + model_type: bi_encoder + data_dir_list: + - /path/to/validation.jsonl + data_type: eval + n_passages: 5 + seed: 42 + do_shuffle: false + collate_fn: + _target_: nemo_automodel.components.datasets.llm.BiEncoderCollator + q_max_len: 512 + p_max_len: 512 + query_prefix: "query:" + passage_prefix: "passage:" + pad_to_multiple_of: 8 + batch_size: 2 + shuffle: false + num_workers: 0 +``` + +Validation logs `val_loss`, `val_acc1`, and `val_mrr`. For cross-encoder validation, use `model_type: cross_encoder` and +`CrossEncoderCollator` instead. + +## Enable LoRA + +Retrieval recipes support the same PEFT block used by other AutoModel fine-tuning recipes. Uncomment or add `peft` to +train LoRA adapters instead of updating every weight: + +```yaml +peft: + _target_: nemo_automodel.components._peft.lora.PeftConfig + target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + - gate_proj + - up_proj + - down_proj + exclude_modules: [] + match_all_linear: true + dim: 16 + alpha: 32 + use_triton: true +``` + +Use LoRA when you need lower memory use or want to ship a small adapter. Use full fine-tuning when you can afford the +memory and want maximum adaptation. 
+ +## Mine Hard Negatives + +After an initial bi-encoder run, mine harder negatives with the saved encoder checkpoint: + +```bash +torchrun --nproc_per_node=8 examples/retrieval/data_utils/mine_hard_negatives.py \ + --config examples/retrieval/data_utils/mining_config.yaml \ + --mining.model_name_or_path /path/to/encoder/checkpoint \ + --mining.train_qa_file_path /path/to/input.json \ + --mining.train_file_output_path /path/to/output.json \ + --mining.cache_embeddings_dir /path/to/cache +``` + +Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: + +- `hard_negatives_to_mine`: number of negatives to add per query. +- `hard_neg_margin` and `hard_neg_margin_type`: filter near-positive candidates. +- `query_prefix` and `passage_prefix`: keep these consistent with the bi-encoder training config. +- `query_max_length` and `passage_max_length`: keep these consistent with training unless you intentionally change + truncation. +- `use_negatives_from_file`: include existing negatives from the input file when mining. + +Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. + +## Save and Use the Checkpoint + +Set checkpointing in the config: + +```yaml +checkpoint: + enabled: true + checkpoint_dir: ./output/llama3_2_1b_encoder/checkpoints + model_save_format: safetensors + save_consolidated: true +``` + +With `save_consolidated: true`, AutoModel writes a Hugging Face-compatible consolidated checkpoint under the checkpoint +directory. Use that directory for downstream embedding generation, hard-negative mining, or serving. + +## Troubleshooting + +| Symptom | Check | +|---------|-------| +| Training fails with empty negatives | Ensure every record has `neg_doc` when `n_passages > 1`. | +| Loss does not move | Verify the positive passage is first and negatives are not duplicates of the positive. | +| Poor retrieval quality | Mine harder negatives and align train/inference prefixes. 
| +| OOM at startup or first batch | Lower `local_batch_size`, `q_max_len`, `p_max_len`, or `rerank_max_length`; use LoRA for larger backbones. | +| Different mining and training behavior | Match tokenizer settings, prefixes, and max lengths across training and mining. | + +## Related Files + +- Bi-encoder recipe: [`nemo_automodel/recipes/retrieval/train_bi_encoder.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/retrieval/train_bi_encoder.py) +- Cross-encoder recipe: [`nemo_automodel/recipes/retrieval/train_cross_encoder.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/retrieval/train_cross_encoder.py) +- Retrieval dataset guide: [Retrieval Dataset](retrieval-dataset.md) +- Llama-Embed-Nemotron-8B example: + [`examples/retrieval/bi_encoder/llama_embed_nemotron_8b/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/retrieval/bi_encoder/llama_embed_nemotron_8b) diff --git a/docs/guides/overview.md b/docs/guides/overview.md index 15c7cbaf98..4694f36ff6 100644 --- a/docs/guides/overview.md +++ b/docs/guides/overview.md @@ -40,6 +40,16 @@ Recipes for distilling knowledge from a large teacher model into a smaller, more - Example model: Llama 3.2 1B - How-to guide: [Knowledge distillation](llm/knowledge-distillation.md) +### Retrieval Fine-Tuning + +Bi-encoder and cross-encoder recipes for embedding search, RAG candidate generation, and reranking. + +- Folder: [examples/retrieval](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/retrieval) +- Example models: Llama 3.2 1B, Llama-Embed-Nemotron-8B +- How-to guides: + - [Retrieval fine-tuning](llm/retrieval-finetuning.md) + - [Retrieval dataset](llm/retrieval-dataset.md) + ### Benchmark Configurations Curated configurations for benchmarking different training stacks and settings (e.g., Torch vs. TransformerEngine + DeepEP, various model sizes, MoE variants). @@ -68,7 +78,7 @@ Simple generation script and configurations for VLMs. 
## Diffusion Models (Text-to-Image & Text-to-Video) Text-to-image and text-to-video diffusion models can generate visual content from natural language descriptions. Fine-tuning lets you adapt these models to a specific style, domain, or dataset — for example, generating product videos in your brand's aesthetic. Pretraining gives you full control when no existing model fits your needs. - + This section walks through the full workflow in NeMo AutoModel: preparing your dataset, training the model, and generating outputs. ### Fine-Tuning diff --git a/docs/index.md b/docs/index.md index c08ac6f5d0..f032f6cdab 100644 --- a/docs/index.md +++ b/docs/index.md @@ -96,7 +96,7 @@ Find the right guide for your task -- fine-tuning, pretraining, distillation, di | **Fine-tune dLLM** | You want to fine-tune a diffusion language model (e.g., LLaDA) using masked denoising | Instruction / chat dataset | dLLM | [Fine-tune dLLM](guides/dllm/finetune.md) | | **Fine-tune Diffusion** | You want to fine-tune a diffusion model for image or video generation | Video / Image dataset | Diffusion | [Fine-tune Diffusion](guides/diffusion/finetune.md) | | **Fine-tune VLM-MoE** | You need large-scale vision-language training with sparse MoE efficiency | Image + text dataset | VLM (MoE) | [Fine-tune VLM-MoE](guides/vlm/qwen3-5.md) | -| **Embedding fine-tune** | You want to improve text similarity for search, retrieval, or RAG | Text pairs / retrieval corpus | LLM | {bdg-info}`Coming Soon` | +| **Embedding fine-tune** | You want to improve text similarity for search, retrieval, or RAG | Text pairs / retrieval corpus | LLM | [Fine-tune retrieval](guides/llm/retrieval-finetuning.md) | | **Fine-tune a large MoE** | You are adapting a large sparse MoE model (DeepSeek-V3, GLM-5, etc.) 
to your domain | Text dataset (e.g., HellaSwag) | LLM (MoE) | [Fine-tune MoE](guides/llm/large-moe-finetune.md) | | **Fine-tune DeepSeek V4 Flash** | You want to fine-tune the DeepSeek V4 Flash hybrid-attention MoE (SWA / CSA / HCA + hash-routing) | Text dataset (e.g., HellaSwag) | LLM (MoE) | [Fine-tune DeepSeek V4 Flash](guides/llm/dsv4-flash.md) | | **Fine-tune Hy3-preview** | You want to fine-tune Tencent's 295B MoE with sigmoid routing and per-head QK RMSNorm | Text dataset (e.g., HellaSwag) | LLM (MoE) | [Fine-tune Hy3-preview](guides/llm/hy3.md) | @@ -252,6 +252,7 @@ Hy3-preview Pretraining NanoGPT Pretraining Sequence Classification +Retrieval Fine-Tuning Gemma 3 / 3n Gemma 4 Qwen3.5-VL diff --git a/examples/retrieval/cross_encoder/llama3_2_1b.yaml b/examples/retrieval/cross_encoder/llama3_2_1b.yaml index 8b8b3bf773..58a87d98f2 100644 --- a/examples/retrieval/cross_encoder/llama3_2_1b.yaml +++ b/examples/retrieval/cross_encoder/llama3_2_1b.yaml @@ -52,9 +52,8 @@ dataloader: _target_: nemo_automodel.components.datasets.llm.make_retrieval_dataset model_type: cross_encoder data_dir_list: - - retriever_models_research/training_datasets/nqsh_shuffled_50k.json - - retriever_models_research/training_datasets/mldr_en_perc95_small.json - - retriever_models_research/training_datasets/miracl_train_es_llama3_1b_4m_512len.json + - hf://nvidia/embed-nemotron-dataset-v1/FEVER + - hf://nvidia/embed-nemotron-dataset-v1/SyntheticClassificationData data_type: train n_passages: 5 seed: 42 @@ -136,4 +135,4 @@ distributed: # project: # entity: # name: -# save_dir: \ No newline at end of file +# save_dir: From b9c5245eb9c5fb261dabbbbff3d6048e9f084fc4 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 15:18:39 +0100 Subject: [PATCH 06/25] docs(retrieval): expand fine-tuning workflow Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/llm/retrieval-finetuning.md | 
212 ++++++++++++++++++++++-- 1 file changed, 197 insertions(+), 15 deletions(-) diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 3992a1eb2b..6ac1d973b4 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -14,8 +14,34 @@ Both recipes use retrieval examples where the first passage is positive and the common workflow is to train a bi-encoder, use it to mine harder negatives, then train either a stronger bi-encoder or a cross-encoder reranker. +## Workflow Overview + +Most retrieval projects move through the same loop: + +```text +Prepare retrieval data + -> Train a bi-encoder + -> Validate candidate-group ranking quality + -> Mine hard negatives + -> Retrain the bi-encoder or train a cross-encoder reranker + -> Use the consolidated checkpoint for indexing, mining, reranking, or serving +``` + +Start with a bi-encoder when you need embeddings for approximate nearest-neighbor search. Add hard-negative mining after +the first pass if the model mostly sees easy negatives. Train a cross-encoder when a separate retriever already produces +a small candidate set and you want a stronger reranking stage. + ## Quickstart +Before running the examples: + +- Use an AutoModel environment with the full GPU training dependencies installed. The NGC container is the safest path + for multi-GPU runs. +- From a source checkout, use `uv run automodel ...`; from an installed environment, use `automodel ...`. +- Accept access terms for the configured Hugging Face model and set `HF_TOKEN`, or replace the model path with a model + your environment can download. +- Make sure every rank can read the dataset paths or `hf://` sources. 
+ Run the Llama 3.2 1B bi-encoder example: ```bash @@ -35,17 +61,29 @@ For a small smoke test, override the schedule and sample count from the command ```bash automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 1 \ + --step_scheduler.global_batch_size 4 \ + --step_scheduler.local_batch_size 1 \ --step_scheduler.max_steps 10 \ - --dataloader.dataset.max_train_samples 128 + --dataloader.dataset.max_train_samples 40 ``` ::: ## Choose a Recipe -| Use case | Recipe | Model target | Collator | Loss | -|----------|--------|--------------|----------|------| -| Dense retrieval, embedding search, RAG candidate generation | `TrainBiEncoderRecipe` | `nemo_automodel.NeMoAutoModelBiEncoder.from_pretrained` | `BiEncoderCollator` | Cross entropy over one positive plus negatives | -| Reranking a retrieved candidate set | `TrainCrossEncoderRecipe` | `nemo_automodel.NeMoAutoModelCrossEncoder.from_pretrained` | `CrossEncoderCollator` | Cross entropy over one positive plus negatives | +| Need | Use | Why | +|------|-----|-----| +| Search across a large corpus | Bi-encoder | Encodes queries and passages independently, so passage embeddings can be indexed once. | +| RAG candidate generation | Bi-encoder | Produces dense vectors for approximate nearest-neighbor retrieval. | +| Better ranking for a small shortlist | Cross-encoder | Scores each query-passage pair jointly, which is slower but usually more accurate. | +| Better negatives for the next training run | Hard-negative mining | Uses a trained bi-encoder to find confusing passages for each query. 
| + +| Component | Bi-encoder | Cross-encoder | +|-----------|------------|---------------| +| Recipe | `TrainBiEncoderRecipe` | `TrainCrossEncoderRecipe` | +| Model target | `nemo_automodel.NeMoAutoModelBiEncoder.from_pretrained` | `nemo_automodel.NeMoAutoModelCrossEncoder.from_pretrained` | +| Dataset mode | `model_type: bi_encoder` | `model_type: cross_encoder` | +| Collator | `BiEncoderCollator` | `CrossEncoderCollator` | +| Training objective | Cross entropy over one positive plus negatives | Cross entropy over one positive plus negatives | The bi-encoder computes a query embedding and passage embeddings independently. The cross-encoder formats each query-passage pair into one sequence and predicts a score for each candidate passage. @@ -78,9 +116,92 @@ data_dir_list: `n_passages` controls how many passages are sampled for each query. For example, `n_passages: 5` means one positive and four negatives. If a record has fewer negatives than requested, negatives are repeated to fill the group. +## Minimal Config Anatomy + +This minimal bi-encoder config shows the pieces that must be present in a runnable retrieval fine-tuning job. The +sections below explain the model-specific parts in more detail. 
+ +```yaml +recipe: TrainBiEncoderRecipe +seed: 42 +temperature: 0.02 + +step_scheduler: + global_batch_size: 4 + local_batch_size: 1 + max_steps: 10 + ckpt_every_steps: 10 + val_every_steps: 10 + num_epochs: 1 + +dist_env: + backend: nccl + timeout_minutes: 30 + +model: + _target_: nemo_automodel.NeMoAutoModelBiEncoder.from_pretrained + pretrained_model_name_or_path: meta-llama/Llama-3.2-1B + pooling: avg + l2_normalize: true + torch_dtype: bfloat16 + +tokenizer: + _target_: nemo_automodel.NeMoAutoTokenizer.from_pretrained + pretrained_model_name_or_path: meta-llama/Llama-3.2-1B + add_eos_token: false + +dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + dataset: + _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset + model_type: bi_encoder + data_dir_list: + - /path/to/train.jsonl + data_type: train + n_passages: 5 + seed: 42 + do_shuffle: true + max_train_samples: 40 + collate_fn: + _target_: nemo_automodel.components.datasets.llm.BiEncoderCollator + q_max_len: 512 + p_max_len: 512 + query_prefix: "query:" + passage_prefix: "passage:" + pad_to_multiple_of: 8 + shuffle: true + num_workers: 0 + +optimizer: + _target_: torch.optim.AdamW + lr: 5.0e-6 + weight_decay: 0.01 + +lr_scheduler: + lr_warmup_steps: 2 + lr_decay_style: linear + +checkpoint: + enabled: true + checkpoint_dir: ./output/llama3_2_1b_encoder/checkpoints + model_save_format: safetensors + save_consolidated: true + +distributed: + strategy: fsdp2 + dp_size: none + tp_size: 1 + cp_size: 1 +``` + +For a cross-encoder, change `recipe`, `model._target_`, `dataloader.dataset.model_type`, and `dataloader.collate_fn` +to the cross-encoder values shown below. + ## Configure a Bi-Encoder -A bi-encoder config has four important parts: the model, tokenizer, retrieval dataset, and `BiEncoderCollator`. +A bi-encoder config has four important parts: the model, tokenizer, retrieval dataset, and `BiEncoderCollator`. 
This +snippet is an excerpt; keep the scheduler, optimizer, checkpoint, and distributed sections from the full config or one +of the examples. ```yaml recipe: TrainBiEncoderRecipe @@ -139,7 +260,8 @@ The complete example is A cross-encoder config uses the same retrieval dataset factory, but sets `model_type: cross_encoder` and uses `CrossEncoderCollator`. The dataset transform flattens each query with its positive and negative passages so the model -scores each query-passage pair. +scores each query-passage pair. This snippet is an excerpt; keep the same scheduler, optimizer, checkpoint, and +distributed structure as the bi-encoder config. ```yaml recipe: TrainCrossEncoderRecipe @@ -187,6 +309,22 @@ Important knobs: The complete example is [`examples/retrieval/cross_encoder/llama3_2_1b.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/cross_encoder/llama3_2_1b.yaml). +## Distributed Launch and Batch Size + +Launch examples with `automodel <config.yaml> --nproc-per-node <num_gpus>`. The retrieval recipes support data-parallel +training through the configured distributed strategy; pipeline parallelism is not supported for encoder recipes today. + +The step scheduler computes gradient accumulation from: + +```text +gradient_accumulation_steps = global_batch_size / (local_batch_size * data_parallel_size) +``` + +`global_batch_size` must be divisible by `local_batch_size * data_parallel_size`, and the result must be at least `1`. +For memory pressure, reduce `step_scheduler.local_batch_size` first, then sequence lengths (`q_max_len`, `p_max_len`, or +`rerank_max_length`), then `n_passages`. Bi-encoders scale memory with query length plus `n_passages` passage sequences; +cross-encoders scale with `n_passages` combined query-passage sequences. + ## Add Validation Both examples include a commented `validation_dataloader` block.
Enable it when you have a held-out retrieval file: @@ -215,8 +353,13 @@ validation_dataloader: num_workers: 0 ``` -Validation logs `val_loss`, `val_acc1`, and `val_mrr`. For cross-encoder validation, use `model_type: cross_encoder` and -`CrossEncoderCollator` instead. +Validation logs `val_loss`, `val_acc1`, and `val_mrr` to `validation.jsonl` under `checkpoint.checkpoint_dir`. These +metrics measure ranking within each candidate group in the validation file; they are not full-corpus Recall@K or nDCG +metrics. For cross-encoder validation, use `model_type: cross_encoder` and `CrossEncoderCollator` instead. + +```bash +tail -n 5 ./output/llama3_2_1b_encoder/checkpoints/validation.jsonl +``` ## Enable LoRA @@ -246,17 +389,25 @@ memory and want maximum adaptation. ## Mine Hard Negatives -After an initial bi-encoder run, mine harder negatives with the saved encoder checkpoint: +After an initial bi-encoder run, mine harder negatives with the consolidated encoder checkpoint: ```bash torchrun --nproc_per_node=8 examples/retrieval/data_utils/mine_hard_negatives.py \ --config examples/retrieval/data_utils/mining_config.yaml \ - --mining.model_name_or_path /path/to/encoder/checkpoint \ + --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated \ --mining.train_qa_file_path /path/to/input.json \ --mining.train_file_output_path /path/to/output.json \ - --mining.cache_embeddings_dir /path/to/cache + --mining.cache_embeddings_dir /path/to/cache \ + --mining.query_prefix "query:" \ + --mining.passage_prefix "passage:" \ + --mining.query_max_length 512 \ + --mining.passage_max_length 512 ``` +Hard-negative mining expects the corpus ID-based retrieval JSON format described in the dataset guide, not the inline +JSONL shortcut. The input must reference one corpus so the miner can build a passage embedding cache, retrieve +candidates, and write mined negatives back to each query. 
+ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: - `hard_negatives_to_mine`: number of negatives to add per query. @@ -265,10 +416,12 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: - `query_max_length` and `passage_max_length`: keep these consistent with training unless you intentionally change truncation. - `use_negatives_from_file`: include existing negatives from the input file when mining. +- `cache_embeddings_dir`: required for distributed mining so ranks can share cached passage embeddings. Rank `0` + assembles the final embedding cache and score outputs, so plan memory and local disk accordingly. Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. -## Save and Use the Checkpoint +## Save, Resume, and Use the Checkpoint Set checkpointing in the config: @@ -280,8 +433,36 @@ checkpoint: save_consolidated: true ``` -With `save_consolidated: true`, AutoModel writes a Hugging Face-compatible consolidated checkpoint under the checkpoint -directory. Use that directory for downstream embedding generation, hard-negative mining, or serving. +Each save creates a versioned directory such as: + +```text +./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_500/ +``` + +With `save_consolidated: true`, AutoModel also writes a Hugging Face-compatible model under: + +```text +./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_500/model/consolidated/ +``` + +The `LATEST` symlink points to the most recent checkpoint. To resume from the latest compatible checkpoint, set: + +```yaml +checkpoint: + enabled: true + checkpoint_dir: ./output/llama3_2_1b_encoder/checkpoints + restore_from: LATEST +``` + +Use a unique `checkpoint_dir` per experiment unless you intentionally want to resume an existing run. + +## Use the Model + +Use a bi-encoder checkpoint to encode passages, build an approximate nearest-neighbor index, encode queries, and search +the index. 
Keep the same tokenizer, pooling, normalization, prefixes, and max lengths that you used for training. + +Use a cross-encoder checkpoint to rerank a shortlist from a retriever. Cross-encoders score each query-passage pair +jointly, so they are usually too expensive for first-stage full-corpus search. ## Troubleshooting @@ -291,6 +472,7 @@ directory. Use that directory for downstream embedding generation, hard-negative | Loss does not move | Verify the positive passage is first and negatives are not duplicates of the positive. | | Poor retrieval quality | Mine harder negatives and align train/inference prefixes. | | OOM at startup or first batch | Lower `local_batch_size`, `q_max_len`, `p_max_len`, or `rerank_max_length`; use LoRA for larger backbones. | +| Batch-size assertion fails | Set `global_batch_size` to a multiple of `local_batch_size * data_parallel_size`. | | Different mining and training behavior | Match tokenizer settings, prefixes, and max lengths across training and mining. 
| ## Related Files From d8da5302a96310bf0bf74dd78e29a7ddd767ddc2 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 15:21:09 +0100 Subject: [PATCH 07/25] docs(retrieval): polish retrieval guide references Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/llm/retrieval-dataset.md | 20 ++++++++++++++++---- docs/guides/llm/retrieval-finetuning.md | 22 ++++++++++++++++++++-- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/docs/guides/llm/retrieval-dataset.md b/docs/guides/llm/retrieval-dataset.md index 626a64c789..53b25d0ccb 100644 --- a/docs/guides/llm/retrieval-dataset.md +++ b/docs/guides/llm/retrieval-dataset.md @@ -1,17 +1,26 @@ # Retrieval Dataset (Embedding Fine-tuning) -NeMo Automodel supports **retrieval model fine-tuning** using a retrieval-style dataset: each training example is a **query** paired with **one positive** document and **one or more negative** documents. +NeMo Automodel supports **retrieval model fine-tuning** using a retrieval-style dataset: each training example is a +**query** paired with **one positive** document and **one or more negative** documents. -This dataset is used by the retrieval recipes (see `examples/retrieval/bi_encoder/` and `examples/retrieval/cross_encoder/`) together with the `BiEncoderCollator`. +This dataset is used by the retrieval recipes (see `examples/retrieval/bi_encoder/` and +`examples/retrieval/cross_encoder/`) together with the retrieval collators. For an end-to-end training workflow, see +[Retrieval Fine-Tuning](retrieval-finetuning.md). ## What the Bi-Encoder Consumes -The dataset factory `nemo_automodel.components.datasets.llm.make_retrieval_dataset` returns a Hugging Face `datasets.Dataset`. 
At runtime it transforms each raw record into the training-time schema: +The dataset factory `nemo_automodel.components.datasets.llm.make_retrieval_dataset` returns a Hugging Face +`datasets.Dataset`. At runtime it transforms each raw record into the training-time schema: - `question`: query string - `doc_text`: list of document texts in the order `[positive, negative_1, negative_2, ...]` - `doc_image`: list of images (or empty strings), aligned with `doc_text` -- `query_instruction` / `passage_instruction`: optional, used when `use_dataset_instruction: true` and the corpus provides instructions via metadata +- `query_instruction` / `passage_instruction`: optional, used when `use_dataset_instruction: true` and the corpus + provides instructions via metadata + +The cross-encoder recipe consumes the same raw retrieval records, but sets `model_type: cross_encoder`. Its dataset +transform flattens each query with its positive and negative passages, and `CrossEncoderCollator` serializes each +query-passage pair for reranking. ## Supported Input Formats @@ -125,6 +134,9 @@ dataloader: pad_to_multiple_of: 8 ``` +For cross-encoder training, keep the same dataset factory and set `model_type: cross_encoder`, then replace the collator +with `nemo_automodel.components.datasets.llm.CrossEncoderCollator`. + ## Requirements - `pos_doc` must be **non-empty**. diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 6ac1d973b4..4f8c075905 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -54,7 +54,9 @@ Run the matching cross-encoder example: automodel examples/retrieval/cross_encoder/llama3_2_1b.yaml --nproc-per-node 8 ``` -Adjust `--nproc-per-node` to the number of GPUs on your machine. The examples use FSDP2 and bfloat16 by default. +Adjust `--nproc-per-node` to the number of GPUs on your machine. The examples use FSDP2 and bfloat16 by default. 
If the +first launch downloads a model, reads from a slow filesystem, or runs across multiple nodes, increase +`dist_env.timeout_minutes` beyond the short example default. :::{tip} For a small smoke test, override the schedule and sample count from the command line: @@ -251,7 +253,9 @@ Important knobs: hard-negative mining, and inference. - `do_distributed_inbatch_negative`: optional model setting that treats passages from other data-parallel ranks as additional negatives. It is useful for larger distributed runs and uses document IDs from corpus-backed datasets to - avoid treating the same document as a negative for itself. + avoid treating the same document as a negative for itself. It all-gathers passage embeddings across data-parallel + ranks, so expect extra communication and score-matrix memory. Use corpus-backed data when you need same-document + masking, and keep it disabled for ColBERT-style pooling. The complete example is [`examples/retrieval/bi_encoder/llama3_2_1b.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/bi_encoder/llama3_2_1b.yaml). @@ -361,6 +365,18 @@ metrics. For cross-encoder validation, use `model_type: cross_encoder` and `Cros tail -n 5 ./output/llama3_2_1b_encoder/checkpoints/validation.jsonl ``` +## Monitor Training + +Training metrics are written to `training.jsonl` under `checkpoint.checkpoint_dir`. Watch `loss`, `grad_norm`, learning +rate, GPU memory, and step time during smoke tests before scaling to a longer run: + +```bash +tail -f ./output/llama3_2_1b_encoder/checkpoints/training.jsonl +``` + +The examples include a commented `wandb` block. Enable it when you want remote tracking, and tune +`step_scheduler.log_remote_every_steps` to control remote logging cadence. + ## Enable LoRA Retrieval recipes support the same PEFT block used by other AutoModel fine-tuning recipes. Uncomment or add `peft` to @@ -469,9 +485,11 @@ jointly, so they are usually too expensive for first-stage full-corpus search. 
| Symptom | Check | |---------|-------| | Training fails with empty negatives | Ensure every record has `neg_doc` when `n_passages > 1`. | +| Dataset records fail to load | Check the supported schemas in [Retrieval Dataset](retrieval-dataset.md). | | Loss does not move | Verify the positive passage is first and negatives are not duplicates of the positive. | | Poor retrieval quality | Mine harder negatives and align train/inference prefixes. | | OOM at startup or first batch | Lower `local_batch_size`, `q_max_len`, `p_max_len`, or `rerank_max_length`; use LoRA for larger backbones. | +| Distributed launch times out | Increase `dist_env.timeout_minutes`, especially for first model downloads, slow filesystems, or multi-node runs. | | Batch-size assertion fails | Set `global_batch_size` to a multiple of `local_batch_size * data_parallel_size`. | | Different mining and training behavior | Match tokenizer settings, prefixes, and max lengths across training and mining. | From 99cbce93adf7bff207b689654ee62f24d828af1b Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 16:36:50 +0100 Subject: [PATCH 08/25] docs(retrieval): clarify dataset schemas Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/llm/retrieval-dataset.md | 80 +++++++++++++++++++++++-- docs/guides/llm/retrieval-finetuning.md | 56 ++++++++++++----- 2 files changed, 114 insertions(+), 22 deletions(-) diff --git a/docs/guides/llm/retrieval-dataset.md b/docs/guides/llm/retrieval-dataset.md index 53b25d0ccb..e1a903bb88 100644 --- a/docs/guides/llm/retrieval-dataset.md +++ b/docs/guides/llm/retrieval-dataset.md @@ -1,4 +1,4 @@ -# Retrieval Dataset (Embedding Fine-tuning) +# Retrieval Dataset for Bi-Encoders and Cross-Encoders NeMo Automodel supports **retrieval model fine-tuning** using a retrieval-style dataset: each training example is a **query** paired with **one positive** document and **one 
or more negative** documents. @@ -7,7 +7,7 @@ This dataset is used by the retrieval recipes (see `examples/retrieval/bi_encode `examples/retrieval/cross_encoder/`) together with the retrieval collators. For an end-to-end training workflow, see [Retrieval Fine-Tuning](retrieval-finetuning.md). -## What the Bi-Encoder Consumes +## Raw Records and Runtime Schemas The dataset factory `nemo_automodel.components.datasets.llm.make_retrieval_dataset` returns a Hugging Face `datasets.Dataset`. At runtime it transforms each raw record into the training-time schema: @@ -22,6 +22,10 @@ The cross-encoder recipe consumes the same raw retrieval records, but sets `mode transform flattens each query with its positive and negative passages, and `CrossEncoderCollator` serializes each query-passage pair for reranking. +Training uses exactly one positive passage per example: the first item in `pos_doc`. For datasets with multiple +relevant passages, either choose a canonical positive, expand the record into one example per positive, or add a +multi-positive loss/masking strategy before training. + ## Supported Input Formats NeMo Automodel supports **two** input schemas. They use different dataset factories: @@ -30,6 +34,12 @@ NeMo Automodel supports **two** input schemas. They use different dataset factor - Use `nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset` for inline JSONL where document text is stored directly in each record. 
+| Source | Query field | Required document fields | Best for | +|--------|-------------|--------------------------|----------| +| Corpus ID JSON | `question` | `pos_doc`, `neg_doc`, and `corpus_id` IDs resolved through a local corpus | Production data, hard-negative mining, same-document masking | +| `hf://` AutoModel schema | `question` | `pos_doc`, optional `neg_doc`, and a companion HF corpus split | Tutorial runs and shared AutoModel retrieval datasets | +| Inline JSONL | `query` or `question` | Inline text in `pos_doc` and `neg_doc` | Small custom runs when you do not need mining or document-ID masking | + ### Corpus ID-Based JSON (Merlin/NeMo-Retriever Style) This is the format used by NeMo retriever pipelines where documents live in a separate **corpus** and training examples reference documents by **ID**. @@ -55,7 +65,9 @@ This is the format used by NeMo retriever pipelines where documents live in a se **Corpus requirements** -Each corpus directory must contain a `merlin_metadata.json` file. +Each corpus directory must contain a `merlin_metadata.json` file and a Hugging Face-loadable `train` split with at least +`id` and `text` columns. For `class: TextQADataset`, AutoModel calls `datasets.load_dataset(<corpus_dir>)["train"]`, +then resolves `pos_doc` and `neg_doc` IDs against that split. Minimal example: @@ -63,11 +75,44 @@ Minimal example: { "class": "TextQADataset", "corpus_id": "wiki_corpus" } ``` +Minimal local layout: + +```text +retrieval-data/ + train.json + wiki_corpus/ + merlin_metadata.json + train.parquet # or another load_dataset-compatible train split with id,text columns +``` + +The `corpus_id` in `merlin_metadata.json` must match the `corpus_id` in each training record. Relative corpus paths in +`train.json` are resolved relative to the JSON file. + :::{note} - `pos_doc` and `neg_doc` can be lists of `{"id": ...}` dicts or raw IDs (they are normalized internally). +- Training uses `pos_doc[0]` as the positive.
Additional positives are ignored unless you expand the data before + training. - If you set `use_dataset_instruction: true`, optional fields like `query_instruction` and `passage_instruction` in `merlin_metadata.json` are surfaced to the collator. ::: +### Hugging Face `hf://` Sources + +Direct `hf://` loading expects the AutoModel retrieval schema, not arbitrary Hugging Face retrieval datasets. The URI +format is: + +```text +hf://<org>/<dataset>/<subset> +``` + +Each subset must provide: + +- `<subset>/dataset_metadata.json` with `corpus_id` metadata and `ids_only: false` +- a `<subset>_corpus` train split with `id` and `text` columns +- a `<subset>` train split with `question` and `pos_doc`; `neg_doc` may be absent but must be available before training + with `n_passages > 1` + +Datasets with BEIR, DPR, MS MARCO, MIRACL, or other layouts need a preprocessing step before direct `hf://` loading. + ### Inline-Text JSONL (No Corpus Required) This is convenient for custom fine-tuning pipelines where the documents are included **inline**. @@ -84,7 +129,9 @@ This is convenient for custom fine-tuning pipelines where the documents are incl - `pos_doc` and `neg_doc` can be either: - strings (interpreted as document text), or - lists of strings, or - - dicts with at least `text` (optionally `image`, `nr_ocr`) for multimodal use cases. + - dicts with at least `text`. +- The current LLM retrieval collators tokenize text only. Do not rely on inline `image` or OCR fields unless you add a + custom preprocessing and collator path. - If `corpus_id` is not provided, it defaults to `__inline__`. - `use_dataset_instruction: true` has no effect for pure inline records (instructions come from corpus metadata).
::: @@ -98,6 +145,7 @@ dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader dataset: _target_: nemo_automodel.components.datasets.llm.make_retrieval_dataset + model_type: bi_encoder data_dir_list: - /abs/path/to/train.json # or hf://nvidia/embed-nemotron-dataset-v1/FEVER data_type: train @@ -120,6 +168,7 @@ dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader dataset: _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset + model_type: bi_encoder data_dir_list: - /abs/path/to/train.jsonl data_type: train @@ -134,10 +183,29 @@ dataloader: pad_to_multiple_of: 8 ``` -For cross-encoder training, keep the same dataset factory and set `model_type: cross_encoder`, then replace the collator -with `nemo_automodel.components.datasets.llm.CrossEncoderCollator`. +For cross-encoder training, keep the same dataset factory, set `model_type: cross_encoder`, and use +`CrossEncoderCollator` arguments: + +```yaml +dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + dataset: + _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset + model_type: cross_encoder + data_dir_list: + - /abs/path/to/train.jsonl + data_type: train + n_passages: 5 + do_shuffle: true + collate_fn: + _target_: nemo_automodel.components.datasets.llm.CrossEncoderCollator + rerank_max_length: 512 + prompt_template: "question:{query} \n \n passage:{passage}" + pad_to_multiple_of: 8 +``` ## Requirements - `pos_doc` must be **non-empty**. +- `neg_doc` must be present. It may be empty only when `n_passages: 1`. - If training requests negatives (e.g., `n_passages > 1`), `neg_doc` must contain **at least one** document. 
diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 4f8c075905..d113c94a34 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -92,13 +92,28 @@ query-passage pair into one sequence and predicts a score for each candidate pas ## Prepare Data -Use the retrieval dataset format described in [Retrieval Dataset](retrieval-dataset.md). Each record needs: +Use the retrieval dataset format described in [Retrieval Dataset](retrieval-dataset.md). Choose the data path that +matches the workflow you need: -- a query, stored as `question` or `query` -- at least one positive passage in `pos_doc` -- negatives in `neg_doc` when `n_passages > 1` +| Data path | Use when | Notes | +|-----------|----------|-------| +| `hf://` AutoModel retrieval schema | You want a tutorial run or shared public dataset | Requires an AutoModel-style HF subset with a companion corpus split. | +| Inline JSONL | You want a small custom run without hard-negative mining | Documents are embedded directly in each record; no document IDs are available for same-document masking. | +| Corpus ID-based JSON | You need hard-negative mining, reusable corpora, or same-document masking | Records reference document IDs in a local corpus that can be loaded by Hugging Face `datasets`. | -For quick custom experiments, inline JSONL is the simplest format. 
Use the inline dataset factory for these files: +The key field requirements differ by source: + +| Source | Required query field | Required document fields | +|--------|----------------------|--------------------------| +| Corpus ID JSON | `question` | `question_id`, `corpus_id`, non-empty `pos_doc`, and present `neg_doc` | +| `hf://` AutoModel schema | `question` | non-empty `pos_doc`; `neg_doc` is optional in the source but required before training with negatives | +| Inline JSONL | `query` or `question` | non-empty `pos_doc`, and present `neg_doc` | + +`neg_doc` must be present for local JSON and JSONL sources. It may be `[]` only when `n_passages: 1`; when +`n_passages > 1`, provide at least one negative. + +For quick custom experiments, inline JSONL is the simplest format. Use the inline dataset factory for these files, and +switch to corpus ID-based JSON before hard-negative mining: ```json {"query":"What does NVLink do?","pos_doc":"NVLink is a high-bandwidth GPU interconnect.","neg_doc":["CUDA is a programming model.","Tensor Cores accelerate matrix math."]} @@ -115,8 +130,10 @@ data_dir_list: - hf://nvidia/embed-nemotron-dataset-v1/SyntheticClassificationData ``` -`n_passages` controls how many passages are sampled for each query. For example, `n_passages: 5` means one positive -and four negatives. If a record has fewer negatives than requested, negatives are repeated to fill the group. +`n_passages` controls the size of each query group. For example, `n_passages: 5` means one positive and four negatives. +Training uses the first item in `pos_doc` as the positive, then takes negatives from `neg_doc` in order. If a record has +fewer negatives than requested, negatives are repeated cyclically to fill the group. Treat repetition as a fallback for +shape compatibility; for real training and validation, prefer enough distinct negatives or lower `n_passages`. 
## Minimal Config Anatomy @@ -197,7 +214,8 @@ distributed: ``` For a cross-encoder, change `recipe`, `model._target_`, `dataloader.dataset.model_type`, and `dataloader.collate_fn` -to the cross-encoder values shown below. +to the cross-encoder values shown below. Also set `model.num_labels: 1`, keep `model.temperature`, and replace +`q_max_len` / `p_max_len` with `rerank_max_length` in the collator. ## Configure a Bi-Encoder @@ -252,10 +270,11 @@ Important knobs: - `query_prefix` and `passage_prefix`: add task-specific text before tokenization. Keep these aligned between training, hard-negative mining, and inference. - `do_distributed_inbatch_negative`: optional model setting that treats passages from other data-parallel ranks as - additional negatives. It is useful for larger distributed runs and uses document IDs from corpus-backed datasets to - avoid treating the same document as a negative for itself. It all-gathers passage embeddings across data-parallel - ranks, so expect extra communication and score-matrix memory. Use corpus-backed data when you need same-document - masking, and keep it disabled for ColBERT-style pooling. + additional negatives. Enable it with `model.do_distributed_inbatch_negative: true` or the CLI override + `--model.do_distributed_inbatch_negative true`. Today it all-gathers over the default process group, so use it only + for pure DP/FSDP retrieval runs (`tp_size: 1`, `cp_size: 1`). Same-document masking requires `doc_id` fields from + corpus-backed or custom datasets; inline JSONL does not provide duplicate-document masking. Keep it disabled for + ColBERT-style pooling. The complete example is [`examples/retrieval/bi_encoder/llama3_2_1b.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/bi_encoder/llama3_2_1b.yaml). 
@@ -414,8 +433,8 @@ torchrun --nproc_per_node=8 examples/retrieval/data_utils/mine_hard_negatives.py --mining.train_qa_file_path /path/to/input.json \ --mining.train_file_output_path /path/to/output.json \ --mining.cache_embeddings_dir /path/to/cache \ - --mining.query_prefix "query:" \ - --mining.passage_prefix "passage:" \ + --mining.query_prefix "query: " \ + --mining.passage_prefix "passage: " \ --mining.query_max_length 512 \ --mining.passage_max_length 512 ``` @@ -428,14 +447,19 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: - `hard_negatives_to_mine`: number of negatives to add per query. - `hard_neg_margin` and `hard_neg_margin_type`: filter near-positive candidates. -- `query_prefix` and `passage_prefix`: keep these consistent with the bi-encoder training config. +- `query_prefix` and `passage_prefix`: keep these semantically consistent with the bi-encoder training config. The + miner concatenates prefixes directly, while `BiEncoderCollator` inserts a space after non-empty prefixes; include the + trailing space in mining prefixes. - `query_max_length` and `passage_max_length`: keep these consistent with training unless you intentionally change truncation. - `use_negatives_from_file`: include existing negatives from the input file when mining. - `cache_embeddings_dir`: required for distributed mining so ranks can share cached passage embeddings. Rank `0` assembles the final embedding cache and score outputs, so plan memory and local disk accordingly. -Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. +Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. Hard +negative mining excludes known positives by document ID, but it cannot know every semantically relevant duplicate unless +your qrels/corpus encode that relationship. 
Deduplicate the corpus, exclude all known positives, inspect mined samples, +and avoid mining from validation or test corpora. ## Save, Resume, and Use the Checkpoint From 5a9f27d296cc0668803f6d6737ddaf7d2b427448 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 16:39:18 +0100 Subject: [PATCH 09/25] docs(retrieval): document runtime caveats Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/conf.py | 2 +- docs/guides/llm/retrieval-finetuning.md | 103 +++++++++++++----- .../retrieval/bi_encoder/llama3_2_1b.yaml | 4 +- .../retrieval/cross_encoder/llama3_2_1b.yaml | 2 +- 4 files changed, 82 insertions(+), 29 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index ad9e002958..8b1ed5ce75 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -50,7 +50,7 @@ ] templates_path = ["_templates"] -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "documentation.md"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "documentation.md", "guides/llm/retrieval-dataset-2.md"] # -- Options for MyST Parser (Markdown) -------------------------------------- # MyST Parser settings diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index d113c94a34..8428394bbf 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -42,33 +42,35 @@ Before running the examples: your environment can download. - Make sure every rank can read the dataset paths or `hf://` sources. -Run the Llama 3.2 1B bi-encoder example: +The commands below use `automodel`; if you are running from a source checkout, prefix them with `uv run`. 
+ +Start with a one-GPU smoke test: ```bash -automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 8 +automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 1 \ + --dist_env.timeout_minutes 30 \ + --step_scheduler.global_batch_size 4 \ + --step_scheduler.local_batch_size 1 \ + --step_scheduler.max_steps 10 \ + --dataloader.dataset.max_train_samples 40 ``` -Run the matching cross-encoder example: +The first artifact to check is `training.jsonl` under `checkpoint.checkpoint_dir`. JSONL metrics are buffered, so +stdout/stderr are still the best live signal during a very short run. + +Scale the Llama 3.2 1B bi-encoder example to the GPUs on your machine: ```bash -automodel examples/retrieval/cross_encoder/llama3_2_1b.yaml --nproc-per-node 8 +automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 8 ``` -Adjust `--nproc-per-node` to the number of GPUs on your machine. The examples use FSDP2 and bfloat16 by default. If the -first launch downloads a model, reads from a slow filesystem, or runs across multiple nodes, increase -`dist_env.timeout_minutes` beyond the short example default. - -:::{tip} -For a small smoke test, override the schedule and sample count from the command line: +Run the matching cross-encoder example: ```bash -automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 1 \ - --step_scheduler.global_batch_size 4 \ - --step_scheduler.local_batch_size 1 \ - --step_scheduler.max_steps 10 \ - --dataloader.dataset.max_train_samples 40 +automodel examples/retrieval/cross_encoder/llama3_2_1b.yaml --nproc-per-node 8 ``` -::: + +Adjust `--nproc-per-node` to the number of GPUs on your machine. The examples use FSDP2 and bfloat16 by default. ## Choose a Recipe @@ -334,8 +336,26 @@ The complete example is ## Distributed Launch and Batch Size -Launch examples with `automodel --nproc-per-node `. 
The retrieval recipes support data-parallel -training through the configured distributed strategy; pipeline parallelism is not supported for encoder recipes today. +Launch single-node examples with `automodel <config.yaml> --nproc-per-node <num_gpus>`. The retrieval recipes support +data-parallel training through the configured distributed strategy; pipeline parallelism is not supported for encoder +recipes today. + +For multi-node runs, launch with your cluster launcher or an external `torchrun` command so every node has an explicit +rank and rendezvous endpoint: + +```bash +torchrun \ + --nnodes 2 \ + --nproc-per-node 8 \ + --node-rank ${NODE_RANK} \ + --rdzv-backend c10d \ + --rdzv-endpoint ${MASTER_ADDR}:${MASTER_PORT} \ + -m nemo_automodel.cli.app examples/retrieval/bi_encoder/llama3_2_1b.yaml +``` + +Use a shared or pre-populated Hugging Face cache on every node, make dataset paths visible to every rank, and use a +unique `checkpoint_dir` for each experiment. Increase `dist_env.timeout_minutes` for first model downloads, slow shared +filesystems, multi-node collectives, or large checkpoint writes. The step scheduler computes gradient accumulation from: @@ -344,9 +364,17 @@ gradient_accumulation_steps = global_batch_size / (local_batch_size * data_paral ``` `global_batch_size` must be divisible by `local_batch_size * data_parallel_size`, and the result must be at least `1`. -For memory pressure, reduce `step_scheduler.local_batch_size` first, then sequence lengths (`q_max_len`, `p_max_len`, or -`rerank_max_length`), then `n_passages`. Bi-encoders scale memory with query length plus `n_passages` passage sequences; -cross-encoders scale with `n_passages` combined query-passage sequences. +In pure data parallelism, `data_parallel_size` is the total GPU count. With tensor or context parallelism enabled, it is +`world_size / (tp_size * cp_size)`.
For example, two 8-GPU nodes with `tp_size: 1` and `cp_size: 1` have +`data_parallel_size: 16`; with `tp_size: 2`, they have `data_parallel_size: 8`. + +`local_batch_size` is the number of query groups per rank. For memory pressure, reduce +`step_scheduler.local_batch_size` first, then sequence lengths (`q_max_len`, `p_max_len`, or `rerank_max_length`), then +`n_passages`. Bi-encoders scale memory with query length plus `local_batch_size * n_passages` passage sequences; +cross-encoders scale with `local_batch_size * n_passages` combined query-passage sequences. + +Current retrieval datasets are map-style datasets loaded in each process, not streaming distributed inputs. Pre-cache +HF data on each node or use a shared cache, and budget CPU RAM and local disk per rank for corpus-backed datasets. ## Add Validation @@ -384,15 +412,32 @@ metrics. For cross-encoder validation, use `model_type: cross_encoder` and `Cros tail -n 5 ./output/llama3_2_1b_encoder/checkpoints/validation.jsonl ``` +## Evaluate Retrieval Quality + +Candidate-group validation is a smoke test for the training objective. To decide whether a bi-encoder is useful for RAG +candidate generation, evaluate against a fixed held-out corpus and qrels: + +1. Encode corpus passages with the same tokenizer, pooling, normalization, passage prefix, and `p_max_len` used in + training. +2. Build an ANN or exact top-k index. With `l2_normalize: true`, use inner product or cosine similarity. +3. Encode held-out queries with the matching query prefix and `q_max_len`. +4. Report full-corpus Recall@K, MRR@K, and nDCG@K for the K values your application uses. + +For cross-encoders, freeze a first-stage retriever, rerank its top-K candidates, and report reranking metrics on that +same candidate set. Do not compare cross-encoder candidate-group validation directly to full-corpus bi-encoder metrics. + ## Monitor Training -Training metrics are written to `training.jsonl` under `checkpoint.checkpoint_dir`. 
Watch `loss`, `grad_norm`, learning -rate, GPU memory, and step time during smoke tests before scaling to a longer run: +Training metrics are written to `training.jsonl` under `checkpoint.checkpoint_dir`. The file logger buffers records +before flushing, so `tail -f` is useful for completed or longer runs but may not update during a short smoke test: ```bash tail -f ./output/llama3_2_1b_encoder/checkpoints/training.jsonl ``` +Use stdout/stderr as the live per-step signal today. Watch `loss`, `grad_norm`, learning rate, GPU memory, and step time +before scaling to a longer run. + The examples include a commented `wandb` block. Enable it when you want remote tracking, and tune `step_scheduler.log_remote_every_steps` to control remote logging cadence. @@ -494,7 +539,13 @@ checkpoint: restore_from: LATEST ``` -Use a unique `checkpoint_dir` per experiment unless you intentionally want to resume an existing run. +If `checkpoint.restore_from` is omitted, AutoModel still auto-detects the latest compatible checkpoint in +`checkpoint_dir` and resumes from it. Use a new or empty `checkpoint_dir` for fresh experiments, and rotate or clear +`training.jsonl` and `validation.jsonl` if you do not want logs from multiple runs appended together. + +When `checkpoint.is_async: true`, the `LATEST` symlink can lag the most recent write at job end. For final mining, +export, or evaluation workflows, prefer the explicit `epoch_*_step_*` checkpoint directory or keep async checkpointing +disabled for the final save. ## Use the Model @@ -515,7 +566,9 @@ jointly, so they are usually too expensive for first-stage full-corpus search. | OOM at startup or first batch | Lower `local_batch_size`, `q_max_len`, `p_max_len`, or `rerank_max_length`; use LoRA for larger backbones. | | Distributed launch times out | Increase `dist_env.timeout_minutes`, especially for first model downloads, slow filesystems, or multi-node runs. 
| | Batch-size assertion fails | Set `global_batch_size` to a multiple of `local_batch_size * data_parallel_size`. | -| Different mining and training behavior | Match tokenizer settings, prefixes, and max lengths across training and mining. | +| `training.jsonl` does not update during a smoke test | Use stdout/stderr for live monitoring; JSONL metrics are buffered before flush. | +| Run resumes unexpectedly | Use a new or empty `checkpoint_dir`; AutoModel auto-detects compatible checkpoints when `restore_from` is omitted. | +| Different mining and training behavior | Match tokenizer settings, max lengths, and prefix text including trailing spaces across training and mining. | ## Related Files diff --git a/examples/retrieval/bi_encoder/llama3_2_1b.yaml b/examples/retrieval/bi_encoder/llama3_2_1b.yaml index c51841ce70..a2c90b033a 100644 --- a/examples/retrieval/bi_encoder/llama3_2_1b.yaml +++ b/examples/retrieval/bi_encoder/llama3_2_1b.yaml @@ -31,7 +31,7 @@ step_scheduler: dist_env: backend: nccl - timeout_minutes: 1 + timeout_minutes: 30 model: _target_: nemo_automodel.NeMoAutoModelBiEncoder.from_pretrained @@ -143,4 +143,4 @@ distributed: # project: # entity: # name: -# save_dir: \ No newline at end of file +# save_dir: diff --git a/examples/retrieval/cross_encoder/llama3_2_1b.yaml b/examples/retrieval/cross_encoder/llama3_2_1b.yaml index 58a87d98f2..25c7b0fd5b 100644 --- a/examples/retrieval/cross_encoder/llama3_2_1b.yaml +++ b/examples/retrieval/cross_encoder/llama3_2_1b.yaml @@ -29,7 +29,7 @@ step_scheduler: dist_env: backend: nccl - timeout_minutes: 1 + timeout_minutes: 30 model: _target_: nemo_automodel.NeMoAutoModelCrossEncoder.from_pretrained From 7ee29ca55bdf38645637b5f30b91af79f8c19458 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 16:59:27 +0100 Subject: [PATCH 10/25] docs(retrieval): address review followups Signed-off-by: Oliver Holworthy 
<1216955+oliverholworthy@users.noreply.github.com> --- AGENTS.md | 10 +++--- docs/guides/dataset-overview.md | 19 +++++++---- docs/guides/llm/retrieval-dataset.md | 22 +++++++++++-- docs/guides/llm/retrieval-finetuning.md | 33 +++++++++++++++---- .../llama_embed_nemotron_8b/README.md | 3 +- skills/build-and-dependency/SKILL.md | 24 +++++--------- 6 files changed, 74 insertions(+), 37 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 1d2a2e648f..1b822a42da 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -51,10 +51,10 @@ hand. ## Architecture Overview ``` -automodel -c +automodel [--nproc-per-node N] [--key.subkey value] | v -_cli/app.py -- routes command + domain to recipe scripts +cli/app.py -- loads YAML config and instantiates the configured recipe | v recipes/ -- main training / eval entry points @@ -86,9 +86,9 @@ _diffusers/ -- diffusion pipeline wrapper ### Entry Point -`_cli/app.py` parses `automodel ` and dispatches to the -matching recipe script. The `-c` flag points to a YAML config that drives all -component construction. +`cli/app.py` parses `automodel `, loads the YAML config, and +instantiates the configured recipe. CLI overrides such as `--model.name value` +are applied to the config before construction. ### Recipes diff --git a/docs/guides/dataset-overview.md b/docs/guides/dataset-overview.md index 00dfed81a6..0fee8936a9 100644 --- a/docs/guides/dataset-overview.md +++ b/docs/guides/dataset-overview.md @@ -238,17 +238,24 @@ See the [Function Calling guide](llm/toolcalling.md) for an end-to-end example w For a small reasoning-style chat SFT starting point, see [qwen2_5_0p5b_instruct_fineproofs_chat.yaml](../../examples/llm_finetune/qwen/qwen2_5_0p5b_instruct_fineproofs_chat.yaml). 
### Retrieval (Embedding Fine-Tuning) -- Factory: `nemo_automodel.components.datasets.llm.make_retrieval_dataset` -- Collator: `nemo_automodel.components.datasets.llm.BiEncoderCollator` +- Factory for corpus ID JSON and `hf://` AutoModel-schema sources: + `nemo_automodel.components.datasets.llm.make_retrieval_dataset` +- Factory for inline JSONL: + `nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset` +- Collators: `nemo_automodel.components.datasets.llm.BiEncoderCollator` or + `nemo_automodel.components.datasets.llm.CrossEncoderCollator` - Use case: embedding model fine-tuning with (query, positive doc, negative docs) contrastive learning -- Supported schemas: +- Supported retrieval sources: - Corpus-ID JSON (Merlin/NeMo-retriever style) + - `hf://` sources that already follow the AutoModel retrieval schema - Inline-text JSONL (e.g., `{"query": "...", "pos_doc": "...", "neg_doc": ["...", "..."]}`) -- Example YAML: +- Inline JSONL example: ```yaml dataset: - _target_: nemo_automodel.components.datasets.llm.make_retrieval_dataset - data_dir_list: /abs/path/to/train.jsonl + _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset + model_type: bi_encoder + data_dir_list: + - /abs/path/to/train.jsonl data_type: train n_passages: 5 collate_fn: diff --git a/docs/guides/llm/retrieval-dataset.md b/docs/guides/llm/retrieval-dataset.md index e1a903bb88..1a369076bf 100644 --- a/docs/guides/llm/retrieval-dataset.md +++ b/docs/guides/llm/retrieval-dataset.md @@ -15,6 +15,8 @@ The dataset factory `nemo_automodel.components.datasets.llm.make_retrieval_datas - `question`: query string - `doc_text`: list of document texts in the order `[positive, negative_1, negative_2, ...]` - `doc_image`: list of images (or empty strings), aligned with `doc_text` +- `doc_id`: list of document IDs aligned with `doc_text` for corpus-backed and `hf://` sources. 
Pure inline JSONL + does not provide real document IDs for duplicate-document masking unless you add them in a custom preprocessing path. - `query_instruction` / `passage_instruction`: optional, used when `use_dataset_instruction: true` and the corpus provides instructions via metadata @@ -24,11 +26,12 @@ query-passage pair for reranking. Training uses exactly one positive passage per example: the first item in `pos_doc`. For datasets with multiple relevant passages, either choose a canonical positive, expand the record into one example per positive, or add a -multi-positive loss/masking strategy before training. +multi-positive loss/masking strategy before training. Keep the full set of known positives in your qrels or corpus +metadata for evaluation and false-negative filtering, even when each training row uses one positive. ## Supported Input Formats -NeMo Automodel supports **two** input schemas. They use different dataset factories: +NeMo Automodel supports **two** input schemas across three source types. They use different dataset factories: - Use `nemo_automodel.components.datasets.llm.make_retrieval_dataset` for corpus ID-based JSON and `hf://` sources. - Use `nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset` for inline JSONL where @@ -40,6 +43,17 @@ NeMo Automodel supports **two** input schemas. They use different dataset factor | `hf://` AutoModel schema | `question` | `pos_doc`, optional `neg_doc`, and a companion HF corpus split | Tutorial runs and shared AutoModel retrieval datasets | | Inline JSONL | `query` or `question` | Inline text in `pos_doc` and `neg_doc` | Small custom runs when you do not need mining or document-ID masking | +Separate qrels files are not consumed directly by the training dataset factory. Convert qrels-style data into retrieval +records before training: + +1. Put every passage in a corpus split with stable `id` and `text` values. +2. 
For each query, write one or more training records with `question_id`, `question`, `corpus_id`, `pos_doc`, and + `neg_doc`. +3. Use the first relevant document in each record as `pos_doc[0]`; expand multi-positive queries into multiple records + if you want every positive to become a supervised positive. +4. Preserve the complete qrels separately for full-corpus evaluation and for excluding all known positives during + negative mining. + ### Corpus ID-Based JSON (Merlin/NeMo-Retriever Style) This is the format used by NeMo retriever pipelines where documents live in a separate **corpus** and training examples reference documents by **ID**. @@ -161,6 +175,10 @@ dataloader: pad_to_multiple_of: 8 ``` +For corpus ID JSON and `hf://` sources, `do_shuffle: true` shuffles rows only when `max_train_samples` is set before +subsampling. Training order is controlled by the dataloader or distributed sampler. For inline JSONL, `do_shuffle: true` +shuffles the loaded rows directly. + Use the inline dataset factory for inline JSONL: ```yaml diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 8428394bbf..5b3846ec3c 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -39,7 +39,10 @@ Before running the examples: for multi-GPU runs. - From a source checkout, use `uv run automodel ...`; from an installed environment, use `automodel ...`. - Accept access terms for the configured Hugging Face model and set `HF_TOKEN`, or replace the model path with a model - your environment can download. + your environment can download. Retrieval has custom bidirectional backbones for Llama and Ministral3 embedding + models, and a custom Llama scoring backbone for cross-encoders. 
Other Hugging Face model types fall back to + `AutoModel` for bi-encoders or `AutoModelForSequenceClassification` for cross-encoders; verify that the tokenizer, + pooling, `num_labels`, and any retrieval-specific model arguments are accepted by the replacement model. - Make sure every rank can read the dataset paths or `hf://` sources. The commands below use `automodel`; if you are running from a source checkout, prefix them with `uv run`. @@ -115,13 +118,19 @@ The key field requirements differ by source: `n_passages > 1`, provide at least one negative. For quick custom experiments, inline JSONL is the simplest format. Use the inline dataset factory for these files, and -switch to corpus ID-based JSON before hard-negative mining: +switch to corpus ID-based JSON before hard-negative mining or full-corpus evaluation: ```json {"query":"What does NVLink do?","pos_doc":"NVLink is a high-bandwidth GPU interconnect.","neg_doc":["CUDA is a programming model.","Tensor Cores accelerate matrix math."]} {"query":"What is retrieval augmented generation?","pos_doc":"RAG grounds generation by retrieving relevant context.","neg_doc":["Beam search expands candidate tokens.","Dropout regularizes training."]} ``` +To migrate inline data to corpus ID JSON, assign a stable document ID to each unique passage, write those passages into +a corpus split with `id` and `text` columns, then replace inline `pos_doc` and `neg_doc` strings with those IDs. Keep +all known positives for each query in your qrels or source metadata, even if each training row uses only the first +positive. Otherwise, in-batch negatives and mined hard negatives can accidentally treat another relevant passage as a +negative. + For larger corpora, use the corpus ID-based JSON format from the dataset guide. 
Use `nemo_automodel.components.datasets.llm.make_retrieval_dataset` for corpus ID-based JSON and for `hf://` sources that already follow the AutoModel retrieval schema, such as: @@ -137,6 +146,9 @@ Training uses the first item in `pos_doc` as the positive, then takes negatives fewer negatives than requested, negatives are repeated cyclically to fill the group. Treat repetition as a fallback for shape compatibility; for real training and validation, prefer enough distinct negatives or lower `n_passages`. +The training recipe does not load a separate qrels file. Materialize qrels into training records before fine-tuning, and +keep the original qrels for offline Recall@K, MRR@K, and nDCG@K evaluation. + ## Minimal Config Anatomy This minimal bi-encoder config shows the pieces that must be present in a runnable retrieval fine-tuning job. The @@ -423,20 +435,27 @@ candidate generation, evaluate against a fixed held-out corpus and qrels: 3. Encode held-out queries with the matching query prefix and `q_max_len`. 4. Report full-corpus Recall@K, MRR@K, and nDCG@K for the K values your application uses. +AutoModel does not currently provide a one-command full-corpus retrieval evaluator in this guide. Use your existing IR +evaluation stack or a small script around the consolidated checkpoint and report enough run details to make the result +repeatable: query count, corpus size, qrels source, judged/unjudged handling, exact versus ANN search settings, K +values, baseline checkpoint, and whether confidence intervals or significance tests were used. + For cross-encoders, freeze a first-stage retriever, rerank its top-K candidates, and report reranking metrics on that same candidate set. Do not compare cross-encoder candidate-group validation directly to full-corpus bi-encoder metrics. ## Monitor Training -Training metrics are written to `training.jsonl` under `checkpoint.checkpoint_dir`. 
The file logger buffers records -before flushing, so `tail -f` is useful for completed or longer runs but may not update during a short smoke test: +Training metrics are written to `training.jsonl` under `checkpoint.checkpoint_dir`. The file logger buffers records in +chunks before writing and flushes the remaining records on close, so `tail -f` is useful for completed or longer runs +but may not update during a short smoke test: ```bash tail -f ./output/llama3_2_1b_encoder/checkpoints/training.jsonl ``` Use stdout/stderr as the live per-step signal today. Watch `loss`, `grad_norm`, learning rate, GPU memory, and step time -before scaling to a longer run. +before scaling to a longer run. On preempted or timed-out jobs, recent buffered JSONL metrics may be missing even when +stdout/stderr showed them. The examples include a commented `wandb` block. Enable it when you want remote tracking, and tune `step_scheduler.log_remote_every_steps` to control remote logging cadence. @@ -499,7 +518,9 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: truncation. - `use_negatives_from_file`: include existing negatives from the input file when mining. - `cache_embeddings_dir`: required for distributed mining so ranks can share cached passage embeddings. Rank `0` - assembles the final embedding cache and score outputs, so plan memory and local disk accordingly. + assembles the final embedding cache and score outputs, so plan memory and local disk accordingly. In multi-node + mining, this must be a shared writable path mounted at the same location on every node; node-local cache paths leave + rank `0` unable to read remote-rank shards. Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. 
Hard negative mining excludes known positives by document ID, but it cannot know every semantically relevant duplicate unless diff --git a/examples/retrieval/bi_encoder/llama_embed_nemotron_8b/README.md b/examples/retrieval/bi_encoder/llama_embed_nemotron_8b/README.md index 6c069461fe..c93debfd69 100644 --- a/examples/retrieval/bi_encoder/llama_embed_nemotron_8b/README.md +++ b/examples/retrieval/bi_encoder/llama_embed_nemotron_8b/README.md @@ -24,8 +24,7 @@ This script will download the dataset and prepare it for training. Run the model finetuning with the specified configuration using 8 GPUs: ```bash -torchrun --nproc-per-node=8 examples/retrieval/bi_encoder/finetune.py \ - --config examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml +automodel examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml --nproc-per-node 8 ``` The final model checkpoint in Hugging Face format will be stored in `output/llama_embed_nemotron_8b/epoch_0_step_28614/model/consolidated` diff --git a/skills/build-and-dependency/SKILL.md b/skills/build-and-dependency/SKILL.md index 0a8c199e00..3440a3cbf9 100644 --- a/skills/build-and-dependency/SKILL.md +++ b/skills/build-and-dependency/SKILL.md @@ -113,31 +113,23 @@ export HF_HOME="/path/to/hf_cache" # Hugging Face cache directory ## CLI Usage -The entry point is `automodel` (defined at `nemo_automodel._cli.app:main`). +The entry point is `automodel` (defined at `nemo_automodel.cli.app:main`). -Pattern: `automodel <command> <domain> -c <config.yaml>` +Pattern: `automodel <config.yaml> [--nproc-per-node N] [--key.subkey value]` ```bash -# LLM -automodel finetune llm -c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml -automodel pretrain llm -c config.yaml -automodel kd llm -c config.yaml -automodel benchmark llm -c config.yaml +# Local interactive launch; the YAML recipe selects LLM, VLM, diffusion, or retrieval behavior. 
+automodel examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml --nproc-per-node 8 +automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 8 -# VLM -automodel finetune vlm -c config.yaml - -# Diffusion -automodel finetune diffusion -c config.yaml - -# Retrieval -automodel finetune retrieval -c config.yaml +# External torchrun launch for clusters that set rank and rendezvous explicitly. +torchrun --nproc-per-node 8 -m nemo_automodel.cli.app examples/retrieval/bi_encoder/llama3_2_1b.yaml ``` Override any config value from the CLI: ```bash -automodel finetune llm -c config.yaml --model.name_or_path meta-llama/Llama-3.2-1B +automodel config.yaml --model.name_or_path meta-llama/Llama-3.2-1B ``` ## Common Pitfalls From c42783e9edb46dfeab7b51c971be934a2e646470 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 17:13:36 +0100 Subject: [PATCH 11/25] docs(retrieval): tighten mining guidance Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/dataset-overview.md | 26 +++++++++++-------- docs/guides/llm/retrieval-dataset-2.md | 9 +++++++ docs/guides/llm/retrieval-dataset.md | 15 ++++++----- docs/guides/llm/retrieval-finetuning.md | 23 +++++++++++----- .../retrieval/data_utils/mining_config.yaml | 8 ++++-- 5 files changed, 55 insertions(+), 26 deletions(-) create mode 100644 docs/guides/llm/retrieval-dataset-2.md diff --git a/docs/guides/dataset-overview.md b/docs/guides/dataset-overview.md index 0fee8936a9..f45135a3b8 100644 --- a/docs/guides/dataset-overview.md +++ b/docs/guides/dataset-overview.md @@ -251,17 +251,21 @@ For a small reasoning-style chat SFT starting point, see [qwen2_5_0p5b_instruct_ - Inline-text JSONL (e.g., `{"query": "...", "pos_doc": "...", "neg_doc": ["...", "..."]}`) - Inline JSONL example: ```yaml -dataset: - _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset - 
model_type: bi_encoder - data_dir_list: - - /abs/path/to/train.jsonl - data_type: train - n_passages: 5 -collate_fn: - _target_: nemo_automodel.components.datasets.llm.BiEncoderCollator - q_max_len: 512 - p_max_len: 512 +dataloader: + _target_: torchdata.stateful_dataloader.StatefulDataLoader + dataset: + _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset + model_type: bi_encoder + data_dir_list: + - /abs/path/to/train.jsonl + data_type: train + n_passages: 5 + collate_fn: + _target_: nemo_automodel.components.datasets.llm.BiEncoderCollator + q_max_len: 512 + p_max_len: 512 + batch_size: 2 + shuffle: true ``` See the detailed guide, [Retrieval dataset](llm/retrieval-dataset.md), for more information. diff --git a/docs/guides/llm/retrieval-dataset-2.md b/docs/guides/llm/retrieval-dataset-2.md new file mode 100644 index 0000000000..c06dbc4875 --- /dev/null +++ b/docs/guides/llm/retrieval-dataset-2.md @@ -0,0 +1,9 @@ +# Retrieval Dataset Draft Replaced + +This file previously contained a stale draft with obsolete retrieval API names. Do not use it for configuration +examples. + +Use the maintained guides instead: + +- [Retrieval Dataset](retrieval-dataset.md) +- [Retrieval Fine-Tuning](retrieval-finetuning.md) diff --git a/docs/guides/llm/retrieval-dataset.md b/docs/guides/llm/retrieval-dataset.md index 1a369076bf..9e0e7fd48d 100644 --- a/docs/guides/llm/retrieval-dataset.md +++ b/docs/guides/llm/retrieval-dataset.md @@ -40,7 +40,7 @@ NeMo Automodel supports **two** input schemas across three source types. 
They us | Source | Query field | Required document fields | Best for | |--------|-------------|--------------------------|----------| | Corpus ID JSON | `question` | `pos_doc`, `neg_doc`, and `corpus_id` IDs resolved through a local corpus | Production data, hard-negative mining, same-document masking | -| `hf://` AutoModel schema | `question` | `pos_doc`, optional `neg_doc`, and a companion HF corpus split | Tutorial runs and shared AutoModel retrieval datasets | +| `hf://` AutoModel schema | `question` | `pos_doc`, a companion HF corpus split, and `neg_doc` before training with `n_passages > 1` | Tutorial runs and shared AutoModel retrieval datasets | | Inline JSONL | `query` or `question` | Inline text in `pos_doc` and `neg_doc` | Small custom runs when you do not need mining or document-ID masking | Separate qrels files are not consumed directly by the training dataset factory. Convert qrels-style data into retrieval @@ -49,10 +49,12 @@ records before training: 1. Put every passage in a corpus split with stable `id` and `text` values. 2. For each query, write one or more training records with `question_id`, `question`, `corpus_id`, `pos_doc`, and `neg_doc`. -3. Use the first relevant document in each record as `pos_doc[0]`; expand multi-positive queries into multiple records - if you want every positive to become a supervised positive. -4. Preserve the complete qrels separately for full-corpus evaluation and for excluding all known positives during - negative mining. +3. For training, use the first relevant document in each record as `pos_doc[0]`; expand multi-positive queries into + multiple records if you want every positive to become a supervised positive. +4. For hard-negative mining, include all known positive document IDs for that query in the row's `pos_doc`. The miner + excludes only IDs present in the input row, not an external qrels file. +5. 
Preserve the complete qrels separately for full-corpus evaluation and audit mined negatives against them before + reusing the output for training. ### Corpus ID-Based JSON (Merlin/NeMo-Retriever Style) @@ -225,5 +227,6 @@ dataloader: ## Requirements - `pos_doc` must be **non-empty**. -- `neg_doc` must be present. It may be empty only when `n_passages: 1`. +- `neg_doc` must be present in local JSON and JSONL training records. It may be empty only when `n_passages: 1`. +- `hf://` sources may omit `neg_doc` in the source dataset, but add negatives before training with `n_passages > 1`. - If training requests negatives (e.g., `n_passages > 1`), `neg_doc` must contain **at least one** document. diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 5b3846ec3c..7c4462880e 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -146,8 +146,11 @@ Training uses the first item in `pos_doc` as the positive, then takes negatives fewer negatives than requested, negatives are repeated cyclically to fill the group. Treat repetition as a fallback for shape compatibility; for real training and validation, prefer enough distinct negatives or lower `n_passages`. -The training recipe does not load a separate qrels file. Materialize qrels into training records before fine-tuning, and -keep the original qrels for offline Recall@K, MRR@K, and nDCG@K evaluation. +The training recipe does not load a separate qrels file. Materialize qrels into retrieval records before fine-tuning. +For training, `pos_doc[0]` is the supervised positive. For mining, keep every known positive for the query in `pos_doc` +so the miner can exclude those IDs; it does not read an external qrels file. If you expand multi-positive queries into +one row per positive, make sure sibling positives are removed from `neg_doc` and audited out of mined negatives before +training with in-batch negatives. 
Keep the original qrels for offline Recall@K, MRR@K, and nDCG@K evaluation. ## Minimal Config Anatomy @@ -510,7 +513,9 @@ candidates, and write mined negatives back to each query. Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: - `hard_negatives_to_mine`: number of negatives to add per query. -- `hard_neg_margin` and `hard_neg_margin_type`: filter near-positive candidates. +- `hard_neg_margin` and `hard_neg_margin_type`: filter near-positive candidates. With `hard_neg_margin_type: perc`, + candidates scoring above `min_positive_score * hard_neg_margin` are removed; with `abs`, candidates scoring above + `min_positive_score - hard_neg_margin` are removed. Inspect mined samples when positive scores are low or negative. - `query_prefix` and `passage_prefix`: keep these semantically consistent with the bi-encoder training config. The miner concatenates prefixes directly, while `BiEncoderCollator` inserts a space after non-empty prefixes; include the trailing space in mining prefixes. @@ -523,9 +528,9 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: rank `0` unable to read remote-rank shards. Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. Hard -negative mining excludes known positives by document ID, but it cannot know every semantically relevant duplicate unless -your qrels/corpus encode that relationship. Deduplicate the corpus, exclude all known positives, inspect mined samples, -and avoid mining from validation or test corpora. +negative mining excludes document IDs listed in each input row's `pos_doc`, but it cannot read an external qrels file or +know every semantically relevant duplicate. Put all known positive IDs for the query in the mining input, deduplicate the +corpus, inspect mined samples, and avoid mining from validation or test corpora. 
## Save, Resume, and Use the Checkpoint @@ -542,9 +547,13 @@ checkpoint: Each save creates a versioned directory such as: ```text -./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_500/ +./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_499/ ``` +Checkpoint directory names use the scheduler step at save time. The saved scheduler state advances to the next step, so +for exact paths prefer the `Saving checkpoint to ...` log line or the `LATEST` pointer over hand-constructing a step +number. + With `save_consolidated: true`, AutoModel also writes a Hugging Face-compatible model under: ```text diff --git a/examples/retrieval/data_utils/mining_config.yaml b/examples/retrieval/data_utils/mining_config.yaml index 4d9cf80d3b..a53d4a9355 100644 --- a/examples/retrieval/data_utils/mining_config.yaml +++ b/examples/retrieval/data_utils/mining_config.yaml @@ -21,7 +21,8 @@ # --config examples/retrieval/data_utils/mining_config.yaml \ # --mining.model_name_or_path /path/to/encoder/checkpoint \ # --mining.train_qa_file_path /path/to/input.json \ -# --mining.train_file_output_path /path/to/output.json +# --mining.train_file_output_path /path/to/output.json \ +# --mining.cache_embeddings_dir /shared/path/to/cache # Distributed environment settings dist_env: @@ -33,6 +34,7 @@ dist_env: # mining.model_name_or_path: Path to encoder checkpoint # mining.train_qa_file_path: Input QA file # mining.train_file_output_path: Output file with mined negatives +# mining.cache_embeddings_dir: Required for multi-rank torchrun; optional only for single-process mining mining: # Model path - REQUIRED (override via --mining.model_name_or_path) @@ -63,12 +65,14 @@ mining: # train_qa_file_path: /path/to/input.json # train_file_output_path: /path/to/output.json - # Caching (optional) + # Caching (required for multi-rank torchrun; use a shared writable path for multi-node) # cache_embeddings_dir: /path/to/cache load_embeddings_from_cache: false # Mining parameters hard_negatives_to_mine: 20 + # 
hard_neg_margin_type=perc removes candidates scoring above min_positive_score * hard_neg_margin. + # hard_neg_margin_type=abs removes candidates scoring above min_positive_score - hard_neg_margin. hard_neg_margin: 0.95 hard_neg_margin_type: perc mining_batch_size: 128 From 441d64be01f1407a1fceb66dc3ae609df1f21a75 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 17:31:20 +0100 Subject: [PATCH 12/25] docs(retrieval): resolve final review gaps Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/conf.py | 2 +- docs/guides/dataset-overview.md | 26 ++------------ docs/guides/llm/retrieval-dataset-2.md | 9 ----- docs/guides/llm/retrieval-dataset.md | 15 +++++--- docs/guides/llm/retrieval-finetuning.md | 34 +++++++++++++------ .../retrieval/data_utils/mining_config.yaml | 34 ++++++++----------- 6 files changed, 52 insertions(+), 68 deletions(-) delete mode 100644 docs/guides/llm/retrieval-dataset-2.md diff --git a/docs/conf.py b/docs/conf.py index 8b1ed5ce75..ad9e002958 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -50,7 +50,7 @@ ] templates_path = ["_templates"] -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "documentation.md", "guides/llm/retrieval-dataset-2.md"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "documentation.md"] # -- Options for MyST Parser (Markdown) -------------------------------------- # MyST Parser settings diff --git a/docs/guides/dataset-overview.md b/docs/guides/dataset-overview.md index f45135a3b8..0b46b62e5e 100644 --- a/docs/guides/dataset-overview.md +++ b/docs/guides/dataset-overview.md @@ -62,27 +62,6 @@ dataset: ``` See the detailed guide, [Column-Mapped Text Instruction Dataset](llm/column-mapped-text-instruction-dataset.md), for more information. 
-- **ChatDataset (multi-turn conversations and tool calling)** - - Class: `nemo_automodel.components.datasets.llm.ChatDataset` - - Use case: multi-turn conversations and tool calling in OpenAI chat format - - Sources: local JSON/JSONL or Hugging Face Hub dataset ID - - Key args: - - `path_or_dataset_id`: path to local file(s) or HuggingFace dataset ID - - `tokenizer`: tokenizer instance (required. Must have chat template support) - - `split`: dataset split (e.g., "train", "validation") - - `name`: dataset configuration/subset name - - `seq_length`: maximum sequence length for padding/truncation - - `padding`: padding strategy ("do_not_pad", "max_length", etc.) - - `truncation`: truncation strategy ("do_not_truncate", "longest_first", etc.) - - `start_of_turn_token`: token marking assistant response start (for answer-only loss) - - `chat_template`: optional override for tokenizer's chat template - - `skip_invalid_samples`: if ``true``, skip malformed JSONL lines when reading local files (warnings log skip counts); default ``false`` fails fast on a bad line - - Notes: - - Requires a tokenizer with chat template support - - Supports both single-turn and multi-turn tool calling - - Tool definitions are provided in a `tools` field at the conversation level - - Tool calls appear in assistant messages via `tool_calls` field - - Tool responses use the `tool` role ### ChatDataset (Multi-Turn Conversations and Tool Calling) - Class: `nemo_automodel.components.datasets.llm.ChatDataset` - Use case: multi-turn conversations and tool calling in OpenAI chat format @@ -237,14 +216,15 @@ dataset: See the [Function Calling guide](llm/toolcalling.md) for an end-to-end example with FunctionGemma. For a small reasoning-style chat SFT starting point, see [qwen2_5_0p5b_instruct_fineproofs_chat.yaml](../../examples/llm_finetune/qwen/qwen2_5_0p5b_instruct_fineproofs_chat.yaml). 
-### Retrieval (Embedding Fine-Tuning) +### Retrieval Fine-Tuning - Factory for corpus ID JSON and `hf://` AutoModel-schema sources: `nemo_automodel.components.datasets.llm.make_retrieval_dataset` - Factory for inline JSONL: `nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset` - Collators: `nemo_automodel.components.datasets.llm.BiEncoderCollator` or `nemo_automodel.components.datasets.llm.CrossEncoderCollator` -- Use case: embedding model fine-tuning with (query, positive doc, negative docs) contrastive learning +- Use case: bi-encoder embedding fine-tuning with contrastive query/passage groups, or cross-encoder reranking over + query/passage pairs - Supported retrieval sources: - Corpus-ID JSON (Merlin/NeMo-retriever style) - `hf://` sources that already follow the AutoModel retrieval schema diff --git a/docs/guides/llm/retrieval-dataset-2.md b/docs/guides/llm/retrieval-dataset-2.md deleted file mode 100644 index c06dbc4875..0000000000 --- a/docs/guides/llm/retrieval-dataset-2.md +++ /dev/null @@ -1,9 +0,0 @@ -# Retrieval Dataset Draft Replaced - -This file previously contained a stale draft with obsolete retrieval API names. Do not use it for configuration -examples. - -Use the maintained guides instead: - -- [Retrieval Dataset](retrieval-dataset.md) -- [Retrieval Fine-Tuning](retrieval-finetuning.md) diff --git a/docs/guides/llm/retrieval-dataset.md b/docs/guides/llm/retrieval-dataset.md index 9e0e7fd48d..26dd85a570 100644 --- a/docs/guides/llm/retrieval-dataset.md +++ b/docs/guides/llm/retrieval-dataset.md @@ -26,8 +26,10 @@ query-passage pair for reranking. Training uses exactly one positive passage per example: the first item in `pos_doc`. For datasets with multiple relevant passages, either choose a canonical positive, expand the record into one example per positive, or add a -multi-positive loss/masking strategy before training. 
Keep the full set of known positives in your qrels or corpus -metadata for evaluation and false-negative filtering, even when each training row uses one positive. +multi-positive loss/masking strategy before training. If expanded rows for the same query can share a batch, keep +distributed in-batch negatives disabled unless you also prevent sibling positives from becoming negatives through +qrels-aware sampling or masking. Keep the full set of known positives in your qrels or corpus metadata for evaluation +and false-negative filtering, even when each training row uses one positive. ## Supported Input Formats @@ -53,7 +55,9 @@ records before training: multiple records if you want every positive to become a supervised positive. 4. For hard-negative mining, include all known positive document IDs for that query in the row's `pos_doc`. The miner excludes only IDs present in the input row, not an external qrels file. -5. Preserve the complete qrels separately for full-corpus evaluation and audit mined negatives against them before +5. If you expand one query into multiple positive rows, keep those sibling-positive rows out of the same in-batch-negative + training batch or use qrels-aware masking. +6. Preserve the complete qrels separately for full-corpus evaluation and audit mined negatives against them before reusing the output for training. ### Corpus ID-Based JSON (Merlin/NeMo-Retriever Style) @@ -108,7 +112,9 @@ The `corpus_id` in `merlin_metadata.json` must match the `corpus_id` in each tra - `pos_doc` and `neg_doc` can be lists of `{"id": ...}` dicts or raw IDs (they are normalized internally). - Training uses `pos_doc[0]` as the positive. Additional positives are ignored unless you expand the data before training. -- If you set `use_dataset_instruction: true`, optional fields like `query_instruction` and `passage_instruction` in `merlin_metadata.json` are surfaced to the collator. 
+- To train with corpus instructions, set `use_dataset_instruction: true` on both the dataset and the bi-encoder + collator. The dataset surfaces `query_instruction` and `passage_instruction` from `merlin_metadata.json`; the collator + prepends them before tokenization. ::: ### Hugging Face `hf://` Sources @@ -174,6 +180,7 @@ dataloader: p_max_len: 512 query_prefix: "query:" passage_prefix: "passage:" + use_dataset_instruction: false pad_to_multiple_of: 8 ``` diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 7c4462880e..c7ad153656 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -129,7 +129,7 @@ To migrate inline data to corpus ID JSON, assign a stable document ID to each un a corpus split with `id` and `text` columns, then replace inline `pos_doc` and `neg_doc` strings with those IDs. Keep all known positives for each query in your qrels or source metadata, even if each training row uses only the first positive. Otherwise, in-batch negatives and mined hard negatives can accidentally treat another relevant passage as a -negative. +negative. The detailed source schemas and conversion rules live in [Retrieval Dataset](retrieval-dataset.md). For larger corpora, use the corpus ID-based JSON format from the dataset guide. Use `nemo_automodel.components.datasets.llm.make_retrieval_dataset` for corpus ID-based JSON and for `hf://` sources that @@ -150,7 +150,9 @@ The training recipe does not load a separate qrels file. Materialize qrels into For training, `pos_doc[0]` is the supervised positive. For mining, keep every known positive for the query in `pos_doc` so the miner can exclude those IDs; it does not read an external qrels file. If you expand multi-positive queries into one row per positive, make sure sibling positives are removed from `neg_doc` and audited out of mined negatives before -training with in-batch negatives. 
Keep the original qrels for offline Recall@K, MRR@K, and nDCG@K evaluation. +training. Also keep sibling-positive rows out of the same in-batch-negative training batch, disable distributed in-batch +negatives, or add qrels-aware sampling/masking. Keep the original qrels for offline Recall@K, MRR@K, and nDCG@K +evaluation. ## Minimal Config Anatomy @@ -290,8 +292,9 @@ Important knobs: additional negatives. Enable it with `model.do_distributed_inbatch_negative: true` or the CLI override `--model.do_distributed_inbatch_negative true`. Today it all-gathers over the default process group, so use it only for pure DP/FSDP retrieval runs (`tp_size: 1`, `cp_size: 1`). Same-document masking requires `doc_id` fields from - corpus-backed or custom datasets; inline JSONL does not provide duplicate-document masking. Keep it disabled for - ColBERT-style pooling. + corpus-backed or custom datasets; inline JSONL does not provide duplicate-document masking. For multi-positive queries + expanded into separate rows, keep it disabled unless your sampler or masking prevents sibling positives from becoming + negatives. Keep it disabled for ColBERT-style pooling. The complete example is [`examples/retrieval/bi_encoder/llama3_2_1b.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/bi_encoder/llama3_2_1b.yaml). @@ -369,8 +372,10 @@ torchrun \ ``` Use a shared or pre-populated Hugging Face cache on every node, make dataset paths visible to every rank, and use a -unique `checkpoint_dir` for each experiment. Increase `dist_env.timeout_minutes` for first model downloads, slow shared -filesystems, multi-node collectives, or large checkpoint writes. +unique `checkpoint_dir` for each experiment. For multi-node training, `checkpoint_dir` must be on a shared, persistent +filesystem mounted at the same path from every node; relative `./output/...` paths are appropriate only when they resolve +to shared storage. 
Increase `dist_env.timeout_minutes` for first model downloads, slow shared filesystems, multi-node +collectives, or large checkpoint writes. The step scheduler computes gradient accumulation from: @@ -503,7 +508,8 @@ torchrun --nproc_per_node=8 examples/retrieval/data_utils/mine_hard_negatives.py --mining.query_prefix "query: " \ --mining.passage_prefix "passage: " \ --mining.query_max_length 512 \ - --mining.passage_max_length 512 + --mining.passage_max_length 512 \ + --mining.add_eos_token false ``` Hard-negative mining expects the corpus ID-based retrieval JSON format described in the dataset guide, not the inline @@ -518,14 +524,20 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: `min_positive_score - hard_neg_margin` are removed. Inspect mined samples when positive scores are low or negative. - `query_prefix` and `passage_prefix`: keep these semantically consistent with the bi-encoder training config. The miner concatenates prefixes directly, while `BiEncoderCollator` inserts a space after non-empty prefixes; include the - trailing space in mining prefixes. + trailing space in mining prefixes. The miner supports static prefixes only. If training used + `use_dataset_instruction: true`, materialize the same instruction text into the mining input or equivalent static + prefixes before mining. - `query_max_length` and `passage_max_length`: keep these consistent with training unless you intentionally change truncation. -- `use_negatives_from_file`: include existing negatives from the input file when mining. +- `add_bos_token` and `add_eos_token`: match the tokenizer behavior used during training. If omitted, mining falls back + to tokenizer defaults, which can differ from the training config. +- `use_negatives_from_file`: include existing negatives from the input file when mining. Existing negatives are prepended + to the output and mined negatives are appended, so deduplicate and audit the output before using it for training. 
- `cache_embeddings_dir`: required for distributed mining so ranks can share cached passage embeddings. Rank `0` assembles the final embedding cache and score outputs, so plan memory and local disk accordingly. In multi-node mining, this must be a shared writable path mounted at the same location on every node; node-local cache paths leave - rank `0` unable to read remote-rank shards. + rank `0` unable to read remote-rank shards. Use a fresh cache directory for each model, dataset, prefix, sequence + length, and world-size combination; otherwise, stale cache files that are already present may be reused. Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. Hard negative mining excludes document IDs listed in each input row's `pos_doc`, but it cannot read an external qrels file or @@ -557,7 +569,7 @@ number. With `save_consolidated: true`, AutoModel also writes a Hugging Face-compatible model under: ```text -./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_500/model/consolidated/ +./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated/ ``` The `LATEST` symlink points to the most recent checkpoint. 
To resume from the latest compatible checkpoint, set: diff --git a/examples/retrieval/data_utils/mining_config.yaml b/examples/retrieval/data_utils/mining_config.yaml index a53d4a9355..a4de105e90 100644 --- a/examples/retrieval/data_utils/mining_config.yaml +++ b/examples/retrieval/data_utils/mining_config.yaml @@ -22,7 +22,8 @@ # --mining.model_name_or_path /path/to/encoder/checkpoint \ # --mining.train_qa_file_path /path/to/input.json \ # --mining.train_file_output_path /path/to/output.json \ -# --mining.cache_embeddings_dir /shared/path/to/cache +# --mining.cache_embeddings_dir /shared/path/to/cache/ \ +# --mining.add_eos_token false # Distributed environment settings dist_env: @@ -34,7 +35,8 @@ dist_env: # mining.model_name_or_path: Path to encoder checkpoint # mining.train_qa_file_path: Input QA file # mining.train_file_output_path: Output file with mined negatives -# mining.cache_embeddings_dir: Required for multi-rank torchrun; optional only for single-process mining +# mining.cache_embeddings_dir: Required for multi-rank torchrun; optional only for single-process mining. +# Use a fresh path per model/dataset/prefix/length/world-size combination. mining: # Model path - REQUIRED (override via --mining.model_name_or_path) @@ -43,20 +45,9 @@ mining: # Tokenizer path - defaults to model_name_or_path if not specified # tokenizer_name_or_path: /path/to/tokenizer - # Tokenizer special token behavior - # - # DEFAULT (recommended for Automodel-trained models): - # Leave commented out to use Automodel's tokenizer defaults. - # This ensures mining stays synchronized with training behavior. - # - # OVERRIDE (for models trained in other frameworks): - # Uncomment and set explicitly only when mining with models trained - # outside Automodel that used specific tokenizer configurations. 
- # - # Examples: - # - Automodel models: Leave commented (uses Automodel defaults) - # - nvidia/llama-embed-nemotron-8b: add_bos_token: true, add_eos_token: false - # - Custom external model: Check training config and set accordingly + # Tokenizer special token behavior. Match the tokenizer settings used during training. + # If left commented, mining falls back to tokenizer defaults, which may differ from + # the fine-tuning config. # # add_bos_token: true # add_eos_token: false @@ -65,8 +56,9 @@ mining: # train_qa_file_path: /path/to/input.json # train_file_output_path: /path/to/output.json - # Caching (required for multi-rank torchrun; use a shared writable path for multi-node) - # cache_embeddings_dir: /path/to/cache + # Caching (required for multi-rank torchrun; use a shared writable path for multi-node). + # Use a fresh directory per model, dataset, prefix, sequence-length, and world-size combination. + # cache_embeddings_dir: /path/to/cache/ load_embeddings_from_cache: false # Mining parameters @@ -82,7 +74,8 @@ mining: document_embedding_batch_size: 16 corpus_chunk_size: 50000 - # Text prefixes (for models trained with prefixes) + # Text prefixes (for models trained with prefixes). Mining supports static prefixes only; if training used + # use_dataset_instruction, materialize equivalent instruction text into these prefixes or into the mining input. query_prefix: "" passage_prefix: "" @@ -90,5 +83,6 @@ mining: query_max_length: 512 passage_max_length: 512 - # Whether to include negatives from the input file + # Whether to include negatives from the input file. Existing negatives are prepended to the mined output; + # deduplicate/audit the final file before training. 
use_negatives_from_file: false From b6b43b3a0e7491f5a81468f4a79fdc65be6f9856 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 17:53:40 +0100 Subject: [PATCH 13/25] docs(retrieval): polish final review feedback Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/dataset-overview.md | 4 +-- docs/guides/llm/retrieval-dataset.md | 10 ++++-- docs/guides/llm/retrieval-finetuning.md | 31 ++++++++++++------- .../retrieval/data_utils/mining_config.yaml | 2 +- 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/docs/guides/dataset-overview.md b/docs/guides/dataset-overview.md index 0b46b62e5e..c46e1e9496 100644 --- a/docs/guides/dataset-overview.md +++ b/docs/guides/dataset-overview.md @@ -2,7 +2,7 @@ This page summarizes the datasets supported in NeMo AutoModel for LLM, VLM, and retrieval training and shows how to plug in your own datasets using Python functions or the YAML `_target_` mechanism. -- See also: [LLM datasets](llm/dataset.md), [VLM datasets](vlm/dataset.md), and [Retrieval dataset](llm/retrieval-dataset.md) for deeper, task-specific guides. +- See also: [LLM datasets](llm/dataset.md), [VLM datasets](vlm/dataset.md), [Retrieval dataset](llm/retrieval-dataset.md), and [Retrieval fine-tuning](llm/retrieval-finetuning.md) for deeper, task-specific guides. - If a dataset you need is missing, please open a [GitHub issue](https://github.com/NVIDIA-NeMo/Automodel/issues) with a short description and example schema so we can prioritize support. --- @@ -247,7 +247,7 @@ dataloader: batch_size: 2 shuffle: true ``` -See the detailed guide, [Retrieval dataset](llm/retrieval-dataset.md), for more information. +See [Retrieval dataset](llm/retrieval-dataset.md) for schema details and [Retrieval fine-tuning](llm/retrieval-finetuning.md) for the end-to-end workflow. 
### NanoGPT Binary Shards (Pretraining) - Class: `nemo_automodel.components.datasets.llm.nanogpt_dataset.NanogptDataset` diff --git a/docs/guides/llm/retrieval-dataset.md b/docs/guides/llm/retrieval-dataset.md index 26dd85a570..5248560896 100644 --- a/docs/guides/llm/retrieval-dataset.md +++ b/docs/guides/llm/retrieval-dataset.md @@ -1,6 +1,6 @@ # Retrieval Dataset for Bi-Encoders and Cross-Encoders -NeMo Automodel supports **retrieval model fine-tuning** using a retrieval-style dataset: each training example is a +NeMo AutoModel supports **retrieval model fine-tuning** using a retrieval-style dataset: each training example is a **query** paired with **one positive** document and **one or more negative** documents. This dataset is used by the retrieval recipes (see `examples/retrieval/bi_encoder/` and @@ -33,7 +33,7 @@ and false-negative filtering, even when each training row uses one positive. ## Supported Input Formats -NeMo Automodel supports **two** input schemas across three source types. They use different dataset factories: +NeMo AutoModel supports **two** input schemas across three source types. They use different dataset factories: - Use `nemo_automodel.components.datasets.llm.make_retrieval_dataset` for corpus ID-based JSON and `hf://` sources. - Use `nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset` for inline JSONL where @@ -237,3 +237,9 @@ dataloader: - `neg_doc` must be present in local JSON and JSONL training records. It may be empty only when `n_passages: 1`. - `hf://` sources may omit `neg_doc` in the source dataset, but add negatives before training with `n_passages > 1`. - If training requests negatives (e.g., `n_passages > 1`), `neg_doc` must contain **at least one** document. + +:::{warning} +`n_passages: 1` is a schema escape hatch, not a good default training setup. 
The standard bi-encoder and cross-encoder +recipes need at least one negative candidate for meaningful contrastive or reranking supervision, unless you add a +custom negative strategy such as qrels-aware in-batch negatives. +::: diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index c7ad153656..3549da262a 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -36,7 +36,8 @@ a small candidate set and you want a stronger reranking stage. Before running the examples: - Use an AutoModel environment with the full GPU training dependencies installed. The NGC container is the safest path - for multi-GPU runs. + for multi-GPU runs; for source checkouts, see [Installation](../installation.md) and run + `uv sync --locked --all-groups --extra all`. - From a source checkout, use `uv run automodel ...`; from an installed environment, use `automodel ...`. - Accept access terms for the configured Hugging Face model and set `HF_TOKEN`, or replace the model path with a model your environment can download. Retrieval has custom bidirectional backbones for Llama and Ministral3 embedding @@ -45,7 +46,8 @@ Before running the examples: pooling, `num_labels`, and any retrieval-specific model arguments are accepted by the replacement model. - Make sure every rank can read the dataset paths or `hf://` sources. -The commands below use `automodel`; if you are running from a source checkout, prefix them with `uv run`. +The commands below use `automodel`; if you are running from a source checkout, prefix them with `uv run`. For direct +`torchrun` commands, use `uv run torchrun ...` from a source checkout, or activate an installed environment first. Start with a one-GPU smoke test: @@ -117,6 +119,12 @@ The key field requirements differ by source: `neg_doc` must be present for local JSON and JSONL sources. It may be `[]` only when `n_passages: 1`; when `n_passages > 1`, provide at least one negative. 
+:::{warning} +`n_passages: 1` is useful for schema checks or custom negative strategies, but it is not a good default training setup. +The standard bi-encoder and cross-encoder recipes need at least one negative candidate for meaningful contrastive or +reranking supervision, unless you add a custom strategy such as qrels-aware in-batch negatives. +::: + For quick custom experiments, inline JSONL is the simplest format. Use the inline dataset factory for these files, and switch to corpus ID-based JSON before hard-negative mining or full-corpus evaluation: @@ -233,8 +241,9 @@ distributed: ``` For a cross-encoder, change `recipe`, `model._target_`, `dataloader.dataset.model_type`, and `dataloader.collate_fn` -to the cross-encoder values shown below. Also set `model.num_labels: 1`, keep `model.temperature`, and replace -`q_max_len` / `p_max_len` with `rerank_max_length` in the collator. +to the cross-encoder values shown below. Also set `model.num_labels: 1`, set the loss temperature under +`model.temperature`, replace `q_max_len` / `p_max_len` with `rerank_max_length` in the collator, and use a separate +`checkpoint.checkpoint_dir` such as `./output/llama3_2_1b_cross_encoder/checkpoints`. ## Configure a Bi-Encoder @@ -297,7 +306,7 @@ Important knobs: negatives. Keep it disabled for ColBERT-style pooling. The complete example is -[`examples/retrieval/bi_encoder/llama3_2_1b.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/bi_encoder/llama3_2_1b.yaml). +[`examples/retrieval/bi_encoder/llama3_2_1b.yaml`](../../../examples/retrieval/bi_encoder/llama3_2_1b.yaml). ## Configure a Cross-Encoder @@ -350,7 +359,7 @@ Important knobs: to index `0`. The complete example is -[`examples/retrieval/cross_encoder/llama3_2_1b.yaml`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/retrieval/cross_encoder/llama3_2_1b.yaml). +[`examples/retrieval/cross_encoder/llama3_2_1b.yaml`](../../../examples/retrieval/cross_encoder/llama3_2_1b.yaml). 
## Distributed Launch and Batch Size @@ -362,7 +371,7 @@ For multi-node runs, launch with your cluster launcher or an external `torchrun` rank and rendezvous endpoint: ```bash -torchrun \ +uv run torchrun \ --nnodes 2 \ --nproc-per-node 8 \ --node-rank ${NODE_RANK} \ @@ -499,7 +508,7 @@ memory and want maximum adaptation. After an initial bi-encoder run, mine harder negatives with the consolidated encoder checkpoint: ```bash -torchrun --nproc_per_node=8 examples/retrieval/data_utils/mine_hard_negatives.py \ +uv run torchrun --nproc_per_node=8 examples/retrieval/data_utils/mine_hard_negatives.py \ --config examples/retrieval/data_utils/mining_config.yaml \ --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated \ --mining.train_qa_file_path /path/to/input.json \ @@ -614,8 +623,8 @@ jointly, so they are usually too expensive for first-stage full-corpus search. ## Related Files -- Bi-encoder recipe: [`nemo_automodel/recipes/retrieval/train_bi_encoder.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/retrieval/train_bi_encoder.py) -- Cross-encoder recipe: [`nemo_automodel/recipes/retrieval/train_cross_encoder.py`](https://github.com/NVIDIA-NeMo/Automodel/blob/main/nemo_automodel/recipes/retrieval/train_cross_encoder.py) +- Bi-encoder recipe: [`nemo_automodel/recipes/retrieval/train_bi_encoder.py`](../../../nemo_automodel/recipes/retrieval/train_bi_encoder.py) +- Cross-encoder recipe: [`nemo_automodel/recipes/retrieval/train_cross_encoder.py`](../../../nemo_automodel/recipes/retrieval/train_cross_encoder.py) - Retrieval dataset guide: [Retrieval Dataset](retrieval-dataset.md) - Llama-Embed-Nemotron-8B example: - [`examples/retrieval/bi_encoder/llama_embed_nemotron_8b/`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/retrieval/bi_encoder/llama_embed_nemotron_8b) + 
[`examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml`](../../../examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml) diff --git a/examples/retrieval/data_utils/mining_config.yaml b/examples/retrieval/data_utils/mining_config.yaml index a4de105e90..ddd6ba8d92 100644 --- a/examples/retrieval/data_utils/mining_config.yaml +++ b/examples/retrieval/data_utils/mining_config.yaml @@ -17,7 +17,7 @@ # so no model architecture config is needed here. # # Usage: -# torchrun --nproc_per_node=8 ./examples/retrieval/data_utils/mine_hard_negatives.py \ +# uv run torchrun --nproc_per_node=8 ./examples/retrieval/data_utils/mine_hard_negatives.py \ # --config examples/retrieval/data_utils/mining_config.yaml \ # --mining.model_name_or_path /path/to/encoder/checkpoint \ # --mining.train_qa_file_path /path/to/input.json \ From f4cdc9daa2d614aa9187d7879a464fd16aaa0d22 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 18:09:51 +0100 Subject: [PATCH 14/25] docs(retrieval): align final mining examples Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/llm/retrieval-finetuning.md | 13 +++++++------ .../retrieval/data_utils/mine_hard_negatives.py | 2 +- examples/retrieval/data_utils/mining_config.yaml | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 3549da262a..8e77f62512 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -37,7 +37,7 @@ Before running the examples: - Use an AutoModel environment with the full GPU training dependencies installed. The NGC container is the safest path for multi-GPU runs; for source checkouts, see [Installation](../installation.md) and run - `uv sync --locked --all-groups --extra all`. + `uv sync --frozen --extra all`. 
- From a source checkout, use `uv run automodel ...`; from an installed environment, use `automodel ...`. - Accept access terms for the configured Hugging Face model and set `HF_TOKEN`, or replace the model path with a model your environment can download. Retrieval has custom bidirectional backbones for Llama and Ministral3 embedding @@ -508,7 +508,7 @@ memory and want maximum adaptation. After an initial bi-encoder run, mine harder negatives with the consolidated encoder checkpoint: ```bash -uv run torchrun --nproc_per_node=8 examples/retrieval/data_utils/mine_hard_negatives.py \ +uv run torchrun --standalone --nproc_per_node=8 examples/retrieval/data_utils/mine_hard_negatives.py \ --config examples/retrieval/data_utils/mining_config.yaml \ --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated \ --mining.train_qa_file_path /path/to/input.json \ @@ -581,7 +581,7 @@ With `save_consolidated: true`, AutoModel also writes a Hugging Face-compatible ./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated/ ``` -The `LATEST` symlink points to the most recent checkpoint. To resume from the latest compatible checkpoint, set: +The `LATEST` symlink points to the most recent checkpoint. To resume exactly from that pointer, set: ```yaml checkpoint: @@ -590,9 +590,10 @@ checkpoint: restore_from: LATEST ``` -If `checkpoint.restore_from` is omitted, AutoModel still auto-detects the latest compatible checkpoint in -`checkpoint_dir` and resumes from it. Use a new or empty `checkpoint_dir` for fresh experiments, and rotate or clear -`training.jsonl` and `validation.jsonl` if you do not want logs from multiple runs appended together. +Explicit restore paths, including `LATEST`, are loaded directly. If `checkpoint.restore_from` is omitted, AutoModel +auto-detects the latest compatible checkpoint in `checkpoint_dir` and resumes from it. 
Use a new or empty +`checkpoint_dir` for fresh experiments, and rotate or clear `training.jsonl` and `validation.jsonl` if you do not want +logs from multiple runs appended together. When `checkpoint.is_async: true`, the `LATEST` symlink can lag the most recent write at job end. For final mining, export, or evaluation workflows, prefer the explicit `epoch_*_step_*` checkpoint directory or keep async checkpointing diff --git a/examples/retrieval/data_utils/mine_hard_negatives.py b/examples/retrieval/data_utils/mine_hard_negatives.py index 13e3873cda..26207b39a2 100755 --- a/examples/retrieval/data_utils/mine_hard_negatives.py +++ b/examples/retrieval/data_utils/mine_hard_negatives.py @@ -13,7 +13,7 @@ # limitations under the License. # To run this script, use the following command: -# torchrun --nproc_per_node=8 --master_port=29500 ./examples/retrieval/data_utils/mine_hard_negatives.py \ +# uv run torchrun --standalone --nproc_per_node=8 ./examples/retrieval/data_utils/mine_hard_negatives.py \ # --config examples/retrieval/data_utils/mining_config.yaml \ # --mining.model_name_or_path /path/to/encoder/checkpoint \ # --mining.train_qa_file_path /path/to/input.json \ diff --git a/examples/retrieval/data_utils/mining_config.yaml b/examples/retrieval/data_utils/mining_config.yaml index ddd6ba8d92..ab25032108 100644 --- a/examples/retrieval/data_utils/mining_config.yaml +++ b/examples/retrieval/data_utils/mining_config.yaml @@ -17,7 +17,7 @@ # so no model architecture config is needed here. 
# # Usage: -# uv run torchrun --nproc_per_node=8 ./examples/retrieval/data_utils/mine_hard_negatives.py \ +# uv run torchrun --standalone --nproc_per_node=8 ./examples/retrieval/data_utils/mine_hard_negatives.py \ # --config examples/retrieval/data_utils/mining_config.yaml \ # --mining.model_name_or_path /path/to/encoder/checkpoint \ # --mining.train_qa_file_path /path/to/input.json \ From 1901c169c0435de7df819cfd7ec15e575848a304 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 18:31:31 +0100 Subject: [PATCH 15/25] docs(retrieval): address persona review gaps Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/llm/retrieval-dataset.md | 2 +- docs/guides/llm/retrieval-finetuning.md | 127 +++++++++++++++--- .../retrieval/bi_encoder/llama3_2_1b.yaml | 2 - .../retrieval/cross_encoder/llama3_2_1b.yaml | 2 - .../data_utils/mine_hard_negatives.py | 16 ++- .../retrieval/data_utils/mining_config.yaml | 8 +- .../retrieval/data_utils/unroll_pos_docs.py | 51 +++++-- 7 files changed, 167 insertions(+), 41 deletions(-) diff --git a/docs/guides/llm/retrieval-dataset.md b/docs/guides/llm/retrieval-dataset.md index 5248560896..39d48ed6e0 100644 --- a/docs/guides/llm/retrieval-dataset.md +++ b/docs/guides/llm/retrieval-dataset.md @@ -41,7 +41,7 @@ NeMo AutoModel supports **two** input schemas across three source types. 
They us | Source | Query field | Required document fields | Best for | |--------|-------------|--------------------------|----------| -| Corpus ID JSON | `question` | `pos_doc`, `neg_doc`, and `corpus_id` IDs resolved through a local corpus | Production data, hard-negative mining, same-document masking | +| Corpus ID JSON | `question` | `question_id`, `corpus_id`, `pos_doc`, and `neg_doc` IDs resolved through a local corpus | Production data, hard-negative mining, same-document masking | | `hf://` AutoModel schema | `question` | `pos_doc`, a companion HF corpus split, and `neg_doc` before training with `n_passages > 1` | Tutorial runs and shared AutoModel retrieval datasets | | Inline JSONL | `query` or `question` | Inline text in `pos_doc` and `neg_doc` | Small custom runs when you do not need mining or document-ID masking | diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 8e77f62512..0d4b1e6249 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -38,12 +38,13 @@ Before running the examples: - Use an AutoModel environment with the full GPU training dependencies installed. The NGC container is the safest path for multi-GPU runs; for source checkouts, see [Installation](../installation.md) and run `uv sync --frozen --extra all`. -- From a source checkout, use `uv run automodel ...`; from an installed environment, use `automodel ...`. +- Run the example commands from a source checkout or an NGC/container workspace that contains the repository + `examples/` tree. The YAML configs and mining helpers below are repo-relative; if you use an installed package + without the repository, copy the referenced config/script files into your own project and update the paths. +- From a source checkout, use `uv run automodel ...`; from an installed environment that has local copies of the + configs, use `automodel ...`. 
- Accept access terms for the configured Hugging Face model and set `HF_TOKEN`, or replace the model path with a model - your environment can download. Retrieval has custom bidirectional backbones for Llama and Ministral3 embedding - models, and a custom Llama scoring backbone for cross-encoders. Other Hugging Face model types fall back to - `AutoModel` for bi-encoders or `AutoModelForSequenceClassification` for cross-encoders; verify that the tokenizer, - pooling, `num_labels`, and any retrieval-specific model arguments are accepted by the replacement model. + your environment can download. See the support matrix below before swapping model families. - Make sure every rank can read the dataset paths or `hf://` sources. The commands below use `automodel`; if you are running from a source checkout, prefix them with `uv run`. For direct @@ -97,6 +98,18 @@ Adjust `--nproc-per-node` to the number of GPUs on your machine. The examples us The bi-encoder computes a query embedding and passage embeddings independently. The cross-encoder formats each query-passage pair into one sequence and predicts a score for each candidate passage. +Supported model families and effective retrieval kwargs: + +| Model `config.model_type` | Bi-encoder behavior | Cross-encoder behavior | Effective retrieval kwargs | +|---------------------------|---------------------|------------------------|----------------------------| +| `llama`, `llama_bidirec` | Uses `LlamaBidirectionalModel` | Uses `LlamaBidirectionalForSequenceClassification` | Bi-encoder: `pooling`, wrapper-level `l2_normalize`, and top-level recipe `temperature`. Cross-encoder: `pooling`, `num_labels`, and `temperature` on the Llama scoring config. | +| `ministral3`, `ministral3_bidirec` | Uses `Ministral3BidirectionalModel` | Not supported by the custom retrieval registry today; use a different cross-encoder backbone. | Bi-encoder: `pooling`, wrapper-level `l2_normalize`, and top-level recipe `temperature`. 
| +| Any other Hugging Face model type | Falls back to `AutoModel` | Falls back to `AutoModelForSequenceClassification` only when the model type is not listed above | Bi-encoder fallback receives standard Hugging Face `from_pretrained` kwargs; `pooling` and `l2_normalize` still apply in the AutoModel wrapper. Cross-encoder fallback forwards `num_labels`; do not assume custom `pooling` or `temperature` are accepted unless that HF class documents them. | + +Known model types with a registry entry fail fast when the requested retrieval task is unsupported rather than falling +back silently. For example, `ministral3` is supported for bi-encoder embeddings but not for the cross-encoder scoring +recipe. + ## Prepare Data Use the retrieval dataset format described in [Retrieval Dataset](retrieval-dataset.md). Choose the data path that @@ -458,7 +471,9 @@ repeatable: query count, corpus size, qrels source, judged/unjudged handling, ex values, baseline checkpoint, and whether confidence intervals or significance tests were used. For cross-encoders, freeze a first-stage retriever, rerank its top-K candidates, and report reranking metrics on that -same candidate set. Do not compare cross-encoder candidate-group validation directly to full-corpus bi-encoder metrics. +same candidate set. Also report first-stage candidate recall or coverage: if a query's positive document is missing +from the retriever top-K, count that query as a miss rather than dropping it from reranker evaluation. Do not compare +cross-encoder candidate-group validation directly to full-corpus bi-encoder metrics. ## Monitor Training @@ -505,7 +520,8 @@ memory and want maximum adaptation. ## Mine Hard Negatives -After an initial bi-encoder run, mine harder negatives with the consolidated encoder checkpoint: +After an initial bi-encoder run, mine harder negatives with the consolidated encoder checkpoint. 
This single-node +example uses `--standalone`: ```bash uv run torchrun --standalone --nproc_per_node=8 examples/retrieval/data_utils/mine_hard_negatives.py \ @@ -513,7 +529,7 @@ uv run torchrun --standalone --nproc_per_node=8 examples/retrieval/data_utils/mi --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated \ --mining.train_qa_file_path /path/to/input.json \ --mining.train_file_output_path /path/to/output.json \ - --mining.cache_embeddings_dir /path/to/cache \ + --mining.cache_embeddings_dir /shared/path/to/cache/llama3_2_1b_fever_mine_v1 \ --mining.query_prefix "query: " \ --mining.passage_prefix "passage: " \ --mining.query_max_length 512 \ @@ -521,13 +537,34 @@ uv run torchrun --standalone --nproc_per_node=8 examples/retrieval/data_utils/mi --mining.add_eos_token false ``` +For multi-node mining, replace `--standalone` with the same explicit rendezvous flags you use for multi-node training: + +```bash +uv run torchrun \ + --nnodes 2 \ + --nproc-per-node 8 \ + --node-rank ${NODE_RANK} \ + --rdzv-backend c10d \ + --rdzv-endpoint ${MASTER_ADDR}:${MASTER_PORT} \ + examples/retrieval/data_utils/mine_hard_negatives.py \ + --config examples/retrieval/data_utils/mining_config.yaml \ + --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated \ + --mining.train_qa_file_path /path/to/input.json \ + --mining.train_file_output_path /path/to/output.json \ + --mining.cache_embeddings_dir /shared/path/to/cache/llama3_2_1b_fever_mine_v1 \ + --mining.query_prefix "query: " \ + --mining.passage_prefix "passage: " \ + --mining.add_eos_token false +``` + Hard-negative mining expects the corpus ID-based retrieval JSON format described in the dataset guide, not the inline JSONL shortcut. The input must reference one corpus so the miner can build a passage embedding cache, retrieve candidates, and write mined negatives back to each query. 
Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: -- `hard_negatives_to_mine`: number of negatives to add per query. +- `hard_negatives_to_mine`: target number of negatives to add per query. The miner can return fewer when the corpus has + too few valid candidates or margin filtering removes high-scoring candidates. Audit per-query counts before training. - `hard_neg_margin` and `hard_neg_margin_type`: filter near-positive candidates. With `hard_neg_margin_type: perc`, candidates scoring above `min_positive_score * hard_neg_margin` are removed; with `abs`, candidates scoring above `min_positive_score - hard_neg_margin` are removed. Inspect mined samples when positive scores are low or negative. @@ -551,7 +588,9 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. Hard negative mining excludes document IDs listed in each input row's `pos_doc`, but it cannot read an external qrels file or know every semantically relevant duplicate. Put all known positive IDs for the query in the mining input, deduplicate the -corpus, inspect mined samples, and avoid mining from validation or test corpora. +corpus, inspect mined samples, filter duplicate IDs and `-inf` scores from mined outputs, and avoid mining from +validation or test corpora. If you unroll multi-positive training data, mine from rows that still carry every known +positive in `pos_doc`; otherwise sibling positives can be mined as false negatives. ## Save, Resume, and Use the Checkpoint @@ -575,13 +614,20 @@ Checkpoint directory names use the scheduler step at save time. The saved schedu for exact paths prefer the `Saving checkpoint to ...` log line or the `LATEST` pointer over hand-constructing a step number. 
-With `save_consolidated: true`, AutoModel also writes a Hugging Face-compatible model under: +With `save_consolidated: true` and full fine-tuning, AutoModel also writes a Hugging Face-compatible model under: ```text ./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated/ ``` -The `LATEST` symlink points to the most recent checkpoint. To resume exactly from that pointer, set: +PEFT/LoRA runs save adapter artifacts under the checkpoint `model/` directory instead of the full consolidated export +path above. Resume LoRA training from the AutoModel checkpoint directory, but use full fine-tuning when you need the +`model/consolidated` path for the mining command shown in this guide. If you need mining or serving from LoRA weights, +first produce a HF-loadable merged/exported encoder with your adapter workflow and point +`--mining.model_name_or_path` at that exported directory. + +The `LATEST` symlink points to the most recent checkpoint when it is valid. To resume from the latest resolved +checkpoint, set: ```yaml checkpoint: @@ -590,10 +636,11 @@ checkpoint: restore_from: LATEST ``` -Explicit restore paths, including `LATEST`, are loaded directly. If `checkpoint.restore_from` is omitted, AutoModel -auto-detects the latest compatible checkpoint in `checkpoint_dir` and resumes from it. Use a new or empty -`checkpoint_dir` for fresh experiments, and rotate or clear `training.jsonl` and `validation.jsonl` if you do not want -logs from multiple runs appended together. +`LATEST` is a resolver keyword: AutoModel follows the symlink or pointer file and can fall back to scanning +`epoch_*_step_*` checkpoint directories if the pointer is not usable. An explicit `epoch_*_step_*` path is the exact +restore target. If `checkpoint.restore_from` is omitted, AutoModel auto-detects the latest compatible checkpoint in +`checkpoint_dir` and resumes from it. 
Use a new or empty `checkpoint_dir` for fresh experiments, and rotate or clear +`training.jsonl` and `validation.jsonl` if you do not want logs from multiple runs appended together. When `checkpoint.is_async: true`, the `LATEST` symlink can lag the most recent write at job end. For final mining, export, or evaluation workflows, prefer the explicit `epoch_*_step_*` checkpoint directory or keep async checkpointing @@ -604,9 +651,57 @@ disabled for the final save. Use a bi-encoder checkpoint to encode passages, build an approximate nearest-neighbor index, encode queries, and search the index. Keep the same tokenizer, pooling, normalization, prefixes, and max lengths that you used for training. +Minimal bi-encoder loading and scoring sketch: + +```python +import torch + +from nemo_automodel import NeMoAutoModelBiEncoder, NeMoAutoTokenizer + +model_path = "./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated" +tokenizer = NeMoAutoTokenizer.from_pretrained(model_path, add_eos_token=False) +model = NeMoAutoModelBiEncoder.from_pretrained(model_path, use_liger_kernel=False).eval() +device = next(model.parameters()).device + +texts = ["query: what does nvlink do?", "passage: NVLink is a high-bandwidth GPU interconnect."] +tokens = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt") +tokens = {key: value.to(device) for key, value in tokens.items()} +with torch.no_grad(): + embeddings = model.encode(tokens) +score = embeddings[0] @ embeddings[1] +``` + Use a cross-encoder checkpoint to rerank a shortlist from a retriever. Cross-encoders score each query-passage pair jointly, so they are usually too expensive for first-stage full-corpus search. 
+Minimal cross-encoder scoring sketch: + +```python +import torch + +from nemo_automodel import NeMoAutoModelCrossEncoder, NeMoAutoTokenizer + +model_path = "./output/llama3_2_1b_cross_encoder/checkpoints/LATEST/model/consolidated" +tokenizer = NeMoAutoTokenizer.from_pretrained(model_path, add_eos_token=False) +model = NeMoAutoModelCrossEncoder.from_pretrained(model_path, use_liger_kernel=False).eval() +device = next(model.parameters()).device + +pairs = [ + "question: what does nvlink do?\n\npassage: NVLink is a high-bandwidth GPU interconnect.", + "question: what does nvlink do?\n\npassage: Dropout regularizes neural networks.", +] +tokens = tokenizer(pairs, padding=True, truncation=True, max_length=512, return_tensors="pt") +tokens = {key: value.to(device) for key, value in tokens.items()} +with torch.no_grad(): + logits = model(tokens).logits.squeeze(-1) +ranking = torch.argsort(logits, descending=True) +``` + +Bi-encoder scores are comparable only within the same model, tokenizer, prefix, max-length, pooling, normalization, and +indexing setup. Mining scores are raw embedding similarities from that exact setup. Cross-encoder logits are +uncalibrated reranking signals; do not mix them with bi-encoder scores or use one global threshold across model +versions without calibration. 
+ ## Troubleshooting | Symptom | Check | diff --git a/examples/retrieval/bi_encoder/llama3_2_1b.yaml b/examples/retrieval/bi_encoder/llama3_2_1b.yaml index a2c90b033a..04c383ac2f 100644 --- a/examples/retrieval/bi_encoder/llama3_2_1b.yaml +++ b/examples/retrieval/bi_encoder/llama3_2_1b.yaml @@ -80,8 +80,6 @@ dataloader: # n_passages: 5 # seed: 42 # do_shuffle: false -# max_train_samples: 1000 -# train_data_select_offset: 0 # collate_fn: # _target_: nemo_automodel.components.datasets.llm.BiEncoderCollator # q_max_len: 512 diff --git a/examples/retrieval/cross_encoder/llama3_2_1b.yaml b/examples/retrieval/cross_encoder/llama3_2_1b.yaml index 25c7b0fd5b..1d87f1e77c 100644 --- a/examples/retrieval/cross_encoder/llama3_2_1b.yaml +++ b/examples/retrieval/cross_encoder/llama3_2_1b.yaml @@ -77,8 +77,6 @@ dataloader: # n_passages: 5 # seed: 42 # do_shuffle: false -# max_train_samples: 1000 -# train_data_select_offset: 0 # collate_fn: # _target_: nemo_automodel.components.datasets.llm.CrossEncoderCollator # rerank_max_length: 512 diff --git a/examples/retrieval/data_utils/mine_hard_negatives.py b/examples/retrieval/data_utils/mine_hard_negatives.py index 26207b39a2..3aeb16b001 100755 --- a/examples/retrieval/data_utils/mine_hard_negatives.py +++ b/examples/retrieval/data_utils/mine_hard_negatives.py @@ -15,15 +15,19 @@ # To run this script, use the following command: # uv run torchrun --standalone --nproc_per_node=8 ./examples/retrieval/data_utils/mine_hard_negatives.py \ # --config examples/retrieval/data_utils/mining_config.yaml \ -# --mining.model_name_or_path /path/to/encoder/checkpoint \ +# --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated \ # --mining.train_qa_file_path /path/to/input.json \ # --mining.train_file_output_path /path/to/output.json \ -# --mining.cache_embeddings_dir /path/to/cache \ -# --mining.hard_neg_margin 0.95 +# --mining.cache_embeddings_dir /shared/path/to/cache/ \ +# --mining.query_prefix "query: " \ +# 
--mining.passage_prefix "passage: " \ +# --mining.query_max_length 512 \ +# --mining.passage_max_length 512 \ +# --mining.add_eos_token false # -# The model is loaded directly from the checkpoint path (--mining.model_name_or_path), -# so no model architecture config is needed. This allows mining with any saved -# encoder checkpoint without requiring the original training config. +# The model is loaded directly from --mining.model_name_or_path. Use a HF-loadable +# encoder export such as LATEST/model/consolidated from a full fine-tuning run. +# Match tokenizer, prefix, and max-length settings to the training config. # # The mining_config.yaml contains only mining parameters and dist_env settings, # not the model architecture. All mining parameters can also be overridden via diff --git a/examples/retrieval/data_utils/mining_config.yaml b/examples/retrieval/data_utils/mining_config.yaml index ab25032108..f0587a766d 100644 --- a/examples/retrieval/data_utils/mining_config.yaml +++ b/examples/retrieval/data_utils/mining_config.yaml @@ -19,10 +19,14 @@ # Usage: # uv run torchrun --standalone --nproc_per_node=8 ./examples/retrieval/data_utils/mine_hard_negatives.py \ # --config examples/retrieval/data_utils/mining_config.yaml \ -# --mining.model_name_or_path /path/to/encoder/checkpoint \ +# --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated \ # --mining.train_qa_file_path /path/to/input.json \ # --mining.train_file_output_path /path/to/output.json \ # --mining.cache_embeddings_dir /shared/path/to/cache/ \ +# --mining.query_prefix "query: " \ +# --mining.passage_prefix "passage: " \ +# --mining.query_max_length 512 \ +# --mining.passage_max_length 512 \ # --mining.add_eos_token false # Distributed environment settings @@ -32,7 +36,7 @@ dist_env: # Mining parameters - can be overridden via command line # Required parameters (must be provided via command line or here): -# mining.model_name_or_path: Path to encoder checkpoint +# 
mining.model_name_or_path: Path to a HF-loadable encoder export, such as LATEST/model/consolidated # mining.train_qa_file_path: Input QA file # mining.train_file_output_path: Output file with mined negatives # mining.cache_embeddings_dir: Required for multi-rank torchrun; optional only for single-process mining. diff --git a/examples/retrieval/data_utils/unroll_pos_docs.py b/examples/retrieval/data_utils/unroll_pos_docs.py index 50de69b34b..3ed94c9323 100644 --- a/examples/retrieval/data_utils/unroll_pos_docs.py +++ b/examples/retrieval/data_utils/unroll_pos_docs.py @@ -13,10 +13,13 @@ # limitations under the License. """ -Unroll training data with multiple positive documents into records with single positive documents. +Unroll training data with multiple positive documents into records with one supervised positive first. This script takes training data where each question has multiple positive documents -and creates a new dataset where each (question, pos_doc) pair becomes its own record. +and creates a new dataset where each (question, pos_doc[0]) pair becomes its own +record. The remaining known positives stay in pos_doc after the supervised positive +so hard-negative mining can exclude the full positive set. AutoModel retrieval +training uses pos_doc[0] as the supervised positive. 
Input Format: Expects NeMo Retriever training data format (train.json): @@ -35,14 +38,18 @@ } Output: - Same format, but each record has exactly one positive document: - {"question_id": "q0_0", "question": "...", "pos_doc": [{"id": "d1"}]} - {"question_id": "q0_1", "question": "...", "pos_doc": [{"id": "d2"}]} + Same format, but each record starts with a different supervised positive: + {"question_id": "q0_0", "question": "...", "pos_doc": [{"id": "d1"}, {"id": "d2"}]} + {"question_id": "q0_1", "question": "...", "pos_doc": [{"id": "d2"}, {"id": "d1"}]} Usage: - python examples/retrieval/data_utils/unroll_pos_docs.py data/nv_pp_dd_sdg_train_eval/train.json - python examples/retrieval/data_utils/unroll_pos_docs.py data/nv_pp_dd_sdg_train_eval/train.json --output data/nv_pp_dd_sdg_train_eval/train_unrolled.json - python examples/retrieval/data_utils/unroll_pos_docs.py data/nv_pp_dd_sdg_train_eval/train.json --suffix _unrolled + uv run python examples/retrieval/data_utils/unroll_pos_docs.py data/nv_pp_dd_sdg_train_eval/train.json + uv run python examples/retrieval/data_utils/unroll_pos_docs.py \ + data/nv_pp_dd_sdg_train_eval/train.json \ + --output data/nv_pp_dd_sdg_train_eval/train_unrolled.json + uv run python examples/retrieval/data_utils/unroll_pos_docs.py \ + data/nv_pp_dd_sdg_train_eval/train.json \ + --suffix _unrolled """ import argparse @@ -51,35 +58,55 @@ from typing import Any +def _doc_id(doc: Any) -> Any: + """Return a document ID from either a raw ID or a {"id": ...} mapping.""" + if isinstance(doc, dict): + return doc.get("id") + return doc + + +def _drop_known_positives_from_negatives(neg_docs: list[Any], pos_docs: list[Any]) -> list[Any]: + """Remove any negative whose ID is already known to be positive.""" + pos_ids = {_doc_id(doc) for doc in pos_docs} + return [doc for doc in neg_docs if _doc_id(doc) not in pos_ids] + + def unroll_training_data(data: list[dict[str, Any]]) -> list[dict[str, Any]]: """ Unroll training records with multiple positive 
docs into individual records. + Each output row keeps one supervised positive first, while retaining sibling + positives after it for false-negative filtering in downstream mining. + Args: data: List of training records with potentially multiple pos_doc entries Returns: - List of training records where each has exactly one pos_doc + List of training records where each row has a different pos_doc[0] """ unrolled = [] for record in data: pos_docs = record.get("pos_doc", []) + neg_docs = _drop_known_positives_from_negatives(record.get("neg_doc", []), pos_docs) if len(pos_docs) <= 1: # Already has single (or zero) pos_doc, keep as-is - unrolled.append(record) + new_record = dict(record) + new_record["neg_doc"] = neg_docs + unrolled.append(new_record) else: # Unroll into multiple records base_question_id = record["question_id"] for idx, pos_doc in enumerate(pos_docs): + ordered_pos_docs = [pos_doc] + [other_pos_doc for j, other_pos_doc in enumerate(pos_docs) if j != idx] new_record = { "question_id": f"{base_question_id}_{idx}", "question": record["question"], "corpus_id": record["corpus_id"], - "pos_doc": [pos_doc], - "neg_doc": record.get("neg_doc", []), + "pos_doc": ordered_pos_docs, + "neg_doc": neg_docs, } unrolled.append(new_record) From d38ffb3cf63dcba6163e83f0433954cff10adff1 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 18:49:01 +0100 Subject: [PATCH 16/25] docs(retrieval): close final review nits Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/llm/retrieval-finetuning.md | 58 +++++++++---- .../retrieval/bi_encoder/llama3_2_1b.yaml | 2 +- .../retrieval/cross_encoder/llama3_2_1b.yaml | 2 +- .../data_utils/mine_hard_negatives.py | 10 ++- .../retrieval/data_utils/mining_config.yaml | 11 ++- .../retrieval/data_utils/unroll_pos_docs.py | 16 ++-- .../datasets/llm/test_unroll_pos_docs.py | 87 +++++++++++++++++++ 7 files changed, 153 
insertions(+), 33 deletions(-) create mode 100644 tests/unit_tests/datasets/llm/test_unroll_pos_docs.py diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 0d4b1e6249..350cc499a1 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -103,12 +103,15 @@ Supported model families and effective retrieval kwargs: | Model `config.model_type` | Bi-encoder behavior | Cross-encoder behavior | Effective retrieval kwargs | |---------------------------|---------------------|------------------------|----------------------------| | `llama`, `llama_bidirec` | Uses `LlamaBidirectionalModel` | Uses `LlamaBidirectionalForSequenceClassification` | Bi-encoder: `pooling`, wrapper-level `l2_normalize`, and top-level recipe `temperature`. Cross-encoder: `pooling`, `num_labels`, and `temperature` on the Llama scoring config. | -| `ministral3`, `ministral3_bidirec` | Uses `Ministral3BidirectionalModel` | Not supported by the custom retrieval registry today; use a different cross-encoder backbone. | Bi-encoder: `pooling`, wrapper-level `l2_normalize`, and top-level recipe `temperature`. | +| `ministral3`, `ministral3_bidirec` | Uses `Ministral3BidirectionalModel` | Direct cross-encoder scoring is not supported by the custom retrieval registry today; use a different direct cross-encoder backbone. | Bi-encoder: `pooling`, wrapper-level `l2_normalize`, and top-level recipe `temperature`. | | Any other Hugging Face model type | Falls back to `AutoModel` | Falls back to `AutoModelForSequenceClassification` only when the model type is not listed above | Bi-encoder fallback receives standard Hugging Face `from_pretrained` kwargs; `pooling` and `l2_normalize` still apply in the AutoModel wrapper. Cross-encoder fallback forwards `num_labels`; do not assume custom `pooling` or `temperature` are accepted unless that HF class documents them. 
| Known model types with a registry entry fail fast when the requested retrieval task is unsupported rather than falling -back silently. For example, `ministral3` is supported for bi-encoder embeddings but not for the cross-encoder scoring -recipe. +back silently. For example, direct `ministral3` loading is supported for bi-encoder embeddings but not for the +cross-encoder scoring recipe. If you are extracting a text tower from a parent checkpoint, set +`model.extract_submodel: language_model`; extracted text backbones use the extraction path, where supported extracted +types use registered retrieval classes and unsupported extracted types can fall back to Hugging Face sequence +classification for cross-encoder scoring. ## Prepare Data @@ -319,7 +322,7 @@ Important knobs: negatives. Keep it disabled for ColBERT-style pooling. The complete example is -[`examples/retrieval/bi_encoder/llama3_2_1b.yaml`](../../../examples/retrieval/bi_encoder/llama3_2_1b.yaml). +{download}`examples/retrieval/bi_encoder/llama3_2_1b.yaml <../../../examples/retrieval/bi_encoder/llama3_2_1b.yaml>`. ## Configure a Cross-Encoder @@ -372,7 +375,7 @@ Important knobs: to index `0`. The complete example is -[`examples/retrieval/cross_encoder/llama3_2_1b.yaml`](../../../examples/retrieval/cross_encoder/llama3_2_1b.yaml). +{download}`examples/retrieval/cross_encoder/llama3_2_1b.yaml <../../../examples/retrieval/cross_encoder/llama3_2_1b.yaml>`. 
## Distributed Launch and Batch Size @@ -526,7 +529,7 @@ example uses `--standalone`: ```bash uv run torchrun --standalone --nproc_per_node=8 examples/retrieval/data_utils/mine_hard_negatives.py \ --config examples/retrieval/data_utils/mining_config.yaml \ - --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated \ + --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_499/model/consolidated \ --mining.train_qa_file_path /path/to/input.json \ --mining.train_file_output_path /path/to/output.json \ --mining.cache_embeddings_dir /shared/path/to/cache/llama3_2_1b_fever_mine_v1 \ @@ -548,15 +551,21 @@ uv run torchrun \ --rdzv-endpoint ${MASTER_ADDR}:${MASTER_PORT} \ examples/retrieval/data_utils/mine_hard_negatives.py \ --config examples/retrieval/data_utils/mining_config.yaml \ - --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated \ + --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_499/model/consolidated \ --mining.train_qa_file_path /path/to/input.json \ --mining.train_file_output_path /path/to/output.json \ --mining.cache_embeddings_dir /shared/path/to/cache/llama3_2_1b_fever_mine_v1 \ --mining.query_prefix "query: " \ --mining.passage_prefix "passage: " \ + --mining.query_max_length 512 \ + --mining.passage_max_length 512 \ --mining.add_eos_token false ``` +Replace `epoch_0_step_499` with the explicit checkpoint directory that you want to mine from. If you only have +`LATEST.txt`, read it first and substitute the resolved `epoch_*_step_*` directory; the mining script loads the +Hugging Face export directly and does not apply AutoModel's checkpoint resolver. + Hard-negative mining expects the corpus ID-based retrieval JSON format described in the dataset guide, not the inline JSONL shortcut. 
The input must reference one corpus so the miner can build a passage embedding cache, retrieve candidates, and write mined negatives back to each query. @@ -575,6 +584,9 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: prefixes before mining. - `query_max_length` and `passage_max_length`: keep these consistent with training unless you intentionally change truncation. +- `pooling` and `l2_normalize`: the mining script currently loads `NeMoAutoModelBiEncoder.from_pretrained()` with the + wrapper defaults (`pooling: avg`, `l2_normalize: true`). Mine with checkpoints trained using those defaults, or use a + custom mining entry point that passes the same wrapper settings used during training. - `add_bos_token` and `add_eos_token`: match the tokenizer behavior used during training. If omitted, mining falls back to tokenizer defaults, which can differ from the training config. - `use_negatives_from_file`: include existing negatives from the input file when mining. Existing negatives are prepended @@ -617,9 +629,14 @@ number. With `save_consolidated: true` and full fine-tuning, AutoModel also writes a Hugging Face-compatible model under: ```text -./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated/ +./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_499/model/consolidated/ ``` +Use the concrete `epoch_*_step_*` directory printed in your logs. Some workflows also create a `LATEST` symlink, but +direct Hugging Face and mining loads expect a real exported model path. If your run produced `LATEST.txt` instead of a +symlink, read that file and substitute the resolved checkpoint directory before calling `from_pretrained()` or +`mine_hard_negatives.py`. + PEFT/LoRA runs save adapter artifacts under the checkpoint `model/` directory instead of the full consolidated export path above. 
Resume LoRA training from the AutoModel checkpoint directory, but use full fine-tuning when you need the `model/consolidated` path for the mining command shown in this guide. If you need mining or serving from LoRA weights, @@ -658,13 +675,15 @@ import torch from nemo_automodel import NeMoAutoModelBiEncoder, NeMoAutoTokenizer -model_path = "./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated" +model_path = "./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_499/model/consolidated" tokenizer = NeMoAutoTokenizer.from_pretrained(model_path, add_eos_token=False) model = NeMoAutoModelBiEncoder.from_pretrained(model_path, use_liger_kernel=False).eval() device = next(model.parameters()).device texts = ["query: what does nvlink do?", "passage: NVLink is a high-bandwidth GPU interconnect."] -tokens = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt") +tokenized = tokenizer(texts, padding=False, truncation=True, max_length=512, return_token_type_ids=False) +tokenized = [{key: tokenized[key][idx] for key in tokenized.keys()} for idx in range(len(texts))] +tokens = tokenizer.pad(tokenized, padding="longest", return_tensors="pt") tokens = {key: value.to(device) for key, value in tokens.items()} with torch.no_grad(): embeddings = model.encode(tokens) @@ -681,16 +700,19 @@ import torch from nemo_automodel import NeMoAutoModelCrossEncoder, NeMoAutoTokenizer -model_path = "./output/llama3_2_1b_cross_encoder/checkpoints/LATEST/model/consolidated" +model_path = "./output/llama3_2_1b_cross_encoder/checkpoints/epoch_0_step_499/model/consolidated" tokenizer = NeMoAutoTokenizer.from_pretrained(model_path, add_eos_token=False) model = NeMoAutoModelCrossEncoder.from_pretrained(model_path, use_liger_kernel=False).eval() device = next(model.parameters()).device +prompt_template = "question:{query} \n \n passage:{passage}" pairs = [ - "question: what does nvlink do?\n\npassage: NVLink is a high-bandwidth GPU interconnect.", - "question: what 
does nvlink do?\n\npassage: Dropout regularizes neural networks.", + prompt_template.format(query="what does nvlink do?", passage="NVLink is a high-bandwidth GPU interconnect."), + prompt_template.format(query="what does nvlink do?", passage="Dropout regularizes neural networks."), ] -tokens = tokenizer(pairs, padding=True, truncation=True, max_length=512, return_tensors="pt") +tokenized = tokenizer(pairs, padding=False, truncation=True, max_length=512, return_token_type_ids=False) +tokenized = [{key: tokenized[key][idx] for key in tokenized.keys()} for idx in range(len(pairs))] +tokens = tokenizer.pad(tokenized, padding="longest", return_tensors="pt") tokens = {key: value.to(device) for key, value in tokens.items()} with torch.no_grad(): logits = model(tokens).logits.squeeze(-1) @@ -719,8 +741,10 @@ versions without calibration. ## Related Files -- Bi-encoder recipe: [`nemo_automodel/recipes/retrieval/train_bi_encoder.py`](../../../nemo_automodel/recipes/retrieval/train_bi_encoder.py) -- Cross-encoder recipe: [`nemo_automodel/recipes/retrieval/train_cross_encoder.py`](../../../nemo_automodel/recipes/retrieval/train_cross_encoder.py) +- Bi-encoder recipe: + {download}`nemo_automodel/recipes/retrieval/train_bi_encoder.py <../../../nemo_automodel/recipes/retrieval/train_bi_encoder.py>` +- Cross-encoder recipe: + {download}`nemo_automodel/recipes/retrieval/train_cross_encoder.py <../../../nemo_automodel/recipes/retrieval/train_cross_encoder.py>` - Retrieval dataset guide: [Retrieval Dataset](retrieval-dataset.md) - Llama-Embed-Nemotron-8B example: - [`examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml`](../../../examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml) + {download}`examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml <../../../examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml>` diff --git 
a/examples/retrieval/bi_encoder/llama3_2_1b.yaml b/examples/retrieval/bi_encoder/llama3_2_1b.yaml index 04c383ac2f..36c6a74714 100644 --- a/examples/retrieval/bi_encoder/llama3_2_1b.yaml +++ b/examples/retrieval/bi_encoder/llama3_2_1b.yaml @@ -13,7 +13,7 @@ # limitations under the License. # To run this recipe: -# automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 8 +# uv run automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 8 # Adjust --nproc-per-node to the number of GPUs available on your machine. recipe: TrainBiEncoderRecipe diff --git a/examples/retrieval/cross_encoder/llama3_2_1b.yaml b/examples/retrieval/cross_encoder/llama3_2_1b.yaml index 1d87f1e77c..61dee9eb1b 100644 --- a/examples/retrieval/cross_encoder/llama3_2_1b.yaml +++ b/examples/retrieval/cross_encoder/llama3_2_1b.yaml @@ -13,7 +13,7 @@ # limitations under the License. # To run this recipe: -# automodel examples/retrieval/cross_encoder/llama3_2_1b.yaml --nproc-per-node 8 +# uv run automodel examples/retrieval/cross_encoder/llama3_2_1b.yaml --nproc-per-node 8 # Adjust --nproc-per-node to the number of GPUs available on your machine. recipe: TrainCrossEncoderRecipe diff --git a/examples/retrieval/data_utils/mine_hard_negatives.py b/examples/retrieval/data_utils/mine_hard_negatives.py index 3aeb16b001..50d90966c0 100755 --- a/examples/retrieval/data_utils/mine_hard_negatives.py +++ b/examples/retrieval/data_utils/mine_hard_negatives.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# To run this script, use the following command: +# Single-node usage: # uv run torchrun --standalone --nproc_per_node=8 ./examples/retrieval/data_utils/mine_hard_negatives.py \ # --config examples/retrieval/data_utils/mining_config.yaml \ -# --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated \ +# --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_499/model/consolidated \ # --mining.train_qa_file_path /path/to/input.json \ # --mining.train_file_output_path /path/to/output.json \ # --mining.cache_embeddings_dir /shared/path/to/cache/ \ @@ -25,8 +25,12 @@ # --mining.passage_max_length 512 \ # --mining.add_eos_token false # +# For multi-node torchrun, use explicit rendezvous flags as shown in the +# retrieval fine-tuning guide. +# # The model is loaded directly from --mining.model_name_or_path. Use a HF-loadable -# encoder export such as LATEST/model/consolidated from a full fine-tuning run. +# encoder export such as checkpoints/epoch_0_step_499/model/consolidated from a +# full fine-tuning run. # Match tokenizer, prefix, and max-length settings to the training config. # # The mining_config.yaml contains only mining parameters and dist_env settings, diff --git a/examples/retrieval/data_utils/mining_config.yaml b/examples/retrieval/data_utils/mining_config.yaml index f0587a766d..1bf8276216 100644 --- a/examples/retrieval/data_utils/mining_config.yaml +++ b/examples/retrieval/data_utils/mining_config.yaml @@ -16,10 +16,10 @@ # The model is loaded directly from --mining.model_name_or_path, # so no model architecture config is needed here. 
# -# Usage: +# Single-node usage: # uv run torchrun --standalone --nproc_per_node=8 ./examples/retrieval/data_utils/mine_hard_negatives.py \ # --config examples/retrieval/data_utils/mining_config.yaml \ -# --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/LATEST/model/consolidated \ +# --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_499/model/consolidated \ # --mining.train_qa_file_path /path/to/input.json \ # --mining.train_file_output_path /path/to/output.json \ # --mining.cache_embeddings_dir /shared/path/to/cache/ \ @@ -28,6 +28,8 @@ # --mining.query_max_length 512 \ # --mining.passage_max_length 512 \ # --mining.add_eos_token false +# +# For multi-node torchrun, use explicit rendezvous flags as shown in the retrieval fine-tuning guide. # Distributed environment settings dist_env: @@ -36,7 +38,8 @@ dist_env: # Mining parameters - can be overridden via command line # Required parameters (must be provided via command line or here): -# mining.model_name_or_path: Path to a HF-loadable encoder export, such as LATEST/model/consolidated +# mining.model_name_or_path: Path to a HF-loadable encoder export, such as +# checkpoints/epoch_0_step_499/model/consolidated # mining.train_qa_file_path: Input QA file # mining.train_file_output_path: Output file with mined negatives # mining.cache_embeddings_dir: Required for multi-rank torchrun; optional only for single-process mining. 
@@ -44,7 +47,7 @@ dist_env: mining: # Model path - REQUIRED (override via --mining.model_name_or_path) - # model_name_or_path: /path/to/encoder/checkpoint + # model_name_or_path: ./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_499/model/consolidated # Tokenizer path - defaults to model_name_or_path if not specified # tokenizer_name_or_path: /path/to/tokenizer diff --git a/examples/retrieval/data_utils/unroll_pos_docs.py b/examples/retrieval/data_utils/unroll_pos_docs.py index 3ed94c9323..7eaf89a880 100644 --- a/examples/retrieval/data_utils/unroll_pos_docs.py +++ b/examples/retrieval/data_utils/unroll_pos_docs.py @@ -39,8 +39,8 @@ Output: Same format, but each record starts with a different supervised positive: - {"question_id": "q0_0", "question": "...", "pos_doc": [{"id": "d1"}, {"id": "d2"}]} - {"question_id": "q0_1", "question": "...", "pos_doc": [{"id": "d2"}, {"id": "d1"}]} + {"question_id": "q0_0", "original_question_id": "q0", "pos_doc": [{"id": "d1"}, {"id": "d2"}]} + {"question_id": "q0_1", "original_question_id": "q0", "pos_doc": [{"id": "d2"}, {"id": "d1"}]} Usage: uv run python examples/retrieval/data_utils/unroll_pos_docs.py data/nv_pp_dd_sdg_train_eval/train.json @@ -58,11 +58,12 @@ from typing import Any -def _doc_id(doc: Any) -> Any: +def _doc_id(doc: Any) -> str: """Return a document ID from either a raw ID or a {"id": ...} mapping.""" - if isinstance(doc, dict): - return doc.get("id") - return doc + raw_id = doc.get("id") if isinstance(doc, dict) else doc + if raw_id is None: + raise ValueError(f"Document entry is missing an id: {doc!r}") + return str(raw_id) def _drop_known_positives_from_negatives(neg_docs: list[Any], pos_docs: list[Any]) -> list[Any]: @@ -103,6 +104,7 @@ def unroll_training_data(data: list[dict[str, Any]]) -> list[dict[str, Any]]: ordered_pos_docs = [pos_doc] + [other_pos_doc for j, other_pos_doc in enumerate(pos_docs) if j != idx] new_record = { "question_id": f"{base_question_id}_{idx}", + "original_question_id": 
record.get("original_question_id", base_question_id), "question": record["question"], "corpus_id": record["corpus_id"], "pos_doc": ordered_pos_docs, @@ -115,7 +117,7 @@ def unroll_training_data(data: list[dict[str, Any]]) -> list[dict[str, Any]]: def main(): parser = argparse.ArgumentParser( - description="Unroll training data with multiple positive docs into single-pos-doc records" + description="Unroll training data so each sibling positive is supervised first while preserving all positives" ) parser.add_argument("input_file", type=str, help="Path to input training JSON file") parser.add_argument( diff --git a/tests/unit_tests/datasets/llm/test_unroll_pos_docs.py b/tests/unit_tests/datasets/llm/test_unroll_pos_docs.py new file mode 100644 index 0000000000..a8fe23387d --- /dev/null +++ b/tests/unit_tests/datasets/llm/test_unroll_pos_docs.py @@ -0,0 +1,87 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +from examples.retrieval.data_utils.unroll_pos_docs import unroll_training_data + + +def test_unroll_preserves_sibling_positives_and_filters_stringified_negatives(): + records = [ + { + "question_id": "q0", + "question": "Which documents are positive?", + "corpus_id": "demo", + "pos_doc": [{"id": 1}, {"id": "2"}], + "neg_doc": [{"id": "1"}, {"id": 2}, {"id": "3"}], + } + ] + + unrolled = unroll_training_data(records) + + assert unrolled == [ + { + "question_id": "q0_0", + "original_question_id": "q0", + "question": "Which documents are positive?", + "corpus_id": "demo", + "pos_doc": [{"id": 1}, {"id": "2"}], + "neg_doc": [{"id": "3"}], + }, + { + "question_id": "q0_1", + "original_question_id": "q0", + "question": "Which documents are positive?", + "corpus_id": "demo", + "pos_doc": [{"id": "2"}, {"id": 1}], + "neg_doc": [{"id": "3"}], + }, + ] + + +def test_unroll_keeps_single_positive_question_id_and_filters_negatives(): + records = [ + { + "question_id": "q0", + "question": "Which document is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": 1}], + "neg_doc": [{"id": "1"}, {"id": "2"}], + } + ] + + assert unroll_training_data(records) == [ + { + "question_id": "q0", + "question": "Which document is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": 1}], + "neg_doc": [{"id": "2"}], + } + ] + + +def test_unroll_raises_for_missing_document_id(): + records = [ + { + "question_id": "q0", + "question": "Which document is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": "1"}], + "neg_doc": [{}], + } + ] + + with pytest.raises(ValueError, match="missing an id"): + unroll_training_data(records) From 7fbec0bc7eda87219f109815b334c1df13d9c5fc Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 19:08:33 +0100 Subject: [PATCH 17/25] fix(retrieval): load saved encoder exports Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- 
docs/guides/llm/retrieval-finetuning.md | 7 +-- nemo_automodel/_transformers/registry.py | 5 ++ .../datasets/llm/retrieval_dataset.py | 7 ++- .../models/llama_bidirectional/model.py | 8 +++- .../unit_tests/_transformers/test_registry.py | 18 ++++++++ .../datasets/llm/test_retrieval_dataset.py | 3 ++ .../recipes/test_mine_hard_negatives.py | 46 +++++++++++++++++++ 7 files changed, 88 insertions(+), 6 deletions(-) diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 350cc499a1..36e25f6c99 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -174,9 +174,10 @@ The training recipe does not load a separate qrels file. Materialize qrels into For training, `pos_doc[0]` is the supervised positive. For mining, keep every known positive for the query in `pos_doc` so the miner can exclude those IDs; it does not read an external qrels file. If you expand multi-positive queries into one row per positive, make sure sibling positives are removed from `neg_doc` and audited out of mined negatives before -training. Also keep sibling-positive rows out of the same in-batch-negative training batch, disable distributed in-batch -negatives, or add qrels-aware sampling/masking. Keep the original qrels for offline Recall@K, MRR@K, and nDCG@K -evaluation. +training. The helper in `examples/retrieval/data_utils/unroll_pos_docs.py` writes `original_question_id` so mined outputs +can still be joined back to the original qrels. Also keep sibling-positive rows out of the same in-batch-negative +training batch, disable distributed in-batch negatives, or add qrels-aware sampling/masking. Keep the original qrels for +offline Recall@K, MRR@K, and nDCG@K evaluation. 
## Minimal Config Anatomy diff --git a/nemo_automodel/_transformers/registry.py b/nemo_automodel/_transformers/registry.py index b3615770c3..a8468fc782 100644 --- a/nemo_automodel/_transformers/registry.py +++ b/nemo_automodel/_transformers/registry.py @@ -197,7 +197,12 @@ "kimi_k25": ("nemo_automodel.components.models.kimi_k25_vl.model", "KimiK25VLConfig"), "kimi_vl": ("nemo_automodel.components.models.kimivl.model", "KimiVLConfig"), "llavaonevision1_5": ("nemo_automodel.components.models.llava_onevision.model", "Llavaonevision1_5Config"), + "llama_bidirec": ("nemo_automodel.components.models.llama_bidirectional.model", "LlamaBidirectionalConfig"), "mistral4": ("nemo_automodel.components.models.mistral4.configuration", "Mistral4Config"), + "ministral3_bidirec": ( + "nemo_automodel.components.models.ministral_bidirectional.model", + "Ministral3BidirectionalConfig", + ), } diff --git a/nemo_automodel/components/datasets/llm/retrieval_dataset.py b/nemo_automodel/components/datasets/llm/retrieval_dataset.py index 0823a7abdd..f3d8d19077 100644 --- a/nemo_automodel/components/datasets/llm/retrieval_dataset.py +++ b/nemo_automodel/components/datasets/llm/retrieval_dataset.py @@ -288,6 +288,7 @@ def load_datasets( raise ValueError(f"Missing required fields: {missing} in train_data item: {item}") normalized_item = { "question_id": item["question_id"], + "original_question_id": item.get("original_question_id", item["question_id"]), "question": item["question"], "corpus_id": item["corpus_id"], } @@ -431,11 +432,13 @@ def _load_hf_subset(repo_id: str, subset: str): f"adapter/preprocessor before using direct hf:// loading." ) - # 5. Normalize to the standard {question_id, question, corpus_id, pos_doc, neg_doc} shape + # 5. 
Normalize to the standard {question_id, original_question_id, question, corpus_id, pos_doc, neg_doc} shape normalized_data = [] for idx, item in enumerate(queries_hf): + question_id = str(item.get("question_id", f"{subset}:{idx}")) normalized_item = { - "question_id": str(item.get("question_id", f"{subset}:{idx}")), + "question_id": question_id, + "original_question_id": str(item.get("original_question_id", question_id)), "question": item["question"], "corpus_id": corpus_id, } diff --git a/nemo_automodel/components/models/llama_bidirectional/model.py b/nemo_automodel/components/models/llama_bidirectional/model.py index 77d966d72d..138cc5a708 100644 --- a/nemo_automodel/components/models/llama_bidirectional/model.py +++ b/nemo_automodel/components/models/llama_bidirectional/model.py @@ -286,7 +286,7 @@ def _register_with_hf_auto_classes(): This is needed so that AutoModel.from_config(LlamaBidirectionalConfig) works inside LlamaForSequenceClassification.__init__. """ - from transformers import AutoConfig, AutoModel + from transformers import AutoConfig, AutoModel, AutoModelForSequenceClassification try: AutoConfig.register(LlamaBidirectionalConfig.model_type, LlamaBidirectionalConfig) @@ -296,6 +296,12 @@ def _register_with_hf_auto_classes(): AutoModel.register(LlamaBidirectionalConfig, LlamaBidirectionalModel) except ValueError: pass # Already registered + try: + AutoModelForSequenceClassification.register( + LlamaBidirectionalConfig, LlamaBidirectionalForSequenceClassification + ) + except ValueError: + pass # Already registered _register_with_hf_auto_classes() diff --git a/tests/unit_tests/_transformers/test_registry.py b/tests/unit_tests/_transformers/test_registry.py index 46a1f1b687..77306f7e19 100644 --- a/tests/unit_tests/_transformers/test_registry.py +++ b/tests/unit_tests/_transformers/test_registry.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import json import types import pytest @@ -264,6 +265,23 @@ def test_custom_config_registrations_in_config_mapping(): ) +def test_retrieval_bidirectional_configs_resolve_from_pretrained(tmp_path): + """Saved retrieval exports must be loadable in fresh processes through AutoConfig.""" + from transformers import AutoConfig + + from nemo_automodel._transformers.registry import _CUSTOM_CONFIG_REGISTRATIONS + + for model_type in ("llama_bidirec", "ministral3_bidirec"): + assert model_type in _CUSTOM_CONFIG_REGISTRATIONS + model_dir = tmp_path / model_type + model_dir.mkdir() + (model_dir / "config.json").write_text(json.dumps({"model_type": model_type})) + + config = AutoConfig.from_pretrained(model_dir) + + assert config.model_type == model_type + + def test_kimi_k25_arch_alias_in_model_arch_mapping(): """KimiK25ForConditionalGeneration (checkpoint arch) must map to KimiK25VLForConditionalGeneration.""" from nemo_automodel._transformers.registry import MODEL_ARCH_MAPPING diff --git a/tests/unit_tests/datasets/llm/test_retrieval_dataset.py b/tests/unit_tests/datasets/llm/test_retrieval_dataset.py index b7f71dc02c..012966fafd 100644 --- a/tests/unit_tests/datasets/llm/test_retrieval_dataset.py +++ b/tests/unit_tests/datasets/llm/test_retrieval_dataset.py @@ -169,6 +169,7 @@ def test_load_datasets_normalizes_and_errors(tmp_path, monkeypatch): "data": [ { "question_id": "q1", + "original_question_id": "q-root", "question": "What?", "corpus_id": "corpusA", "pos_doc": [{"id": "p1"}], @@ -183,6 +184,7 @@ def test_load_datasets_normalizes_and_errors(tmp_path, monkeypatch): assert len(dataset) == 1 row = dataset[0] assert row["question_id"] == "q1" + assert row["original_question_id"] == "q-root" assert row["pos_doc"][0]["id"] == "p1" assert row["neg_doc"][0]["id"] == "n1" and row["neg_doc"][1]["id"] == "n2" assert "corpusA" in corpus_dict @@ -1126,6 +1128,7 @@ def fake_load_dataset(repo_id, config=None, split=None, **kw): data_list, _ = rd._load_hf_subset("org/repo", "MySub") 
assert data_list[0]["question_id"] == "MySub:0" + assert data_list[0]["original_question_id"] == "MySub:0" def test_load_hf_subset_allows_empty_neg_doc(tmp_path, monkeypatch): diff --git a/tests/unit_tests/recipes/test_mine_hard_negatives.py b/tests/unit_tests/recipes/test_mine_hard_negatives.py index 9045797c29..3b0aac4bc0 100644 --- a/tests/unit_tests/recipes/test_mine_hard_negatives.py +++ b/tests/unit_tests/recipes/test_mine_hard_negatives.py @@ -14,6 +14,7 @@ """Unit tests for MineHardNegativesRecipe — attn_implementation support.""" +import json from unittest.mock import MagicMock, patch import pytest @@ -158,3 +159,48 @@ def test_setup_with_attn_implementation(attn_impl): assert kwargs["attn_implementation"] == attn_impl assert kwargs["use_liger_kernel"] is False assert kwargs["use_sdpa_patching"] is True + + +def test_write_output_preserves_original_question_id(tmp_path): + """Mined outputs should keep query lineage added by unroll_pos_docs.py.""" + input_file = tmp_path / "input.json" + output_file = tmp_path / "output.json" + input_file.write_text(json.dumps({"corpus": {"path": "/fake/corpus"}, "data": []})) + + recipe = _make_recipe( + { + "train_qa_file_path": str(input_file), + "train_file_output_path": str(output_file), + } + ) + recipe.train_qa_file_path = str(input_file) + recipe.train_file_output_path = str(output_file) + recipe.questions_dataset = [ + { + "question_id": "q0_0", + "original_question_id": "q0", + "question": "Which doc is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": "p1"}], + "neg_doc": [{"id": "old"}], + "pos_score": 0.7, + "neg_scores": [0.1], + } + ] + recipe._build_negative_docs_by_question_id = lambda: {"q0_0": [{"id": "n1", "score": 0.2}]} + recipe._build_positive_scores_by_question_id = lambda: {"q0_0": [0.9]} + recipe._get_mining_args_dict = lambda: {} + + recipe._write_output() + + output = json.loads(output_file.read_text()) + assert output["data"] == [ + { + "question_id": "q0_0", + "original_question_id": 
"q0", + "question": "Which doc is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": "p1", "score": 0.9}], + "neg_doc": [{"id": "n1", "score": 0.2}], + } + ] From 076515c538f8187394c8b7b746ac25b8d3dfede2 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 19:27:53 +0100 Subject: [PATCH 18/25] fix(retrieval): preserve encoder metadata Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/llm/retrieval-finetuning.md | 33 ++- .../data_utils/audit_mined_negatives.py | 214 ++++++++++++++++++ .../retrieval/data_utils/mining_config.yaml | 5 +- nemo_automodel/_transformers/auto_model.py | 10 +- nemo_automodel/_transformers/retrieval.py | 87 ++++++- .../recipes/retrieval/mine_hard_negatives.py | 11 + .../llm/test_audit_mined_negatives.py | 84 +++++++ .../test_llama_bidirectional_model.py | 86 ++++++- .../recipes/test_mine_hard_negatives.py | 9 + 9 files changed, 511 insertions(+), 28 deletions(-) create mode 100644 examples/retrieval/data_utils/audit_mined_negatives.py create mode 100644 tests/unit_tests/datasets/llm/test_audit_mined_negatives.py diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 36e25f6c99..187319319a 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -585,9 +585,6 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: prefixes before mining. - `query_max_length` and `passage_max_length`: keep these consistent with training unless you intentionally change truncation. -- `pooling` and `l2_normalize`: the mining script currently loads `NeMoAutoModelBiEncoder.from_pretrained()` with the - wrapper defaults (`pooling: avg`, `l2_normalize: true`). Mine with checkpoints trained using those defaults, or use a - custom mining entry point that passes the same wrapper settings used during training. 
- `add_bos_token` and `add_eos_token`: match the tokenizer behavior used during training. If omitted, mining falls back to tokenizer defaults, which can differ from the training config. - `use_negatives_from_file`: include existing negatives from the input file when mining. Existing negatives are prepended @@ -598,12 +595,36 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: rank `0` unable to read remote-rank shards. Use a fresh cache directory for each model, dataset, prefix, sequence length, and world-size combination; stale cache files can be reused if they are already present. +`pooling` and `l2_normalize` are saved bi-encoder wrapper metadata, not `mining.*` config fields. Do not pass +`--mining.pooling` or `--mining.l2_normalize`; the miner rejects unknown mining keys. Mine from a saved bi-encoder export +produced with the wrapper settings you want, or explicitly reload and export the model with those settings before mining. + Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. Hard negative mining excludes document IDs listed in each input row's `pos_doc`, but it cannot read an external qrels file or know every semantically relevant duplicate. Put all known positive IDs for the query in the mining input, deduplicate the -corpus, inspect mined samples, filter duplicate IDs and `-inf` scores from mined outputs, and avoid mining from -validation or test corpora. If you unroll multi-positive training data, mine from rows that still carry every known -positive in `pos_doc`; otherwise sibling positives can be mined as false negatives. +corpus, inspect mined samples, filter duplicate IDs and non-finite scores such as `-inf` from mined outputs, and avoid +mining from validation or test corpora. If you unroll multi-positive training data, mine from rows that still carry every +known positive in `pos_doc`; otherwise sibling positives can be mined as false negatives. 
+ +Run the audit utility before reusing mined output: + +```bash +uv run python examples/retrieval/data_utils/audit_mined_negatives.py \ + /path/to/mined.json +``` + +If the report only contains issues that you want to drop automatically, write a cleaned copy: + +```bash +uv run python examples/retrieval/data_utils/audit_mined_negatives.py \ + /path/to/mined.json \ + --drop-invalid-negatives \ + --output /path/to/mined_audited.json +``` + +The audit flags negatives whose IDs also appear in the row's `pos_doc`, duplicate negative IDs in the same row, missing +negative scores, and non-finite negative scores. The cleaned output preserves query lineage fields such as +`original_question_id`, so unrolled examples remain traceable to their source question. ## Save, Resume, and Use the Checkpoint diff --git a/examples/retrieval/data_utils/audit_mined_negatives.py b/examples/retrieval/data_utils/audit_mined_negatives.py new file mode 100644 index 0000000000..906c41c9f5 --- /dev/null +++ b/examples/retrieval/data_utils/audit_mined_negatives.py @@ -0,0 +1,214 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Audit mined retrieval negatives for common false-negative and score issues.""" + +import argparse +import json +import math +from pathlib import Path +from typing import Any + + +def _doc_id(doc: Any) -> str: + """Return a document ID from either a raw ID or a {"id": ...} mapping.""" + raw_id = doc.get("id") if isinstance(doc, dict) else doc + if raw_id is None: + raise ValueError(f"Document entry is missing an id: {doc!r}") + return str(raw_id) + + +def _score_state(doc: Any) -> str: + """Classify a mined negative score as finite, missing, or non-finite.""" + if not isinstance(doc, dict) or "score" not in doc: + return "missing" + try: + score = float(doc["score"]) + except (TypeError, ValueError): + return "non_finite" + return "finite" if math.isfinite(score) else "non_finite" + + +def audit_records( + records: list[dict[str, Any]], + *, + drop_invalid_negatives: bool = False, + max_findings: int = 20, +) -> tuple[dict[str, int], list[dict[str, Any]], list[dict[str, Any]]]: + """Audit retrieval records and optionally drop invalid mined negatives. + + Args: + records: Retrieval training records from the top-level ``data`` field. + drop_invalid_negatives: Drop negatives that duplicate positives, duplicate + another negative in the same row, or have a non-finite score. + max_findings: Maximum example findings to return. + + Returns: + A tuple of ``(summary, cleaned_records, finding_examples)``. 
+ """ + summary = { + "records": len(records), + "negatives": 0, + "rows_with_findings": 0, + "negative_is_known_positive": 0, + "duplicate_negative": 0, + "missing_negative_score": 0, + "non_finite_negative_score": 0, + "dropped_negatives": 0, + "total_findings": 0, + } + cleaned_records = [] + findings = [] + + for row_idx, record in enumerate(records): + pos_ids = {_doc_id(doc) for doc in record.get("pos_doc", [])} + seen_neg_ids = set() + cleaned_negatives = [] + row_has_findings = False + + for neg_doc in record.get("neg_doc", []): + summary["negatives"] += 1 + neg_id = _doc_id(neg_doc) + should_drop = False + + if neg_id in pos_ids: + summary["negative_is_known_positive"] += 1 + should_drop = True + row_has_findings = True + _append_finding(findings, max_findings, row_idx, record, neg_id, "negative_is_known_positive") + + if neg_id in seen_neg_ids: + summary["duplicate_negative"] += 1 + should_drop = True + row_has_findings = True + _append_finding(findings, max_findings, row_idx, record, neg_id, "duplicate_negative") + seen_neg_ids.add(neg_id) + + score_state = _score_state(neg_doc) + if score_state == "missing": + summary["missing_negative_score"] += 1 + row_has_findings = True + _append_finding(findings, max_findings, row_idx, record, neg_id, "missing_negative_score") + elif score_state == "non_finite": + summary["non_finite_negative_score"] += 1 + should_drop = True + row_has_findings = True + _append_finding(findings, max_findings, row_idx, record, neg_id, "non_finite_negative_score") + + if drop_invalid_negatives and should_drop: + summary["dropped_negatives"] += 1 + continue + cleaned_negatives.append(neg_doc) + + if row_has_findings: + summary["rows_with_findings"] += 1 + cleaned_record = dict(record) + cleaned_record["neg_doc"] = cleaned_negatives + cleaned_records.append(cleaned_record) + + summary["total_findings"] = ( + summary["negative_is_known_positive"] + + summary["duplicate_negative"] + + summary["missing_negative_score"] + + 
summary["non_finite_negative_score"] + ) + return summary, cleaned_records, findings + + +def _append_finding( + findings: list[dict[str, Any]], + max_findings: int, + row_idx: int, + record: dict[str, Any], + neg_id: str, + issue: str, +) -> None: + """Append a compact finding example up to the configured limit.""" + if len(findings) >= max_findings: + return + finding = { + "row": row_idx, + "question_id": record.get("question_id"), + "negative_id": neg_id, + "issue": issue, + } + if "original_question_id" in record: + finding["original_question_id"] = record["original_question_id"] + findings.append(finding) + + +def audit_training_data( + training_data: dict[str, Any], + *, + drop_invalid_negatives: bool = False, + max_findings: int = 20, +) -> tuple[dict[str, int], dict[str, Any], list[dict[str, Any]]]: + """Audit a top-level retrieval JSON object.""" + records = training_data.get("data", []) + summary, cleaned_records, findings = audit_records( + records, + drop_invalid_negatives=drop_invalid_negatives, + max_findings=max_findings, + ) + cleaned_training_data = dict(training_data) + cleaned_training_data["data"] = cleaned_records + return summary, cleaned_training_data, findings + + +def main() -> int: + """Run the mined-negative audit CLI.""" + parser = argparse.ArgumentParser(description="Audit mined retrieval negatives before reusing them for training") + parser.add_argument("input_file", type=str, help="Path to mined retrieval JSON") + parser.add_argument( + "--output", + "-o", + type=str, + default=None, + help="Optional path to write a cleaned retrieval JSON", + ) + parser.add_argument( + "--drop-invalid-negatives", + action="store_true", + help="Drop negatives that are known positives, duplicate row negatives, or have non-finite scores", + ) + parser.add_argument("--max-findings", type=int, default=20, help="Maximum finding examples to print") + parser.add_argument( + "--allow-findings", + action="store_true", + help="Exit with status 0 even when the 
audit reports findings", + ) + args = parser.parse_args() + + input_path = Path(args.input_file) + with open(input_path, "r") as f: + training_data = json.load(f) + + summary, cleaned_training_data, findings = audit_training_data( + training_data, + drop_invalid_negatives=args.drop_invalid_negatives, + max_findings=args.max_findings, + ) + + print(json.dumps({"summary": summary, "findings": findings}, indent=2)) + + if args.output is not None: + output_path = Path(args.output) + with open(output_path, "w") as f: + json.dump(cleaned_training_data, f, indent=2) + + return 1 if summary["total_findings"] and not args.allow_findings else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/retrieval/data_utils/mining_config.yaml b/examples/retrieval/data_utils/mining_config.yaml index 1bf8276216..ce2d3d5f0a 100644 --- a/examples/retrieval/data_utils/mining_config.yaml +++ b/examples/retrieval/data_utils/mining_config.yaml @@ -89,7 +89,10 @@ mining: # Maximum sequence lengths query_max_length: 512 passage_max_length: 512 + # pooling and l2_normalize are saved model metadata, not mining config keys. + # Do not add mining.pooling or mining.l2_normalize here; the miner rejects unknown keys. # Whether to include negatives from the input file. Existing negatives are prepended to the mined output; - # deduplicate/audit the final file before training. 
+ # deduplicate/audit the final file before training: + # uv run python examples/retrieval/data_utils/audit_mined_negatives.py /path/to/output.json use_negatives_from_file: false diff --git a/nemo_automodel/_transformers/auto_model.py b/nemo_automodel/_transformers/auto_model.py index e95560068b..e3d30c6dda 100644 --- a/nemo_automodel/_transformers/auto_model.py +++ b/nemo_automodel/_transformers/auto_model.py @@ -1041,8 +1041,8 @@ class NeMoAutoModelBiEncoder(_NeMoAutoModelForRetrievalBase): def from_pretrained( cls, pretrained_model_name_or_path: str, - pooling: str = "avg", - l2_normalize: bool = True, + pooling: str | None = None, + l2_normalize: bool | None = None, **kwargs, ) -> PreTrainedModel: """Load a bi-encoder model with infrastructure. @@ -1052,8 +1052,10 @@ def from_pretrained( Args: pretrained_model_name_or_path: Path to pretrained model or model identifier. - pooling: Pooling strategy (``'avg'``, ``'cls'``, ``'last'``, etc.). - l2_normalize: Whether to L2-normalize embeddings. + pooling: Pooling strategy (``'avg'``, ``'cls'``, ``'last'``, etc.). When omitted, saved + retrieval metadata is restored when available, otherwise defaults to ``'avg'``. + l2_normalize: Whether to L2-normalize embeddings. When omitted, saved retrieval metadata + is restored when available, otherwise defaults to ``True``. **kwargs: Forwarded to ``_NeMoAutoModelForRetrievalBase.from_pretrained``. 
Returns: diff --git a/nemo_automodel/_transformers/retrieval.py b/nemo_automodel/_transformers/retrieval.py index 7287fed41d..4d65f779bd 100644 --- a/nemo_automodel/_transformers/retrieval.py +++ b/nemo_automodel/_transformers/retrieval.py @@ -30,6 +30,44 @@ logger = logging.get_logger(__name__) +_RETRIEVAL_METADATA_KEY = "nemo_retrieval" +_BI_ENCODER_DEFAULT_POOLING = "avg" +_BI_ENCODER_DEFAULT_L2_NORMALIZE = True + + +def _get_retrieval_metadata(config) -> dict: + """Return saved retrieval wrapper metadata from a model config.""" + metadata = getattr(config, _RETRIEVAL_METADATA_KEY, {}) + return metadata if isinstance(metadata, dict) else {} + + +def _coerce_bool(value) -> bool: + """Coerce booleans from config values while rejecting ambiguous strings.""" + if isinstance(value, bool): + return value + if isinstance(value, str): + normalized = value.strip().lower() + if normalized in {"1", "true", "yes", "y"}: + return True + if normalized in {"0", "false", "no", "n"}: + return False + raise ValueError(f"Cannot interpret boolean value: {value!r}") + return bool(value) + + +def _resolve_bi_encoder_options(config, pooling: Optional[str], l2_normalize: Optional[bool]) -> tuple[str, bool]: + """Resolve wrapper options from explicit arguments, saved metadata, then defaults.""" + metadata = _get_retrieval_metadata(config) + resolved_pooling = pooling + if resolved_pooling is None: + resolved_pooling = metadata.get("pooling", getattr(config, "pooling", _BI_ENCODER_DEFAULT_POOLING)) + + resolved_l2_normalize = l2_normalize + if resolved_l2_normalize is None: + resolved_l2_normalize = metadata.get("l2_normalize", _BI_ENCODER_DEFAULT_L2_NORMALIZE) + + return resolved_pooling, _coerce_bool(resolved_l2_normalize) + def _extract_submodel(model: nn.Module, extract_submodel: str) -> PreTrainedModel: """Extract a nested submodel from a loaded model using a dotted attribute path.""" @@ -175,7 +213,14 @@ def pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor, 
pool_ty return emb -def configure_encoder_metadata(model: PreTrainedModel, config) -> None: +def configure_encoder_metadata( + model: PreTrainedModel, + config, + *, + task: Optional[str] = None, + pooling: Optional[str] = None, + l2_normalize: Optional[bool] = None, +) -> None: """Configure HuggingFace consolidated checkpoint metadata on a model. Sets ``config.architectures`` unconditionally. For custom retrieval @@ -187,10 +232,23 @@ def configure_encoder_metadata(model: PreTrainedModel, config) -> None: Args: model: The backbone ``PreTrainedModel`` instance. config: The model's config object (typically ``model.config``). + task: Optional retrieval task name saved for downstream loaders. + pooling: Optional bi-encoder pooling strategy saved for wrapper reloads. + l2_normalize: Optional bi-encoder normalization flag saved for wrapper reloads. """ encoder_class_name = model.__class__.__name__ config.architectures = [encoder_class_name] + metadata = dict(_get_retrieval_metadata(config)) + if task is not None: + metadata["task"] = task + if pooling is not None: + metadata["pooling"] = pooling + if l2_normalize is not None: + metadata["l2_normalize"] = _coerce_bool(l2_normalize) + if metadata: + setattr(config, _RETRIEVAL_METADATA_KEY, metadata) + # Only set auto_map for custom retrieval architectures. # Standard HF models don't need auto_map pointing to a local model.py. if ModelRegistry.has_retrieval_model(encoder_class_name): @@ -212,6 +270,7 @@ def build_encoder_backbone( extract_submodel: Optional[str] = None, num_labels: Optional[int] = None, temperature: Optional[float] = None, + loaded_config: Optional[object] = None, **hf_kwargs, ) -> PreTrainedModel: """Build an encoder backbone from a pretrained checkpoint. @@ -240,6 +299,7 @@ def build_encoder_backbone( (e.g. ``"language_model"`` to extract the text backbone from a VLM). num_labels: Number of labels for reranking/classification backbones. 
temperature: Optional retrieval score temperature for custom retrieval backbones. + loaded_config: Optional config that has already been loaded by the caller. **hf_kwargs: Extra keyword arguments forwarded to ``from_pretrained``. Returns: @@ -249,7 +309,9 @@ def build_encoder_backbone( ValueError: If the task is unsupported for a known model type, or the architecture class is missing from :class:`ModelRegistry`. """ - config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code) + config = loaded_config + if config is None: + config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code) model_type = getattr(config, "model_type", "") if extract_submodel is not None: @@ -365,14 +427,21 @@ def __init__( self.pooling = pooling self.l2_normalize = l2_normalize self.do_distributed_inbatch_negative = do_distributed_inbatch_negative + configure_encoder_metadata( + self.model, + self.config, + task=self._TASK, + pooling=self.pooling, + l2_normalize=self.l2_normalize, + ) @classmethod def build( cls, model_name_or_path: str, task: str = None, - pooling: str = "avg", - l2_normalize: bool = True, + pooling: Optional[str] = None, + l2_normalize: Optional[bool] = None, do_distributed_inbatch_negative: bool = False, trust_remote_code: bool = False, **hf_kwargs, @@ -384,8 +453,15 @@ def build( logger.info(f"Building BiEncoderModel from {model_name_or_path}") + config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code) + pooling, l2_normalize = _resolve_bi_encoder_options(config, pooling, l2_normalize) backbone = build_encoder_backbone( - model_name_or_path, effective_task, trust_remote_code=trust_remote_code, pooling=pooling, **hf_kwargs + model_name_or_path, + effective_task, + trust_remote_code=trust_remote_code, + pooling=pooling, + loaded_config=config, + **hf_kwargs, ) return cls( @@ -447,6 +523,7 @@ class CrossEncoderModel(nn.Module): def __init__(self, model: PreTrainedModel): 
super().__init__() _init_encoder_common(self, model) + configure_encoder_metadata(self.model, self.config, task=self._TASK) @classmethod def build( diff --git a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py index 9d68c2a5e2..0ac781f60a 100644 --- a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py +++ b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py @@ -72,6 +72,8 @@ # Attention implementation for model loading "attn_implementation": None, # None = use model default; "sdpa", "flash_attention_2", "eager" } +MINING_PATH_FIELDS = {"train_qa_file_path", "train_file_output_path", "cache_embeddings_dir"} +MINING_CONFIG_FIELDS = set(MINING_DEFAULTS) | MINING_PATH_FIELDS def build_distributed(cfg_dist: Dict[str, Any]) -> DistInfo: @@ -278,6 +280,15 @@ def _get_mining_param(self, name, default=None): def _extract_mining_params(self): """Extract all mining parameters from configuration.""" + mining_dict = self.mining_cfg.to_dict() if hasattr(self.mining_cfg, "to_dict") else dict(self.mining_cfg) + unknown_fields = sorted(set(mining_dict) - MINING_CONFIG_FIELDS) + if unknown_fields: + supported_fields = ", ".join(sorted(MINING_CONFIG_FIELDS)) + raise ValueError( + f"Unknown mining config field(s): {', '.join(unknown_fields)}. " + f"Supported mining fields are: {supported_fields}" + ) + # Required parameters self.train_qa_file_path = self._get_mining_param("train_qa_file_path") self.train_file_output_path = self._get_mining_param("train_file_output_path") diff --git a/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py b/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py new file mode 100644 index 0000000000..c5e07c396d --- /dev/null +++ b/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py @@ -0,0 +1,84 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from examples.retrieval.data_utils.audit_mined_negatives import audit_training_data + + +def test_audit_mined_negatives_reports_and_cleans_invalid_rows(): + training_data = { + "corpus": {"path": "/corpus"}, + "data": [ + { + "question_id": "q0_0", + "original_question_id": "q0", + "question": "Which document is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": 1}], + "neg_doc": [ + {"id": "1", "score": 0.9}, + {"id": "2", "score": 0.4}, + {"id": "2", "score": 0.3}, + {"id": "3", "score": float("-inf")}, + {"id": "4"}, + ], + } + ], + } + + summary, cleaned, findings = audit_training_data(training_data, drop_invalid_negatives=True) + + assert summary == { + "records": 1, + "negatives": 5, + "rows_with_findings": 1, + "negative_is_known_positive": 1, + "duplicate_negative": 1, + "missing_negative_score": 1, + "non_finite_negative_score": 1, + "dropped_negatives": 3, + "total_findings": 4, + } + assert cleaned["corpus"] == training_data["corpus"] + assert cleaned["data"] == [ + { + "question_id": "q0_0", + "original_question_id": "q0", + "question": "Which document is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": 1}], + "neg_doc": [{"id": "2", "score": 0.4}, {"id": "4"}], + } + ] + assert findings[0]["original_question_id"] == "q0" + + +def test_audit_mined_negatives_preserves_records_without_findings(): + training_data = { + "corpus": {"path": "/corpus"}, + "data": [ + { + "question_id": "q0", + 
"question": "Which document is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": "1"}], + "neg_doc": [{"id": "2", "score": 0.2}], + } + ], + } + + summary, cleaned, findings = audit_training_data(training_data, drop_invalid_negatives=True) + + assert summary["total_findings"] == 0 + assert cleaned == training_data + assert findings == [] diff --git a/tests/unit_tests/models/bi_encoder/test_llama_bidirectional_model.py b/tests/unit_tests/models/bi_encoder/test_llama_bidirectional_model.py index 365856408e..bae26b0dee 100644 --- a/tests/unit_tests/models/bi_encoder/test_llama_bidirectional_model.py +++ b/tests/unit_tests/models/bi_encoder/test_llama_bidirectional_model.py @@ -139,7 +139,6 @@ def test_bidirectional_attention_is_symmetric(): ) - # --- Fakes for classification and encoder tests --- class FakeOutputs: def __init__(self, last_hidden_state=None, hidden_states=None): @@ -234,9 +233,7 @@ def forward(self, input_ids=None, attention_mask=None, return_dict=True, output_ ) lm = NoTTIDLm(hidden=8) - model = BiEncoderModel( - model=lm, pooling="avg", l2_normalize=True - ) + model = BiEncoderModel(model=lm, pooling="avg", l2_normalize=True) # encode removes token_type_ids and normalizes q = { "input_ids": torch.ones(2, 3, dtype=torch.long), @@ -278,9 +275,7 @@ def forward(self, input_ids=None, attention_mask=None, return_dict=True, output_ return OnlyHiddenOutputs(hidden_states) # Test with model using NoLastLM for query encoder - model_no_last = BiEncoderModel( - model=NoLastLM(hidden=8), pooling="avg", l2_normalize=True - ) + model_no_last = BiEncoderModel(model=NoLastLM(hidden=8), pooling="avg", l2_normalize=True) v2 = model_no_last.encode( {"input_ids": torch.ones(2, 3, dtype=torch.long), "attention_mask": torch.ones(2, 3, dtype=torch.long)}, ) @@ -315,6 +310,64 @@ def from_pretrained(cls, *args, **kwargs): assert any("save1" in p for p in model.model.saved) +@pytest.mark.parametrize( + ("pooling_arg", "l2_arg", "expected_pooling", 
"expected_l2_normalize"), + [(None, None, "last", False), ("cls", True, "cls", True)], +) +def test_biencoder_build_resolves_saved_retrieval_metadata( + monkeypatch, + pooling_arg, + l2_arg, + expected_pooling, + expected_l2_normalize, +): + """Saved retrieval metadata should restore wrapper options unless explicitly overridden.""" + import nemo_automodel._transformers.retrieval as encoder_module + + class FakeConfig: + model_type = "qwen3" + nemo_retrieval = {"task": "embedding", "pooling": "last", "l2_normalize": False} + + calls = {} + + def fake_auto_config_from_pretrained(*args, **kwargs): + return FakeConfig() + + def fake_build_encoder_backbone( + model_name_or_path, + task, + trust_remote_code=False, + pooling=None, + loaded_config=None, + **hf_kwargs, + ): + calls["model_name_or_path"] = model_name_or_path + calls["task"] = task + calls["pooling"] = pooling + calls["loaded_config"] = loaded_config + return FakeLM(hidden=16) + + monkeypatch.setattr(encoder_module.AutoConfig, "from_pretrained", fake_auto_config_from_pretrained) + monkeypatch.setattr(encoder_module, "build_encoder_backbone", fake_build_encoder_backbone) + + kwargs = {} + if pooling_arg is not None: + kwargs["pooling"] = pooling_arg + if l2_arg is not None: + kwargs["l2_normalize"] = l2_arg + model = BiEncoderModel.build("saved-export", **kwargs) + + assert model.pooling == expected_pooling + assert model.l2_normalize is expected_l2_normalize + assert calls["pooling"] == expected_pooling + assert calls["loaded_config"].nemo_retrieval["pooling"] == "last" + assert model.config.nemo_retrieval == { + "task": "embedding", + "pooling": expected_pooling, + "l2_normalize": expected_l2_normalize, + } + + def test_llama_bidirectional_forward_paths(monkeypatch): cfg = LlamaBidirectionalConfig( vocab_size=64, hidden_size=16, num_hidden_layers=1, num_attention_heads=1, intermediate_size=32, pad_token_id=0 @@ -513,11 +566,16 @@ class FakeCfg: FakeCfg = type("LlamaBidirectionalConfig", (), {}) fake.config = 
FakeCfg() - configure_encoder_metadata(fake, fake.config) + configure_encoder_metadata(fake, fake.config, task="embedding", pooling="last", l2_normalize=False) assert fake.config.architectures == ["LlamaBidirectionalModel"] assert "auto_map" in vars(fake.config) assert "AutoModel" in fake.config.auto_map + assert fake.config.nemo_retrieval == { + "task": "embedding", + "pooling": "last", + "l2_normalize": False, + } def test_init_encoder_common_name_or_path_for_generic(): @@ -535,10 +593,14 @@ def __init__(self): # Use a class name that is NOT a retrieval arch FakeModel.__name__ = "Qwen3Model" - FakeModel = type("Qwen3Model", (nn.Module,), { - "__init__": FakeModel.__init__, - "config": property(lambda self: self._config), - }) + FakeModel = type( + "Qwen3Model", + (nn.Module,), + { + "__init__": FakeModel.__init__, + "config": property(lambda self: self._config), + }, + ) fake = object.__new__(FakeModel) nn.Module.__init__(fake) fake._config = FakeCfg() diff --git a/tests/unit_tests/recipes/test_mine_hard_negatives.py b/tests/unit_tests/recipes/test_mine_hard_negatives.py index 3b0aac4bc0..641fcfd268 100644 --- a/tests/unit_tests/recipes/test_mine_hard_negatives.py +++ b/tests/unit_tests/recipes/test_mine_hard_negatives.py @@ -122,6 +122,15 @@ def test_extract_mining_params_attn_implementation_explicit(value): assert recipe.attn_implementation == value +@pytest.mark.parametrize("unknown_field", ["pooling", "l2_normalize"]) +def test_extract_mining_params_rejects_unknown_fields(unknown_field): + """Unsupported mining keys should fail loudly instead of being ignored.""" + recipe = _make_recipe({unknown_field: "unused"}) + + with pytest.raises(ValueError, match=f"Unknown mining config field\\(s\\): {unknown_field}"): + recipe._extract_mining_params() + + # --------------------------------------------------------------------------- # setup() — model loading with/without attn_implementation # --------------------------------------------------------------------------- 
From 62a9ed3838951349981a38adc49e72832a2eec22 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 19:47:57 +0100 Subject: [PATCH 19/25] fix(retrieval): harden mining workflow Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/llm/retrieval-finetuning.md | 21 ++++- .../data_utils/audit_mined_negatives.py | 26 +++++- .../export_biencoder_with_metadata.py | 91 +++++++++++++++++++ .../retrieval/data_utils/mining_config.yaml | 1 + nemo_automodel/_transformers/retrieval.py | 23 ++++- .../recipes/retrieval/mine_hard_negatives.py | 41 ++++++--- .../_transformers/test_retrieval.py | 43 +++++++++ .../llm/test_audit_mined_negatives.py | 68 +++++++++++++- .../recipes/test_mine_hard_negatives.py | 52 ++++++++++- 9 files changed, 337 insertions(+), 29 deletions(-) create mode 100644 examples/retrieval/data_utils/export_biencoder_with_metadata.py diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 187319319a..e285e30dee 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -593,11 +593,21 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: assembles the final embedding cache and score outputs, so plan memory and local disk accordingly. In multi-node mining, this must be a shared writable path mounted at the same location on every node; node-local cache paths leave rank `0` unable to read remote-rank shards. Use a fresh cache directory for each model, dataset, prefix, sequence - length, and world-size combination; stale cache files can be reused if they are already present. + length, and world-size combination. Set `load_embeddings_from_cache: true` only when you intentionally want to reuse + every cached query shard, corpus chunk, and consolidated embedding file from the same model/input/prefix/length run. 
`pooling` and `l2_normalize` are saved bi-encoder wrapper metadata, not `mining.*` config fields. Do not pass `--mining.pooling` or `--mining.l2_normalize`; the miner rejects unknown mining keys. Mine from a saved bi-encoder export -produced with the wrapper settings you want, or explicitly reload and export the model with those settings before mining. +produced with the wrapper settings you want. For an older export that does not carry this metadata, write a new export +before mining: + +```bash +uv run python examples/retrieval/data_utils/export_biencoder_with_metadata.py \ + /path/to/export_without_metadata \ + /path/to/export_for_mining \ + --pooling last \ + --no-l2-normalize +``` Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. Hard negative mining excludes document IDs listed in each input row's `pos_doc`, but it cannot read an external qrels file or @@ -622,9 +632,10 @@ uv run python examples/retrieval/data_utils/audit_mined_negatives.py \ --output /path/to/mined_audited.json ``` -The audit flags negatives whose IDs also appear in the row's `pos_doc`, duplicate negative IDs in the same row, missing -negative scores, and non-finite negative scores. The cleaned output preserves query lineage fields such as -`original_question_id`, so unrolled examples remain traceable to their source question. +With `--drop-invalid-negatives --output`, the command exits successfully when the cleaned output has no remaining audit +findings. The audit flags and drops negatives whose IDs also appear in the row's `pos_doc`, duplicate negative IDs in the +same row, missing negative scores, and non-finite negative scores. The cleaned output preserves query lineage fields such +as `original_question_id`, so unrolled examples remain traceable to their source question. 
## Save, Resume, and Use the Checkpoint diff --git a/examples/retrieval/data_utils/audit_mined_negatives.py b/examples/retrieval/data_utils/audit_mined_negatives.py index 906c41c9f5..61c8c2fc36 100644 --- a/examples/retrieval/data_utils/audit_mined_negatives.py +++ b/examples/retrieval/data_utils/audit_mined_negatives.py @@ -51,7 +51,7 @@ def audit_records( Args: records: Retrieval training records from the top-level ``data`` field. drop_invalid_negatives: Drop negatives that duplicate positives, duplicate - another negative in the same row, or have a non-finite score. + another negative in the same row, or have a missing/non-finite score. max_findings: Maximum example findings to return. Returns: @@ -98,6 +98,7 @@ def audit_records( score_state = _score_state(neg_doc) if score_state == "missing": summary["missing_negative_score"] += 1 + should_drop = True row_has_findings = True _append_finding(findings, max_findings, row_idx, record, neg_id, "missing_negative_score") elif score_state == "non_finite": @@ -180,7 +181,10 @@ def main() -> int: parser.add_argument( "--drop-invalid-negatives", action="store_true", - help="Drop negatives that are known positives, duplicate row negatives, or have non-finite scores", + help=( + "Drop negatives that are known positives, duplicate row negatives, or have missing/non-finite scores. " + "When used with --output, the exit code is based on the cleaned output." 
+ ), ) parser.add_argument("--max-findings", type=int, default=20, help="Maximum finding examples to print") parser.add_argument( @@ -200,14 +204,26 @@ def main() -> int: max_findings=args.max_findings, ) - print(json.dumps({"summary": summary, "findings": findings}, indent=2)) - + exit_summary = summary + payload = {"summary": summary, "findings": findings} if args.output is not None: output_path = Path(args.output) with open(output_path, "w") as f: json.dump(cleaned_training_data, f, indent=2) - return 1 if summary["total_findings"] and not args.allow_findings else 0 + if args.drop_invalid_negatives: + remaining_summary, _, remaining_findings = audit_training_data( + cleaned_training_data, + drop_invalid_negatives=False, + max_findings=args.max_findings, + ) + payload["remaining_summary"] = remaining_summary + payload["remaining_findings"] = remaining_findings + exit_summary = remaining_summary + + print(json.dumps(payload, indent=2)) + + return 1 if exit_summary["total_findings"] and not args.allow_findings else 0 if __name__ == "__main__": diff --git a/examples/retrieval/data_utils/export_biencoder_with_metadata.py b/examples/retrieval/data_utils/export_biencoder_with_metadata.py new file mode 100644 index 0000000000..00a2f2b212 --- /dev/null +++ b/examples/retrieval/data_utils/export_biencoder_with_metadata.py @@ -0,0 +1,91 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Re-save a bi-encoder export with explicit retrieval wrapper metadata.""" + +import argparse +import logging +from pathlib import Path + +from nemo_automodel import NeMoAutoModelBiEncoder, NeMoAutoTokenizer + +logger = logging.getLogger(__name__) + + +def main() -> int: + """Run the metadata export CLI.""" + parser = argparse.ArgumentParser( + description="Load a HF-loadable bi-encoder export and re-save it with explicit retrieval metadata" + ) + parser.add_argument("input_model", type=str, help="Path or Hugging Face ID for the source bi-encoder export") + parser.add_argument("output_dir", type=str, help="Directory to write the metadata-bearing export") + parser.add_argument( + "--pooling", + type=str, + choices=("avg", "weighted_avg", "cls", "last", "colbert"), + default=None, + help="Pooling strategy to persist. Omit to keep saved metadata or the AutoModel default.", + ) + l2_group = parser.add_mutually_exclusive_group() + l2_group.add_argument( + "--l2-normalize", + dest="l2_normalize", + action="store_true", + help="Persist l2_normalize=true", + ) + l2_group.add_argument( + "--no-l2-normalize", + dest="l2_normalize", + action="store_false", + help="Persist l2_normalize=false", + ) + parser.set_defaults(l2_normalize=None) + parser.add_argument( + "--tokenizer-name-or-path", + type=str, + default=None, + help="Tokenizer path to save with the export. 
Defaults to input_model.", + ) + parser.add_argument("--trust-remote-code", action="store_true", help="Forward trust_remote_code=True to loaders") + parser.add_argument("--torch-dtype", type=str, default="auto", help="Torch dtype forwarded to model loading") + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO) + + model_kwargs = { + "use_liger_kernel": False, + "trust_remote_code": args.trust_remote_code, + "torch_dtype": args.torch_dtype, + } + if args.pooling is not None: + model_kwargs["pooling"] = args.pooling + if args.l2_normalize is not None: + model_kwargs["l2_normalize"] = args.l2_normalize + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + model = NeMoAutoModelBiEncoder.from_pretrained(args.input_model, **model_kwargs) + model.save_pretrained(str(output_dir)) + + tokenizer_path = args.tokenizer_name_or_path or args.input_model + tokenizer = NeMoAutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=args.trust_remote_code) + tokenizer.save_pretrained(str(output_dir)) + + logger.info("Wrote bi-encoder export with retrieval metadata to %s", output_dir) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/retrieval/data_utils/mining_config.yaml b/examples/retrieval/data_utils/mining_config.yaml index ce2d3d5f0a..9e5d31bf9f 100644 --- a/examples/retrieval/data_utils/mining_config.yaml +++ b/examples/retrieval/data_utils/mining_config.yaml @@ -66,6 +66,7 @@ mining: # Caching (required for multi-rank torchrun; use a shared writable path for multi-node). # Use a fresh directory per model, dataset, prefix, sequence-length, and world-size combination. # cache_embeddings_dir: /path/to/cache/ + # Set true only when every cached shard/chunk came from the same model/input/prefix/length run. 
load_embeddings_from_cache: false # Mining parameters diff --git a/nemo_automodel/_transformers/retrieval.py b/nemo_automodel/_transformers/retrieval.py index 4d65f779bd..a383a7dc6c 100644 --- a/nemo_automodel/_transformers/retrieval.py +++ b/nemo_automodel/_transformers/retrieval.py @@ -15,7 +15,9 @@ """Encoder models for bi-encoder and cross-encoder tasks.""" import inspect +import json import os +from pathlib import Path from typing import Optional import torch @@ -41,6 +43,23 @@ def _get_retrieval_metadata(config) -> dict: return metadata if isinstance(metadata, dict) else {} +def _load_encoder_config(model_name_or_path: str, trust_remote_code: bool = False): + """Load an encoder config and merge AutoModel v5 metadata when present.""" + config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code) + model_path = Path(model_name_or_path) + v5_config_path = model_path / "config.v5.json" + if not v5_config_path.exists(): + return config + + with open(v5_config_path, "r") as f: + v5_config = json.load(f) + + metadata = v5_config.get(_RETRIEVAL_METADATA_KEY) + if isinstance(metadata, dict): + setattr(config, _RETRIEVAL_METADATA_KEY, metadata) + return config + + def _coerce_bool(value) -> bool: """Coerce booleans from config values while rejecting ambiguous strings.""" if isinstance(value, bool): @@ -311,7 +330,7 @@ def build_encoder_backbone( """ config = loaded_config if config is None: - config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code) + config = _load_encoder_config(model_name_or_path, trust_remote_code=trust_remote_code) model_type = getattr(config, "model_type", "") if extract_submodel is not None: @@ -453,7 +472,7 @@ def build( logger.info(f"Building BiEncoderModel from {model_name_or_path}") - config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code) + config = _load_encoder_config(model_name_or_path, trust_remote_code=trust_remote_code) pooling, 
l2_normalize = _resolve_bi_encoder_options(config, pooling, l2_normalize) backbone = build_encoder_backbone( model_name_or_path, diff --git a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py index 0ac781f60a..b977e7ee72 100644 --- a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py +++ b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py @@ -17,6 +17,7 @@ from __future__ import annotations import logging +import math from pathlib import Path from typing import Any, Dict, List, Optional, Tuple @@ -507,6 +508,9 @@ def _encode_texts( Returns: numpy array of embeddings [num_texts, embedding_dim]. """ + if not texts: + return np.empty((0, 0), dtype=np.float32) + embeddings = [] num_texts = len(texts) num_batches = (num_texts + batch_size - 1) // batch_size @@ -590,7 +594,7 @@ def _encode_queries_sharded(self) -> np.ndarray: shard_path = shard_dir / f"queries_rank{r:04d}.npz" # Compute or load this rank's shard - if shard_path.exists(): + if self.load_embeddings_from_cache and shard_path.exists(): local_embeds = _load_npz_array(shard_path) else: local_texts = self.questions[local_start:local_end] @@ -619,9 +623,10 @@ def _encode_queries_sharded(self) -> np.ndarray: rr_start, rr_end = _compute_rank_partition(num_q, ws, rr) expected = rr_end - rr_start _validate_shard_shape(rr_path, expected, rr_emb.shape[0]) - parts.append(rr_emb) + if expected > 0: + parts.append(rr_emb) - return np.concatenate(parts, axis=0) + return np.concatenate(parts, axis=0) if parts else np.empty((0, 0), dtype=np.float32) def _load_cached_chunk(self, cache_path: Path) -> Optional[np.ndarray]: """Load a fully-assembled chunk cache if it exists. @@ -634,7 +639,7 @@ def _load_cached_chunk(self, cache_path: Path) -> Optional[np.ndarray]: Returns: Cached embeddings array, or None if cache doesn't exist. 
""" - if cache_path is None or not cache_path.exists(): + if not self.load_embeddings_from_cache or cache_path is None or not cache_path.exists(): return None # In distributed runs, only rank0 needs the assembled chunk @@ -671,7 +676,7 @@ def _encode_chunk_distributed( rank_cache_path = cache_path.parent / f"{cache_path.stem}_rank{r:04d}{cache_path.suffix}" # Compute or load this rank's slice - if rank_cache_path.exists(): + if self.load_embeddings_from_cache and rank_cache_path.exists(): local_embeds = _load_npz_array(rank_cache_path) else: local_texts = texts[local_start:local_end] @@ -698,9 +703,10 @@ def _encode_chunk_distributed( rr_emb = _load_npz_array(rr_path) expected = rr_end - rr_start _validate_shard_shape(rr_path, expected, rr_emb.shape[0]) - parts.append(rr_emb) + if expected > 0: + parts.append(rr_emb) - embeddings = np.concatenate(parts, axis=0) + embeddings = np.concatenate(parts, axis=0) if parts else np.empty((0, 0), dtype=np.float32) # Save assembled chunk for faster reuse next time np.savez(cache_path, embeddings) return embeddings @@ -810,7 +816,7 @@ def _encode_all_documents(self) -> np.ndarray: chunk_idx += 1 if self.dist_env.is_main: - return np.concatenate(all_embeddings, axis=0) + return np.concatenate(all_embeddings, axis=0) if all_embeddings else np.empty((0, 0), dtype=np.float32) return np.empty((0, 0), dtype=np.float32) def _load_embeddings_from_cache(self) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]: @@ -1008,7 +1014,8 @@ def _mine_hard_negatives( - pos_scores: Similarity scores for each positive document """ # Convert document embeddings to tensor once (encoder embeddings are 2D) - doc_embeddings_tensor = torch.tensor(document_embeddings, device="cuda") + device = self.dist_env.device + doc_embeddings_tensor = torch.tensor(document_embeddings, device=device) neg_indices_all = [] neg_scores_all = [] @@ -1026,7 +1033,7 @@ def _mine_hard_negatives( batch_pos_indices = pos_doc_indices[start_idx:end_idx] # Compute similarity 
scores: [batch_size, num_docs] - batch_query_tensor = torch.tensor(batch_query_embs, device="cuda") + batch_query_tensor = torch.tensor(batch_query_embs, device=device) batch_scores = batch_query_tensor @ doc_embeddings_tensor.T # Extract positive scores and mask positives @@ -1056,7 +1063,7 @@ def _mine_hard_negatives( # Vectorized margin filtering if hard_neg_margin is not None: - min_pos_tensor = torch.tensor(min_pos_scores, device="cuda") + min_pos_tensor = torch.tensor(min_pos_scores, device=device) if hard_neg_margin_type.lower() == "abs": threshold = torch.unsqueeze(min_pos_tensor - hard_neg_margin, dim=1) @@ -1073,14 +1080,20 @@ def _mine_hard_negatives( k = min(num_negs * TOPK_BUFFER_MULTIPLIER, batch_scores.shape[1]) topk = batch_scores.topk(k=k, dim=1) topk_indices = topk.indices.tolist() + topk_scores = topk.values.tolist() # Post-process: remove any remaining positives and limit to num_negs for i, query_pos_indices in enumerate(batch_pos_indices): pos_set = set(query_pos_indices) - # Filter out positives from top-k candidates - hard_neg_candidates = [idx for idx in topk_indices[i] if idx not in pos_set] - hard_neg_scores = [batch_scores[i, idx].item() for idx in topk_indices[i] if idx not in pos_set] + # Filter out positives and candidates removed by margin filtering. 
+ hard_neg_candidates = [] + hard_neg_scores = [] + for idx, score in zip(topk_indices[i], topk_scores[i]): + if idx in pos_set or not math.isfinite(score): + continue + hard_neg_candidates.append(idx) + hard_neg_scores.append(score) # Limit to num_negs neg_indices_all.append(hard_neg_candidates[:num_negs]) diff --git a/tests/unit_tests/_transformers/test_retrieval.py b/tests/unit_tests/_transformers/test_retrieval.py index 5c35f5f24d..9dfd8a556b 100644 --- a/tests/unit_tests/_transformers/test_retrieval.py +++ b/tests/unit_tests/_transformers/test_retrieval.py @@ -222,3 +222,46 @@ def test_extract_submodel_without_config_raises(): with pytest.raises(ValueError, match="has no .config attribute"): _extract_submodel(model, "language_model") + + +def test_load_encoder_config_merges_v5_retrieval_metadata(tmp_path): + """v4-compatible exports keep AutoModel metadata in config.v5.json.""" + from nemo_automodel._transformers.retrieval import _load_encoder_config + + (tmp_path / "config.json").write_text(json.dumps({"model_type": "bert"})) + (tmp_path / "config.v5.json").write_text( + json.dumps( + { + "model_type": "bert", + "nemo_retrieval": {"task": "embedding", "pooling": "last", "l2_normalize": False}, + } + ) + ) + + config = _load_encoder_config(str(tmp_path)) + + assert config.nemo_retrieval == {"task": "embedding", "pooling": "last", "l2_normalize": False} + + +def test_nemo_auto_biencoder_defaults_do_not_override_saved_metadata(monkeypatch): + """The public AutoModel entry point should defer pooling/l2 defaults to the saved config.""" + from nemo_automodel._transformers import auto_model + + captured = {} + + def fake_base_from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + captured["pretrained_model_name_or_path"] = pretrained_model_name_or_path + captured["kwargs"] = kwargs + return object() + + monkeypatch.setattr( + auto_model._NeMoAutoModelForRetrievalBase, + "from_pretrained", + classmethod(fake_base_from_pretrained), + ) + + 
auto_model.NeMoAutoModelBiEncoder.from_pretrained("saved-export") + + assert captured["pretrained_model_name_or_path"] == "saved-export" + assert captured["kwargs"]["pooling"] is None + assert captured["kwargs"]["l2_normalize"] is None diff --git a/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py b/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py index c5e07c396d..cea41cdfdf 100644 --- a/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py +++ b/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py @@ -12,7 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import json +import sys + from examples.retrieval.data_utils.audit_mined_negatives import audit_training_data +from examples.retrieval.data_utils.audit_mined_negatives import main as audit_main def test_audit_mined_negatives_reports_and_cleans_invalid_rows(): @@ -46,7 +50,7 @@ def test_audit_mined_negatives_reports_and_cleans_invalid_rows(): "duplicate_negative": 1, "missing_negative_score": 1, "non_finite_negative_score": 1, - "dropped_negatives": 3, + "dropped_negatives": 4, "total_findings": 4, } assert cleaned["corpus"] == training_data["corpus"] @@ -57,7 +61,7 @@ def test_audit_mined_negatives_reports_and_cleans_invalid_rows(): "question": "Which document is positive?", "corpus_id": "demo", "pos_doc": [{"id": 1}], - "neg_doc": [{"id": "2", "score": 0.4}, {"id": "4"}], + "neg_doc": [{"id": "2", "score": 0.4}], } ] assert findings[0]["original_question_id"] == "q0" @@ -82,3 +86,63 @@ def test_audit_mined_negatives_preserves_records_without_findings(): assert summary["total_findings"] == 0 assert cleaned == training_data assert findings == [] + + +def test_audit_cli_writes_cleaned_output_and_exits_zero(tmp_path, monkeypatch, capsys): + input_file = tmp_path / "mined.json" + output_file = tmp_path / "cleaned.json" + input_file.write_text( + json.dumps( + { + "corpus": {"path": "/corpus"}, + "data": [ + { + 
"question_id": "q0", + "question": "Which document is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": "1"}], + "neg_doc": [{"id": "1", "score": 1.0}, {"id": "2"}], + } + ], + } + ) + ) + monkeypatch.setattr( + sys, + "argv", + [ + "audit_mined_negatives.py", + str(input_file), + "--drop-invalid-negatives", + "--output", + str(output_file), + ], + ) + + assert audit_main() == 0 + report = json.loads(capsys.readouterr().out) + cleaned = json.loads(output_file.read_text()) + + assert report["summary"]["total_findings"] == 2 + assert report["remaining_summary"]["total_findings"] == 0 + assert cleaned["data"][0]["neg_doc"] == [] + + +def test_audit_cli_exits_nonzero_when_findings_remain(tmp_path, monkeypatch): + input_file = tmp_path / "mined.json" + input_file.write_text( + json.dumps( + { + "data": [ + { + "question_id": "q0", + "pos_doc": [{"id": "1"}], + "neg_doc": [{"id": "1", "score": 1.0}], + } + ] + } + ) + ) + monkeypatch.setattr(sys, "argv", ["audit_mined_negatives.py", str(input_file)]) + + assert audit_main() == 1 diff --git a/tests/unit_tests/recipes/test_mine_hard_negatives.py b/tests/unit_tests/recipes/test_mine_hard_negatives.py index 641fcfd268..12f8e3ace2 100644 --- a/tests/unit_tests/recipes/test_mine_hard_negatives.py +++ b/tests/unit_tests/recipes/test_mine_hard_negatives.py @@ -17,7 +17,9 @@ import json from unittest.mock import MagicMock, patch +import numpy as np import pytest +import torch from nemo_automodel.components.config.loader import ConfigNode from nemo_automodel.recipes.retrieval.mine_hard_negatives import MINING_DEFAULTS, MineHardNegativesRecipe @@ -122,7 +124,7 @@ def test_extract_mining_params_attn_implementation_explicit(value): assert recipe.attn_implementation == value -@pytest.mark.parametrize("unknown_field", ["pooling", "l2_normalize"]) +@pytest.mark.parametrize("unknown_field", ["pooling", "l2_normalize", "query_prefx", "hard_negative_to_mine"]) def test_extract_mining_params_rejects_unknown_fields(unknown_field): 
"""Unsupported mining keys should fail loudly instead of being ignored.""" recipe = _make_recipe({unknown_field: "unused"}) @@ -203,6 +205,8 @@ def test_write_output_preserves_original_question_id(tmp_path): recipe._write_output() output = json.loads(output_file.read_text()) + assert output["corpus"] == {"path": "/fake/corpus"} + assert output["mining"] == {"args": {}} assert output["data"] == [ { "question_id": "q0_0", @@ -213,3 +217,49 @@ def test_write_output_preserves_original_question_id(tmp_path): "neg_doc": [{"id": "n1", "score": 0.2}], } ] + + +def test_encode_texts_empty_input_returns_empty_array(): + recipe = _make_recipe() + + embeddings = recipe._encode_texts(texts=[], batch_size=2, max_length=16) + + assert embeddings.shape == (0, 0) + assert embeddings.dtype == np.float32 + + +def test_load_cached_chunk_ignored_when_cache_loading_disabled(tmp_path): + recipe = _make_recipe({"load_embeddings_from_cache": False}) + recipe.load_embeddings_from_cache = False + cache_path = tmp_path / "chunk_0000.npz" + np.savez(cache_path, np.ones((1, 2), dtype=np.float32)) + + assert recipe._load_cached_chunk(cache_path) is None + + +def test_mine_hard_negatives_drops_margin_filtered_candidates(): + recipe = _make_recipe() + recipe.dist_env = MagicMock(device=torch.device("cpu")) + query_embeddings = np.array([[1.0, 0.0]], dtype=np.float32) + document_embeddings = np.array( + [ + [1.0, 0.0], + [0.96, 0.0], + [0.2, 0.0], + ], + dtype=np.float32, + ) + + neg_indices, neg_scores, pos_scores = recipe._mine_hard_negatives( + query_embeddings=query_embeddings, + document_embeddings=document_embeddings, + pos_doc_indices=[[0]], + batch_size=1, + num_negs=2, + hard_neg_margin=0.95, + hard_neg_margin_type="perc", + ) + + assert neg_indices == [[2]] + assert neg_scores[0][0] == pytest.approx(0.2) + assert pos_scores[0][0] == pytest.approx(1.0) From 95eb04131ab259efc7d7fd2d9eb2211be743c785 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy 
<1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 20:13:36 +0100 Subject: [PATCH 20/25] fix(retrieval): validate mining cache reuse Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/llm/retrieval-finetuning.md | 26 ++- .../export_biencoder_with_metadata.py | 7 +- .../retrieval/data_utils/mining_config.yaml | 6 +- nemo_automodel/_transformers/retrieval.py | 65 +++++- .../recipes/retrieval/mine_hard_negatives.py | 188 +++++++++++++++++- .../_transformers/test_retrieval.py | 48 +++++ .../recipes/test_mine_hard_negatives.py | 144 +++++++++++++- 7 files changed, 455 insertions(+), 29 deletions(-) diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index e285e30dee..651ea2e46a 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -309,7 +309,9 @@ dataloader: Important knobs: -- `pooling`: controls how token hidden states become one embedding. Common choices are `avg`, `cls`, and `last`. +- `pooling`: controls how token hidden states become one embedding. Common single-vector choices are `avg`, `cls`, + `last`, and `weighted_avg`. The hard-negative miner supports only single-vector pooling modes; do not mine with + `colbert` pooling, which returns token-level embeddings. - `l2_normalize`: normalizes embeddings before scoring. When enabled, the recipe divides scores by `temperature`. - `q_max_len` and `p_max_len`: set separate truncation lengths for queries and passages. - `query_prefix` and `passage_prefix`: add task-specific text before tokenization. Keep these aligned between training, @@ -571,6 +573,12 @@ Hard-negative mining expects the corpus ID-based retrieval JSON format described JSONL shortcut. The input must reference one corpus so the miner can build a passage embedding cache, retrieve candidates, and write mined negatives back to each query. 
+The quickstart configs use `hf://` sources for the first train/eval path. The miner currently reads a local +corpus-backed retrieval JSON file instead of `hf://` URIs directly. For a train -> mine -> retrain loop, first +materialize or preprocess your selected HF subset into the corpus ID JSON schema from +[Retrieval Dataset](retrieval-dataset.md), then set `--mining.train_qa_file_path` to that local JSON file. The mining +commands below assume that local corpus-backed input. + Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: - `hard_negatives_to_mine`: target number of negatives to add per query. The miner can return fewer when the corpus has @@ -593,8 +601,10 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: assembles the final embedding cache and score outputs, so plan memory and local disk accordingly. In multi-node mining, this must be a shared writable path mounted at the same location on every node; node-local cache paths leave rank `0` unable to read remote-rank shards. Use a fresh cache directory for each model, dataset, prefix, sequence - length, and world-size combination. Set `load_embeddings_from_cache: true` only when you intentionally want to reuse - every cached query shard, corpus chunk, and consolidated embedding file from the same model/input/prefix/length run. + length, and world-size combination. The miner validates a cache fingerprint before reuse, but fresh run-specific + paths are still easier to reason about. Set `load_embeddings_from_cache: true` only when you intentionally want to + reuse every cached query shard, corpus chunk, and consolidated embedding file from the same + model/input/prefix/length/world-size run. `pooling` and `l2_normalize` are saved bi-encoder wrapper metadata, not `mining.*` config fields. Do not pass `--mining.pooling` or `--mining.l2_normalize`; the miner rejects unknown mining keys. 
Mine from a saved bi-encoder export @@ -609,6 +619,10 @@ uv run python examples/retrieval/data_utils/export_biencoder_with_metadata.py \ --no-l2-normalize ``` +Hard-negative mining parallelizes embedding generation across ranks, but the final exact scoring step still runs on +rank `0` and materializes the full document embedding matrix there. For very large corpora, use a smaller mining slice +or a custom ANN/blockwise mining workflow instead of expecting this helper to scale to web-scale indexing. + Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. Hard negative mining excludes document IDs listed in each input row's `pos_doc`, but it cannot read an external qrels file or know every semantically relevant duplicate. Put all known positive IDs for the query in the mining input, deduplicate the @@ -620,9 +634,13 @@ Run the audit utility before reusing mined output: ```bash uv run python examples/retrieval/data_utils/audit_mined_negatives.py \ - /path/to/mined.json + /path/to/mined.json \ + --allow-findings ``` +`--allow-findings` keeps this first inspection command from failing the shell when it finds issues. Omit it in CI or +quality gates when findings should fail the job. + If the report only contains issues that you want to drop automatically, write a cleaned copy: ```bash diff --git a/examples/retrieval/data_utils/export_biencoder_with_metadata.py b/examples/retrieval/data_utils/export_biencoder_with_metadata.py index 00a2f2b212..ec149dc455 100644 --- a/examples/retrieval/data_utils/export_biencoder_with_metadata.py +++ b/examples/retrieval/data_utils/export_biencoder_with_metadata.py @@ -33,9 +33,12 @@ def main() -> int: parser.add_argument( "--pooling", type=str, - choices=("avg", "weighted_avg", "cls", "last", "colbert"), + choices=("avg", "weighted_avg", "cls", "last"), default=None, - help="Pooling strategy to persist. 
Omit to keep saved metadata or the AutoModel default.", + help=( + "Single-vector pooling strategy to persist. Omit to keep saved metadata or the AutoModel default. " + "ColBERT/token-level pooling is not supported by the hard-negative miner." + ), ) l2_group = parser.add_mutually_exclusive_group() l2_group.add_argument( diff --git a/examples/retrieval/data_utils/mining_config.yaml b/examples/retrieval/data_utils/mining_config.yaml index 9e5d31bf9f..e743638925 100644 --- a/examples/retrieval/data_utils/mining_config.yaml +++ b/examples/retrieval/data_utils/mining_config.yaml @@ -65,8 +65,9 @@ mining: # Caching (required for multi-rank torchrun; use a shared writable path for multi-node). # Use a fresh directory per model, dataset, prefix, sequence-length, and world-size combination. + # The miner validates cache metadata before reuse, but fresh run-specific paths are still clearest. # cache_embeddings_dir: /path/to/cache/ - # Set true only when every cached shard/chunk came from the same model/input/prefix/length run. + # Set true only when every cached shard/chunk came from the same model/input/prefix/length/world-size run. load_embeddings_from_cache: false # Mining parameters @@ -92,8 +93,9 @@ mining: passage_max_length: 512 # pooling and l2_normalize are saved model metadata, not mining config keys. # Do not add mining.pooling or mining.l2_normalize here; the miner rejects unknown keys. + # The miner supports single-vector pooling modes only (avg, weighted_avg, cls, last), not ColBERT pooling. # Whether to include negatives from the input file. 
Existing negatives are prepended to the mined output; # deduplicate/audit the final file before training: - # uv run python examples/retrieval/data_utils/audit_mined_negatives.py /path/to/output.json + # uv run python examples/retrieval/data_utils/audit_mined_negatives.py /path/to/output.json --allow-findings use_negatives_from_file: false diff --git a/nemo_automodel/_transformers/retrieval.py b/nemo_automodel/_transformers/retrieval.py index a383a7dc6c..95d51fcb9a 100644 --- a/nemo_automodel/_transformers/retrieval.py +++ b/nemo_automodel/_transformers/retrieval.py @@ -23,6 +23,7 @@ import torch import torch.nn as nn import torch.nn.functional as F +from huggingface_hub import hf_hub_download from transformers import AutoConfig, AutoModel, AutoModelForSequenceClassification, PreTrainedModel from transformers.models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING from transformers.utils import logging @@ -35,6 +36,16 @@ _RETRIEVAL_METADATA_KEY = "nemo_retrieval" _BI_ENCODER_DEFAULT_POOLING = "avg" _BI_ENCODER_DEFAULT_L2_NORMALIZE = True +_HF_HUB_DOWNLOAD_CONFIG_KWARGS = { + "cache_dir", + "force_download", + "local_files_only", + "repo_type", + "revision", + "subfolder", + "token", +} +_AUTO_CONFIG_LOAD_KWARGS = _HF_HUB_DOWNLOAD_CONFIG_KWARGS - {"repo_type"} def _get_retrieval_metadata(config) -> dict: @@ -43,20 +54,54 @@ def _get_retrieval_metadata(config) -> dict: return metadata if isinstance(metadata, dict) else {} -def _load_encoder_config(model_name_or_path: str, trust_remote_code: bool = False): - """Load an encoder config and merge AutoModel v5 metadata when present.""" - config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code) - model_path = Path(model_name_or_path) - v5_config_path = model_path / "config.v5.json" - if not v5_config_path.exists(): - return config - +def _load_v5_retrieval_metadata(config, v5_config_path: str | Path): + """Merge retrieval wrapper metadata from an AutoModel v5 config 
file.""" with open(v5_config_path, "r") as f: v5_config = json.load(f) metadata = v5_config.get(_RETRIEVAL_METADATA_KEY) if isinstance(metadata, dict): setattr(config, _RETRIEVAL_METADATA_KEY, metadata) + + +def _get_hf_hub_download_kwargs(hf_kwargs: dict) -> dict: + """Return kwargs that ``hf_hub_download`` accepts for config sidecars.""" + return {key: value for key, value in hf_kwargs.items() if key in _HF_HUB_DOWNLOAD_CONFIG_KWARGS} + + +def _get_auto_config_load_kwargs(hf_kwargs: dict) -> dict: + """Return kwargs that ``AutoConfig.from_pretrained`` should receive.""" + return {key: value for key, value in hf_kwargs.items() if key in _AUTO_CONFIG_LOAD_KWARGS} + + +def _load_encoder_config(model_name_or_path: str, trust_remote_code: bool = False, **hf_kwargs): + """Load an encoder config and merge AutoModel v5 metadata when present.""" + model_id = str(model_name_or_path) + config = AutoConfig.from_pretrained( + model_id, + trust_remote_code=trust_remote_code, + **_get_auto_config_load_kwargs(hf_kwargs), + ) + model_path = Path(model_id) + v5_config_path = model_path / "config.v5.json" + if v5_config_path.exists(): + _load_v5_retrieval_metadata(config, v5_config_path) + return config + + # Local directory exports write config.v5.json beside config.json. Hub exports need + # an explicit sidecar download because AutoConfig only loads config.json. 
+ if os.path.isdir(model_id) or "/" not in model_id: + return config + try: + hub_v5_config_path = hf_hub_download( + repo_id=model_id, + filename="config.v5.json", + **_get_hf_hub_download_kwargs(hf_kwargs), + ) + except Exception: + return config + + _load_v5_retrieval_metadata(config, hub_v5_config_path) return config @@ -330,7 +375,7 @@ def build_encoder_backbone( """ config = loaded_config if config is None: - config = _load_encoder_config(model_name_or_path, trust_remote_code=trust_remote_code) + config = _load_encoder_config(model_name_or_path, trust_remote_code=trust_remote_code, **hf_kwargs) model_type = getattr(config, "model_type", "") if extract_submodel is not None: @@ -472,7 +517,7 @@ def build( logger.info(f"Building BiEncoderModel from {model_name_or_path}") - config = _load_encoder_config(model_name_or_path, trust_remote_code=trust_remote_code) + config = _load_encoder_config(model_name_or_path, trust_remote_code=trust_remote_code, **hf_kwargs) pooling, l2_normalize = _resolve_bi_encoder_options(config, pooling, l2_normalize) backbone = build_encoder_backbone( model_name_or_path, diff --git a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py index b977e7ee72..f691b7d2b9 100644 --- a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py +++ b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py @@ -16,6 +16,8 @@ from __future__ import annotations +import hashlib +import json import logging import math from pathlib import Path @@ -38,6 +40,8 @@ # Cache file names QUERY_EMBEDDINGS_FNAME = "query_embeddings.npz" DOCUMENT_EMBEDDINGS_FNAME = "passage_embeddings.npz" +EMBEDDINGS_CACHE_METADATA_FNAME = "embedding_cache_metadata.json" +EMBEDDINGS_CACHE_METADATA_VERSION = 1 CORPUS_CHUNKS_DIR = "corpus_chunks" QUERY_SHARDS_DIR = "query_shards" @@ -209,6 +213,7 @@ def __init__(self, cfg): # Embeddings (populated by _generate_embeddings) self.query_embeddings = None self.document_embeddings = 
None + self._reuse_partial_embedding_cache = False # Mining results (populated by _mine_hard_negatives) self.mined_neg_indices = None # List[List[int]] - mined negative indices per query @@ -252,6 +257,12 @@ def setup(self): ) self.model = self.model.to(self.dist_env.device) self.model.eval() + if getattr(self.model, "pooling", None) == "colbert": + raise ValueError( + "Hard negative mining supports single-vector bi-encoder pooling modes only " + "('avg', 'weighted_avg', 'cls', or 'last'). ColBERT pooling returns token-level " + "embeddings and is not supported by this miner." + ) # Load and configure tokenizer self._configure_tokenizer() @@ -594,7 +605,7 @@ def _encode_queries_sharded(self) -> np.ndarray: shard_path = shard_dir / f"queries_rank{r:04d}.npz" # Compute or load this rank's shard - if self.load_embeddings_from_cache and shard_path.exists(): + if self._should_load_partial_cache(shard_path): local_embeds = _load_npz_array(shard_path) else: local_texts = self.questions[local_start:local_end] @@ -639,7 +650,7 @@ def _load_cached_chunk(self, cache_path: Path) -> Optional[np.ndarray]: Returns: Cached embeddings array, or None if cache doesn't exist. 
""" - if not self.load_embeddings_from_cache or cache_path is None or not cache_path.exists(): + if not self._should_load_partial_cache(cache_path): return None # In distributed runs, only rank0 needs the assembled chunk @@ -676,7 +687,7 @@ def _encode_chunk_distributed( rank_cache_path = cache_path.parent / f"{cache_path.stem}_rank{r:04d}{cache_path.suffix}" # Compute or load this rank's slice - if self.load_embeddings_from_cache and rank_cache_path.exists(): + if self._should_load_partial_cache(rank_cache_path): local_embeds = _load_npz_array(rank_cache_path) else: local_texts = texts[local_start:local_end] @@ -819,6 +830,143 @@ def _encode_all_documents(self) -> np.ndarray: return np.concatenate(all_embeddings, axis=0) if all_embeddings else np.empty((0, 0), dtype=np.float32) return np.empty((0, 0), dtype=np.float32) + def _hash_cache_values(self, values: Optional[List[Any]]) -> str: + """Build a stable digest for ordered cache identity values.""" + hasher = hashlib.sha256() + for value in values or []: + hasher.update(str(value).encode("utf-8")) + hasher.update(b"\0") + return hasher.hexdigest() + + def _get_ordered_doc_ids(self) -> List[Any]: + """Return document IDs in embedding row order.""" + if isinstance(self.idx_to_doc, dict): + return [self.idx_to_doc[idx] for idx in sorted(self.idx_to_doc)] + return list(self.idx_to_doc or []) + + def _get_model_pooling(self): + """Return pooling metadata from the live model or cached model metadata.""" + if hasattr(self, "_model_pooling"): + return self._model_pooling + if self.model is None: + return None + return getattr(self.model, "pooling", None) + + def _get_model_l2_normalize(self): + """Return l2 metadata from the live model or cached model metadata.""" + if hasattr(self, "_model_l2_normalize"): + return self._model_l2_normalize + if self.model is None: + return None + return getattr(self.model, "l2_normalize", None) + + def _build_cache_fingerprint(self) -> Dict[str, Any]: + """Build the cache identity used 
to decide whether embeddings are reusable.""" + return { + "model_name_or_path": str(self.model_name_or_path), + "tokenizer_name_or_path": str(self.tokenizer_name_or_path), + "train_qa_file_path": str(self.train_qa_file_path), + "corpus_path": str(self.corpus_path), + "query_prefix": self.query_prefix, + "passage_prefix": self.passage_prefix, + "query_max_length": self.query_max_length, + "passage_max_length": self.passage_max_length, + "add_bos_token": self.add_bos_token, + "add_eos_token": self.add_eos_token, + "attn_implementation": self.attn_implementation, + "pooling": self._get_model_pooling(), + "l2_normalize": self._get_model_l2_normalize(), + "world_size": getattr(self.dist_env, "world_size", 1), + "num_questions": len(self.questions or []), + "num_documents": len(self.idx_to_doc or {}), + "question_ids_hash": self._hash_cache_values(self.question_ids), + "questions_hash": self._hash_cache_values(self.questions), + "document_ids_hash": self._hash_cache_values(self._get_ordered_doc_ids()), + } + + def _cache_metadata_path(self) -> Optional[Path]: + """Return the path to the embedding cache metadata file.""" + if not self.cache_embeddings_dir: + return None + return Path(self.cache_embeddings_dir) / EMBEDDINGS_CACHE_METADATA_FNAME + + def _load_cache_metadata(self) -> Optional[Dict[str, Any]]: + """Load embedding cache metadata if it exists and is valid JSON.""" + metadata_path = self._cache_metadata_path() + if metadata_path is None or not metadata_path.exists(): + return None + try: + with open(metadata_path, "r") as f: + metadata = json.load(f) + except json.JSONDecodeError: + logger.warning("Ignoring invalid embedding cache metadata at %s", metadata_path) + return None + return metadata if isinstance(metadata, dict) else None + + def _cache_metadata_matches( + self, + metadata: Optional[Dict[str, Any]], + query_shape: Optional[Tuple[int, ...]] = None, + document_shape: Optional[Tuple[int, ...]] = None, + ) -> bool: + """Check whether cache metadata 
matches the current mining run.""" + if metadata is None: + logger.info("Embedding cache metadata is missing; recomputing embeddings.") + return False + if metadata.get("version") != EMBEDDINGS_CACHE_METADATA_VERSION: + logger.info("Embedding cache metadata version changed; recomputing embeddings.") + return False + if metadata.get("fingerprint") != self._build_cache_fingerprint(): + logger.info("Embedding cache fingerprint does not match this mining run; recomputing embeddings.") + return False + + expected_query_shape = metadata.get("query_shape") + expected_document_shape = metadata.get("document_shape") + if ( + not isinstance(expected_query_shape, list) + or not expected_query_shape + or not isinstance(expected_document_shape, list) + or not expected_document_shape + ): + logger.info("Embedding cache metadata is missing shape information; recomputing embeddings.") + return False + if expected_query_shape[0] != len(self.questions or []): + logger.info("Query cache metadata shape does not match the current input; recomputing embeddings.") + return False + if expected_document_shape[0] != len(self.idx_to_doc or {}): + logger.info("Document cache metadata shape does not match the current corpus; recomputing embeddings.") + return False + if query_shape is not None and expected_query_shape != list(query_shape): + logger.info("Cached query embedding file shape differs from metadata; recomputing embeddings.") + return False + if document_shape is not None and expected_document_shape != list(document_shape): + logger.info("Cached document embedding file shape differs from metadata; recomputing embeddings.") + return False + return True + + def _write_cache_metadata(self, query_embeddings: np.ndarray, document_embeddings: np.ndarray) -> None: + """Write embedding cache metadata after consolidated embeddings are saved.""" + metadata_path = self._cache_metadata_path() + if metadata_path is None: + return + metadata = { + "version": EMBEDDINGS_CACHE_METADATA_VERSION, + 
"fingerprint": self._build_cache_fingerprint(), + "query_shape": list(query_embeddings.shape), + "document_shape": list(document_embeddings.shape), + } + with open(metadata_path, "w") as f: + json.dump(metadata, f, indent=2, sort_keys=True) + + def _should_load_partial_cache(self, cache_path: Optional[Path]) -> bool: + """Return whether query shard or corpus chunk caches may be reused.""" + return ( + self.load_embeddings_from_cache + and self._reuse_partial_embedding_cache + and cache_path is not None + and cache_path.exists() + ) + def _load_embeddings_from_cache(self) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]: """Load query and document embeddings from cache. @@ -837,19 +985,28 @@ def _load_embeddings_from_cache(self) -> Tuple[Optional[np.ndarray], Optional[np query_embeddings = _load_npz_array(query_path) doc_embeddings = _load_npz_array(doc_path) + metadata = self._load_cache_metadata() + if not self._cache_metadata_matches(metadata, query_embeddings.shape, doc_embeddings.shape): + return None, None return query_embeddings, doc_embeddings def _has_full_embeddings_cache(self) -> bool: - """Check if the consolidated (rank0) embedding cache exists. - - This is intentionally a lightweight existence check (no file reads), used to avoid - redundant IO on non-main ranks in distributed runs. 
- """ + """Check if the consolidated (rank0) embedding cache matches this run.""" if not self.cache_embeddings_dir: return False cache_dir = Path(self.cache_embeddings_dir) - return (cache_dir / QUERY_EMBEDDINGS_FNAME).exists() and (cache_dir / DOCUMENT_EMBEDDINGS_FNAME).exists() + query_path = cache_dir / QUERY_EMBEDDINGS_FNAME + doc_path = cache_dir / DOCUMENT_EMBEDDINGS_FNAME + if not query_path.exists() or not doc_path.exists(): + return False + try: + query_shape = _load_npz_array(query_path).shape + doc_shape = _load_npz_array(doc_path).shape + except Exception: + logger.warning("Ignoring unreadable embedding cache under %s", cache_dir) + return False + return self._cache_metadata_matches(self._load_cache_metadata(), query_shape, doc_shape) def _save_embeddings_to_cache( self, @@ -870,6 +1027,7 @@ def _save_embeddings_to_cache( np.savez(cache_dir / QUERY_EMBEDDINGS_FNAME, query_embeddings) np.savez(cache_dir / DOCUMENT_EMBEDDINGS_FNAME, document_embeddings) + self._write_cache_metadata(query_embeddings, document_embeddings) logger.info(f"Saved embeddings to cache: {cache_dir}") @@ -886,6 +1044,7 @@ def _generate_embeddings(self) -> Tuple[np.ndarray, np.ndarray]: # In distributed runs, only rank0 needs the consolidated embeddings for mining. # To avoid redundant IO, rank0 checks for cache presence and broadcasts a cache_hit flag; # only rank0 reads the cache files. 
+ self._reuse_partial_embedding_cache = self.load_embeddings_from_cache if self.load_embeddings_from_cache and self.cache_embeddings_dir: cache_hit = False if self.dist_env.world_size > 1 and torch.distributed.is_initialized(): @@ -903,7 +1062,8 @@ def _generate_embeddings(self) -> Tuple[np.ndarray, np.ndarray]: if self.dist_env.is_main: logger.info("Loading embeddings from cache (rank0 only)...") query_embeddings, document_embeddings = self._load_embeddings_from_cache() - assert query_embeddings is not None and document_embeddings is not None + if query_embeddings is None or document_embeddings is None: + raise RuntimeError("Embedding cache was marked reusable but failed validation during load.") logger.info( f"Loaded embeddings from cache: queries={query_embeddings.shape}, " f"documents={document_embeddings.shape}" @@ -914,6 +1074,7 @@ def _generate_embeddings(self) -> Tuple[np.ndarray, np.ndarray]: if self.dist_env.is_main: logger.info("Cache not found or incomplete, generating embeddings...") + self._reuse_partial_embedding_cache = False # Generate query embeddings (shard across ranks if distributed) query_embeddings = None @@ -1013,6 +1174,12 @@ def _mine_hard_negatives( - neg_scores: Similarity scores for each hard negative - pos_scores: Similarity scores for each positive document """ + if query_embeddings.ndim != 2 or document_embeddings.ndim != 2: + raise ValueError( + "Hard negative mining supports 2D single-vector embeddings only. " + "Token-level embeddings such as ColBERT pooling are not supported by this miner." 
+ ) + # Convert document embeddings to tensor once (encoder embeddings are 2D) device = self.dist_env.device doc_embeddings_tensor = torch.tensor(document_embeddings, device=device) @@ -1035,6 +1202,7 @@ def _mine_hard_negatives( # Compute similarity scores: [batch_size, num_docs] batch_query_tensor = torch.tensor(batch_query_embs, device=device) batch_scores = batch_query_tensor @ doc_embeddings_tensor.T + batch_scores[~torch.isfinite(batch_scores)] = float("-inf") # Extract positive scores and mask positives min_pos_scores = [] # Minimum positive score per query (for margin filtering) diff --git a/tests/unit_tests/_transformers/test_retrieval.py b/tests/unit_tests/_transformers/test_retrieval.py index 9dfd8a556b..17ff8e0f5f 100644 --- a/tests/unit_tests/_transformers/test_retrieval.py +++ b/tests/unit_tests/_transformers/test_retrieval.py @@ -243,6 +243,54 @@ def test_load_encoder_config_merges_v5_retrieval_metadata(tmp_path): assert config.nemo_retrieval == {"task": "embedding", "pooling": "last", "l2_normalize": False} +def test_load_encoder_config_merges_hub_v5_retrieval_metadata(monkeypatch, tmp_path): + """Hub exports can recover AutoModel metadata from config.v5.json sidecars.""" + from nemo_automodel._transformers import retrieval + + v5_config_path = tmp_path / "config.v5.json" + v5_config_path.write_text( + json.dumps( + { + "model_type": "bert", + "nemo_retrieval": {"task": "embedding", "pooling": "cls", "l2_normalize": True}, + } + ) + ) + captured = {} + + class FakeConfig: + model_type = "bert" + + def fake_config_from_pretrained(model_name_or_path, **kwargs): + captured["config"] = {"model_name_or_path": model_name_or_path, "kwargs": kwargs} + return FakeConfig() + + def fake_hf_hub_download(repo_id, filename, **kwargs): + captured["hub"] = {"repo_id": repo_id, "filename": filename, "kwargs": kwargs} + return str(v5_config_path) + + monkeypatch.setattr(retrieval.AutoConfig, "from_pretrained", fake_config_from_pretrained) + 
monkeypatch.setattr(retrieval, "hf_hub_download", fake_hf_hub_download) + + config = retrieval._load_encoder_config( + "nvidia/example-retriever", + trust_remote_code=True, + revision="main", + token="token", + torch_dtype="auto", + ) + + assert captured["config"]["kwargs"]["trust_remote_code"] is True + assert captured["config"]["kwargs"]["revision"] == "main" + assert "torch_dtype" not in captured["config"]["kwargs"] + assert captured["hub"] == { + "repo_id": "nvidia/example-retriever", + "filename": "config.v5.json", + "kwargs": {"revision": "main", "token": "token"}, + } + assert config.nemo_retrieval == {"task": "embedding", "pooling": "cls", "l2_normalize": True} + + def test_nemo_auto_biencoder_defaults_do_not_override_saved_metadata(monkeypatch): """The public AutoModel entry point should defer pooling/l2 defaults to the saved config.""" from nemo_automodel._transformers import auto_model diff --git a/tests/unit_tests/recipes/test_mine_hard_negatives.py b/tests/unit_tests/recipes/test_mine_hard_negatives.py index 12f8e3ace2..1ae4596c1c 100644 --- a/tests/unit_tests/recipes/test_mine_hard_negatives.py +++ b/tests/unit_tests/recipes/test_mine_hard_negatives.py @@ -15,6 +15,7 @@ """Unit tests for MineHardNegativesRecipe — attn_implementation support.""" import json +from types import SimpleNamespace from unittest.mock import MagicMock, patch import numpy as np @@ -22,7 +23,12 @@ import torch from nemo_automodel.components.config.loader import ConfigNode -from nemo_automodel.recipes.retrieval.mine_hard_negatives import MINING_DEFAULTS, MineHardNegativesRecipe +from nemo_automodel.recipes.retrieval.mine_hard_negatives import ( + DOCUMENT_EMBEDDINGS_FNAME, + MINING_DEFAULTS, + QUERY_EMBEDDINGS_FNAME, + MineHardNegativesRecipe, +) # --------------------------------------------------------------------------- # Helpers @@ -50,6 +56,30 @@ def _make_recipe(mining_overrides=None): return recipe +def _make_cache_ready_recipe(tmp_path): + """Create a recipe with enough 
state for embedding cache validation.""" + recipe = _make_recipe({"cache_embeddings_dir": str(tmp_path), "load_embeddings_from_cache": True}) + recipe.cache_embeddings_dir = str(tmp_path) + recipe.load_embeddings_from_cache = True + recipe.model_name_or_path = "/fake/model" + recipe.tokenizer_name_or_path = "/fake/tokenizer" + recipe.train_qa_file_path = "/fake/input.json" + recipe.corpus_path = "/fake/corpus" + recipe.query_prefix = "query: " + recipe.passage_prefix = "passage: " + recipe.query_max_length = 16 + recipe.passage_max_length = 32 + recipe.add_bos_token = None + recipe.add_eos_token = False + recipe.attn_implementation = None + recipe.questions = ["what is nvlink?"] + recipe.question_ids = ["q0"] + recipe.idx_to_doc = {0: "d0"} + recipe.dist_env = SimpleNamespace(world_size=1, rank=0, is_main=True, device=torch.device("cpu")) + recipe.model = SimpleNamespace(pooling="avg", l2_normalize=True) + return recipe + + def _run_setup_and_capture_from_pretrained(mining_overrides=None): """Run recipe.setup() with only the truly heavy pieces stubbed out. 
@@ -237,6 +267,78 @@ def test_load_cached_chunk_ignored_when_cache_loading_disabled(tmp_path): assert recipe._load_cached_chunk(cache_path) is None +def test_full_embeddings_cache_requires_matching_metadata(tmp_path): + recipe = _make_cache_ready_recipe(tmp_path) + query_embeddings = np.ones((1, 2), dtype=np.float32) + document_embeddings = np.ones((1, 2), dtype=np.float32) + + recipe._save_embeddings_to_cache(query_embeddings, document_embeddings) + + assert recipe._has_full_embeddings_cache() + cached_query_embeddings, cached_document_embeddings = recipe._load_embeddings_from_cache() + np.testing.assert_array_equal(cached_query_embeddings, query_embeddings) + np.testing.assert_array_equal(cached_document_embeddings, document_embeddings) + + recipe.query_prefix = "different query: " + assert not recipe._has_full_embeddings_cache() + assert recipe._load_embeddings_from_cache() == (None, None) + + +def test_full_embeddings_cache_rejects_shape_mismatch(tmp_path): + recipe = _make_cache_ready_recipe(tmp_path) + query_embeddings = np.ones((1, 2), dtype=np.float32) + document_embeddings = np.ones((1, 2), dtype=np.float32) + recipe._save_embeddings_to_cache(query_embeddings, document_embeddings) + np.savez(tmp_path / QUERY_EMBEDDINGS_FNAME, np.ones((2, 2), dtype=np.float32)) + + assert not recipe._has_full_embeddings_cache() + assert recipe._load_embeddings_from_cache() == (None, None) + + +def test_encode_queries_sharded_handles_empty_rank_shard(tmp_path): + recipe = _make_recipe({"cache_embeddings_dir": str(tmp_path), "load_embeddings_from_cache": False}) + recipe.cache_embeddings_dir = str(tmp_path) + recipe.load_embeddings_from_cache = False + recipe.query_embedding_batch_size = 2 + recipe.query_max_length = 16 + recipe.query_prefix = "query: " + recipe.questions = ["what is nvlink?"] + recipe.dist_env = SimpleNamespace(world_size=2, rank=0, is_main=True, device=torch.device("cpu")) + recipe._encode_texts = lambda **_: np.ones((1, 2), dtype=np.float32) + + def 
write_empty_peer_shard(): + shard_dir = tmp_path / "query_shards" + np.savez(shard_dir / "queries_rank0001.npz", np.empty((0, 0), dtype=np.float32)) + + recipe._synchronize_ranks = write_empty_peer_shard + + query_embeddings = recipe._encode_queries_sharded() + + assert query_embeddings.shape == (1, 2) + + +def test_encode_chunk_distributed_handles_empty_rank_shard(tmp_path): + recipe = _make_recipe({"cache_embeddings_dir": str(tmp_path), "load_embeddings_from_cache": False}) + recipe.cache_embeddings_dir = str(tmp_path) + recipe.load_embeddings_from_cache = False + recipe.document_embedding_batch_size = 2 + recipe.passage_max_length = 16 + recipe.passage_prefix = "passage: " + recipe.dist_env = SimpleNamespace(world_size=2, rank=0, is_main=True, device=torch.device("cpu")) + recipe._encode_texts = lambda **_: np.ones((1, 2), dtype=np.float32) + cache_path = tmp_path / "chunk_0000.npz" + + def write_empty_peer_shard(): + np.savez(tmp_path / "chunk_0000_rank0001.npz", np.empty((0, 0), dtype=np.float32)) + + recipe._synchronize_ranks = write_empty_peer_shard + + document_embeddings = recipe._encode_chunk_distributed(["NVLink is fast."], cache_path) + + assert document_embeddings.shape == (1, 2) + assert (tmp_path / DOCUMENT_EMBEDDINGS_FNAME).exists() is False + + def test_mine_hard_negatives_drops_margin_filtered_candidates(): recipe = _make_recipe() recipe.dist_env = MagicMock(device=torch.device("cpu")) @@ -263,3 +365,43 @@ def test_mine_hard_negatives_drops_margin_filtered_candidates(): assert neg_indices == [[2]] assert neg_scores[0][0] == pytest.approx(0.2) assert pos_scores[0][0] == pytest.approx(1.0) + + +def test_mine_hard_negatives_drops_raw_non_finite_scores(): + recipe = _make_recipe() + recipe.dist_env = MagicMock(device=torch.device("cpu")) + query_embeddings = np.array([[1.0, 0.0]], dtype=np.float32) + document_embeddings = np.array( + [ + [1.0, 0.0], + [np.nan, 0.0], + [0.2, 0.0], + ], + dtype=np.float32, + ) + + neg_indices, neg_scores, pos_scores = 
recipe._mine_hard_negatives( + query_embeddings=query_embeddings, + document_embeddings=document_embeddings, + pos_doc_indices=[[0]], + batch_size=1, + num_negs=2, + ) + + assert neg_indices == [[2]] + assert neg_scores[0][0] == pytest.approx(0.2) + assert pos_scores[0][0] == pytest.approx(1.0) + + +def test_mine_hard_negatives_rejects_token_level_embeddings(): + recipe = _make_recipe() + recipe.dist_env = MagicMock(device=torch.device("cpu")) + + with pytest.raises(ValueError, match="2D single-vector embeddings"): + recipe._mine_hard_negatives( + query_embeddings=np.ones((1, 2, 3), dtype=np.float32), + document_embeddings=np.ones((1, 2, 3), dtype=np.float32), + pos_doc_indices=[[0]], + batch_size=1, + num_negs=1, + ) From 1066da654fa75647b66598f52666d7b76151bab9 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 20:34:41 +0100 Subject: [PATCH 21/25] fix(retrieval): harden cache identity and mining handoff Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/llm/retrieval-finetuning.md | 34 ++++++-- .../data_utils/audit_mined_negatives.py | 22 ++++- .../materialize_hf_retrieval_subset.py | 77 ++++++++++++++++++ .../retrieval/data_utils/mining_config.yaml | 5 +- nemo_automodel/_transformers/retrieval.py | 23 ++++-- .../recipes/retrieval/mine_hard_negatives.py | 81 ++++++++++++++++--- .../_transformers/test_retrieval.py | 39 +++++++++ .../llm/test_audit_mined_negatives.py | 58 +++++++++++++ .../test_materialize_hf_retrieval_subset.py | 68 ++++++++++++++++ .../recipes/test_mine_hard_negatives.py | 69 +++++++++++++++- 10 files changed, 447 insertions(+), 29 deletions(-) create mode 100644 examples/retrieval/data_utils/materialize_hf_retrieval_subset.py create mode 100644 tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 
651ea2e46a..7c75bb7c6e 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -579,6 +579,18 @@ materialize or preprocess your selected HF subset into the corpus ID JSON schema [Retrieval Dataset](retrieval-dataset.md), then set `--mining.train_qa_file_path` to that local JSON file. The mining commands below assume that local corpus-backed input. +For an AutoModel-schema HF subset such as `FEVER`, materialize one corpus-backed mining input with: + +```bash +uv run python examples/retrieval/data_utils/materialize_hf_retrieval_subset.py \ + nvidia/embed-nemotron-dataset-v1 \ + FEVER \ + /path/to/retrieval-data/fever-mining +``` + +This writes `/path/to/retrieval-data/fever-mining/train.json` and a local `FEVER_corpus/` directory. Run the command +once per subset/corpus that you want to mine; the mining helper intentionally processes one corpus per run. + Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: - `hard_negatives_to_mine`: target number of negatives to add per query. The miner can return fewer when the corpus has @@ -601,10 +613,11 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: assembles the final embedding cache and score outputs, so plan memory and local disk accordingly. In multi-node mining, this must be a shared writable path mounted at the same location on every node; node-local cache paths leave rank `0` unable to read remote-rank shards. Use a fresh cache directory for each model, dataset, prefix, sequence - length, and world-size combination. The miner validates a cache fingerprint before reuse, but fresh run-specific - paths are still easier to reason about. Set `load_embeddings_from_cache: true` only when you intentionally want to - reuse every cached query shard, corpus chunk, and consolidated embedding file from the same - model/input/prefix/length/world-size run. + length, and world-size combination. 
The miner validates a cache fingerprint that includes the mining input file, + local model/tokenizer path state, ordered document IDs/content, and embedding settings before reusing consolidated + caches. Fresh run-specific paths are still easier to reason about, especially for mutable Hub IDs or paths that are + overwritten in place. Set `load_embeddings_from_cache: true` only when you intentionally want to reuse every cached + query shard, corpus chunk, and consolidated embedding file from the same model/input/prefix/length/world-size run. `pooling` and `l2_normalize` are saved bi-encoder wrapper metadata, not `mining.*` config fields. Do not pass `--mining.pooling` or `--mining.l2_normalize`; the miner rejects unknown mining keys. Mine from a saved bi-encoder export @@ -635,11 +648,14 @@ Run the audit utility before reusing mined output: ```bash uv run python examples/retrieval/data_utils/audit_mined_negatives.py \ /path/to/mined.json \ + --min-negatives 1 \ --allow-findings ``` `--allow-findings` keeps this first inspection command from failing the shell when it finds issues. Omit it in CI or -quality gates when findings should fail the job. +quality gates when findings should fail the job. `--min-negatives 1` flags rows that would fail or become degenerate +when retraining with `n_passages > 1`; increase it if your next training config requires more distinct negatives before +oversampling. If the report only contains issues that you want to drop automatically, write a cleaned copy: @@ -647,13 +663,15 @@ If the report only contains issues that you want to drop automatically, write a uv run python examples/retrieval/data_utils/audit_mined_negatives.py \ /path/to/mined.json \ --drop-invalid-negatives \ + --min-negatives 1 \ --output /path/to/mined_audited.json ``` With `--drop-invalid-negatives --output`, the command exits successfully when the cleaned output has no remaining audit -findings. 
The audit flags and drops negatives whose IDs also appear in the row's `pos_doc`, duplicate negative IDs in the -same row, missing negative scores, and non-finite negative scores. The cleaned output preserves query lineage fields such -as `original_question_id`, so unrolled examples remain traceable to their source question. +findings and still satisfies `--min-negatives` if you set it. The audit flags and drops negatives whose IDs also appear +in the row's `pos_doc`, duplicate negative IDs in the same row, missing negative scores, and non-finite negative scores. +The cleaned output preserves query lineage fields such as `original_question_id`, so unrolled examples remain traceable +to their source question. ## Save, Resume, and Use the Checkpoint diff --git a/examples/retrieval/data_utils/audit_mined_negatives.py b/examples/retrieval/data_utils/audit_mined_negatives.py index 61c8c2fc36..0f4d8e24a0 100644 --- a/examples/retrieval/data_utils/audit_mined_negatives.py +++ b/examples/retrieval/data_utils/audit_mined_negatives.py @@ -45,6 +45,7 @@ def audit_records( *, drop_invalid_negatives: bool = False, max_findings: int = 20, + min_negatives: int = 0, ) -> tuple[dict[str, int], list[dict[str, Any]], list[dict[str, Any]]]: """Audit retrieval records and optionally drop invalid mined negatives. 
@@ -65,6 +66,7 @@ def audit_records( "duplicate_negative": 0, "missing_negative_score": 0, "non_finite_negative_score": 0, + "rows_with_too_few_negatives": 0, "dropped_negatives": 0, "total_findings": 0, } @@ -112,6 +114,11 @@ def audit_records( continue cleaned_negatives.append(neg_doc) + if len(cleaned_negatives) < min_negatives: + summary["rows_with_too_few_negatives"] += 1 + row_has_findings = True + _append_finding(findings, max_findings, row_idx, record, None, "too_few_negatives") + if row_has_findings: summary["rows_with_findings"] += 1 cleaned_record = dict(record) @@ -123,6 +130,7 @@ def audit_records( + summary["duplicate_negative"] + summary["missing_negative_score"] + summary["non_finite_negative_score"] + + summary["rows_with_too_few_negatives"] ) return summary, cleaned_records, findings @@ -132,7 +140,7 @@ def _append_finding( max_findings: int, row_idx: int, record: dict[str, Any], - neg_id: str, + neg_id: str | None, issue: str, ) -> None: """Append a compact finding example up to the configured limit.""" @@ -154,6 +162,7 @@ def audit_training_data( *, drop_invalid_negatives: bool = False, max_findings: int = 20, + min_negatives: int = 0, ) -> tuple[dict[str, int], dict[str, Any], list[dict[str, Any]]]: """Audit a top-level retrieval JSON object.""" records = training_data.get("data", []) @@ -161,6 +170,7 @@ def audit_training_data( records, drop_invalid_negatives=drop_invalid_negatives, max_findings=max_findings, + min_negatives=min_negatives, ) cleaned_training_data = dict(training_data) cleaned_training_data["data"] = cleaned_records @@ -187,12 +197,20 @@ def main() -> int: ), ) parser.add_argument("--max-findings", type=int, default=20, help="Maximum finding examples to print") + parser.add_argument( + "--min-negatives", + type=int, + default=0, + help="Require at least this many negatives per row after optional cleanup.", + ) parser.add_argument( "--allow-findings", action="store_true", help="Exit with status 0 even when the audit reports 
findings", ) args = parser.parse_args() + if args.min_negatives < 0: + parser.error("--min-negatives must be non-negative") input_path = Path(args.input_file) with open(input_path, "r") as f: @@ -202,6 +220,7 @@ def main() -> int: training_data, drop_invalid_negatives=args.drop_invalid_negatives, max_findings=args.max_findings, + min_negatives=args.min_negatives, ) exit_summary = summary @@ -216,6 +235,7 @@ def main() -> int: cleaned_training_data, drop_invalid_negatives=False, max_findings=args.max_findings, + min_negatives=args.min_negatives, ) payload["remaining_summary"] = remaining_summary payload["remaining_findings"] = remaining_findings diff --git a/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py b/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py new file mode 100644 index 0000000000..af004712fc --- /dev/null +++ b/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py @@ -0,0 +1,77 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Materialize one AutoModel retrieval HF subset as local corpus JSON for mining.""" + +import argparse +import json +import logging +from pathlib import Path + +from datasets import Dataset + +from nemo_automodel.components.datasets.llm.retrieval_dataset import _load_hf_subset + +logger = logging.getLogger(__name__) + + +def main() -> int: + """Run the HF subset materialization CLI.""" + parser = argparse.ArgumentParser( + description="Write one AutoModel-schema hf:// retrieval subset as a local corpus-backed JSON file" + ) + parser.add_argument( + "repo_id", type=str, help="Hugging Face dataset repo, for example nvidia/embed-nemotron-dataset-v1" + ) + parser.add_argument("subset", type=str, help="Subset name, for example FEVER") + parser.add_argument( + "output_dir", type=str, help="Directory where train.json and the corpus directory will be written" + ) + args = parser.parse_args() + + logging.basicConfig(level=logging.INFO) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + corpus_dir = output_dir / f"{args.subset}_corpus" + corpus_dir.mkdir(parents=True, exist_ok=True) + + data_list, corpus_info = _load_hf_subset(args.repo_id, args.subset) + doc_rows = [] + for doc_id in corpus_info.get_all_ids(): + document = corpus_info.get_document_by_id(doc_id) + doc_rows.append({"id": str(doc_id), "text": document.get("text", "")}) + + Dataset.from_list(doc_rows).to_parquet(str(corpus_dir / "train.parquet")) + metadata = { + **corpus_info.metadata, + "class": "TextQADataset", + "corpus_id": corpus_info.corpus_id, + } + with open(corpus_dir / "merlin_metadata.json", "w") as f: + json.dump(metadata, f, indent=2, sort_keys=True) + + train_json = { + "corpus": [{"path": f"./{corpus_dir.name}"}], + "data": data_list, + } + with open(output_dir / "train.json", "w") as f: + json.dump(train_json, f, indent=2) + + logger.info("Wrote %s records and %s corpus documents to %s", len(data_list), len(doc_rows), output_dir) + return 0 + + +if 
__name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/retrieval/data_utils/mining_config.yaml b/examples/retrieval/data_utils/mining_config.yaml index e743638925..97048b1301 100644 --- a/examples/retrieval/data_utils/mining_config.yaml +++ b/examples/retrieval/data_utils/mining_config.yaml @@ -65,7 +65,8 @@ mining: # Caching (required for multi-rank torchrun; use a shared writable path for multi-node). # Use a fresh directory per model, dataset, prefix, sequence-length, and world-size combination. - # The miner validates cache metadata before reuse, but fresh run-specific paths are still clearest. + # The miner validates cache metadata before reuse, but fresh run-specific paths are still clearest, + # especially for mutable Hub IDs or local paths that are overwritten in place. # cache_embeddings_dir: /path/to/cache/ # Set true only when every cached shard/chunk came from the same model/input/prefix/length/world-size run. load_embeddings_from_cache: false @@ -97,5 +98,5 @@ mining: # Whether to include negatives from the input file. 
Existing negatives are prepended to the mined output; # deduplicate/audit the final file before training: - # uv run python examples/retrieval/data_utils/audit_mined_negatives.py /path/to/output.json --allow-findings + # uv run python examples/retrieval/data_utils/audit_mined_negatives.py /path/to/output.json --min-negatives 1 --allow-findings use_negatives_from_file: false diff --git a/nemo_automodel/_transformers/retrieval.py b/nemo_automodel/_transformers/retrieval.py index 95d51fcb9a..3618124f0b 100644 --- a/nemo_automodel/_transformers/retrieval.py +++ b/nemo_automodel/_transformers/retrieval.py @@ -24,6 +24,7 @@ import torch.nn as nn import torch.nn.functional as F from huggingface_hub import hf_hub_download +from huggingface_hub.utils import EntryNotFoundError from transformers import AutoConfig, AutoModel, AutoModelForSequenceClassification, PreTrainedModel from transformers.models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING from transformers.utils import logging @@ -83,10 +84,15 @@ def _load_encoder_config(model_name_or_path: str, trust_remote_code: bool = Fals **_get_auto_config_load_kwargs(hf_kwargs), ) model_path = Path(model_id) - v5_config_path = model_path / "config.v5.json" - if v5_config_path.exists(): - _load_v5_retrieval_metadata(config, v5_config_path) - return config + subfolder = hf_kwargs.get("subfolder") + local_v5_config_paths = [] + if subfolder is not None: + local_v5_config_paths.append(model_path / str(subfolder) / "config.v5.json") + local_v5_config_paths.append(model_path / "config.v5.json") + for v5_config_path in local_v5_config_paths: + if v5_config_path.exists(): + _load_v5_retrieval_metadata(config, v5_config_path) + return config # Local directory exports write config.v5.json beside config.json. Hub exports need # an explicit sidecar download because AutoConfig only loads config.json. 
@@ -98,7 +104,14 @@ def _load_encoder_config(model_name_or_path: str, trust_remote_code: bool = Fals filename="config.v5.json", **_get_hf_hub_download_kwargs(hf_kwargs), ) - except Exception: + except EntryNotFoundError: + return config + except Exception as exc: + logger.warning( + "Unable to load config.v5.json for %s; falling back to config.json retrieval metadata/defaults. Error: %s", + model_id, + exc, + ) return config _load_v5_retrieval_metadata(config, hub_v5_config_path) diff --git a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py index f691b7d2b9..a4e9ea82a9 100644 --- a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py +++ b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py @@ -193,6 +193,7 @@ def __init__(self, cfg): # Data (populated in setup) self.questions_dataset = None self.documents_dataset = None + self.corpus_id = None self.corpus_path = None self.doc_to_idx = None self.idx_to_doc = None @@ -417,11 +418,12 @@ def _load_data(self): f"Corpus paths: {list(corpus_dict.keys())}" ) - self.corpus_path = list(corpus_dict.keys())[0] - self.documents_dataset = corpus_dict[self.corpus_path] + self.corpus_id = list(corpus_dict.keys())[0] + self.documents_dataset = corpus_dict[self.corpus_id] + self.corpus_path = getattr(self.documents_dataset, "path", self.corpus_id) self.questions_dataset = dataset - logger.info(f"Loaded {len(dataset)} questions from corpus: {self.corpus_path}") + logger.info(f"Loaded {len(dataset)} questions from corpus: {self.corpus_id} ({self.corpus_path})") def _build_document_mappings(self): """Build bidirectional mappings between document IDs and indices. 
@@ -838,12 +840,61 @@ def _hash_cache_values(self, values: Optional[List[Any]]) -> str: hasher.update(b"\0") return hasher.hexdigest() + def _hash_file_contents(self, path_value: Optional[str]) -> Optional[str]: + """Hash the contents of a local file when it is available.""" + if path_value is None: + return None + path = Path(str(path_value)) + if not path.is_file(): + return None + hasher = hashlib.sha256() + with open(path, "rb") as f: + for chunk in iter(lambda: f.read(1024 * 1024), b""): + hasher.update(chunk) + return hasher.hexdigest() + + def _hash_local_path_state(self, path_value: Optional[str]) -> Optional[str]: + """Hash local file names, sizes, and mtimes for mutable model/tokenizer paths.""" + if path_value is None: + return None + path = Path(str(path_value)) + if not path.exists(): + return None + files = [path] if path.is_file() else sorted(file_path for file_path in path.rglob("*") if file_path.is_file()) + root = path if path.is_dir() else path.parent + hasher = hashlib.sha256() + hasher.update(str(path.resolve()).encode("utf-8")) + for file_path in files: + try: + stat = file_path.stat() + except OSError: + continue + relative_path = file_path.relative_to(root) + hasher.update(str(relative_path).encode("utf-8")) + hasher.update(str(stat.st_size).encode("utf-8")) + hasher.update(str(stat.st_mtime_ns).encode("utf-8")) + return hasher.hexdigest() + def _get_ordered_doc_ids(self) -> List[Any]: """Return document IDs in embedding row order.""" if isinstance(self.idx_to_doc, dict): return [self.idx_to_doc[idx] for idx in sorted(self.idx_to_doc)] return list(self.idx_to_doc or []) + def _hash_ordered_document_contents(self) -> Optional[str]: + """Hash ordered corpus document IDs and text used for passage embeddings.""" + if self.documents_dataset is None: + return None + hasher = hashlib.sha256() + for doc_id in self._get_ordered_doc_ids(): + document = self.documents_dataset.get_document_by_id(doc_id) + hasher.update(str(doc_id).encode("utf-8")) + 
hasher.update(b"\0") + serialized_document = json.dumps(document, sort_keys=True, default=str, separators=(",", ":")) + hasher.update(serialized_document.encode("utf-8")) + hasher.update(b"\0") + return hasher.hexdigest() + def _get_model_pooling(self): """Return pooling metadata from the live model or cached model metadata.""" if hasattr(self, "_model_pooling"): @@ -866,7 +917,12 @@ def _build_cache_fingerprint(self) -> Dict[str, Any]: "model_name_or_path": str(self.model_name_or_path), "tokenizer_name_or_path": str(self.tokenizer_name_or_path), "train_qa_file_path": str(self.train_qa_file_path), + "train_qa_file_hash": self._hash_file_contents(self.train_qa_file_path), + "model_path_state_hash": self._hash_local_path_state(self.model_name_or_path), + "tokenizer_path_state_hash": self._hash_local_path_state(self.tokenizer_name_or_path), + "corpus_id": str(self.corpus_id), "corpus_path": str(self.corpus_path), + "corpus_path_state_hash": self._hash_local_path_state(self.corpus_path), "query_prefix": self.query_prefix, "passage_prefix": self.passage_prefix, "query_max_length": self.query_max_length, @@ -882,6 +938,7 @@ def _build_cache_fingerprint(self) -> Dict[str, Any]: "question_ids_hash": self._hash_cache_values(self.question_ids), "questions_hash": self._hash_cache_values(self.questions), "document_ids_hash": self._hash_cache_values(self._get_ordered_doc_ids()), + "document_content_hash": self._hash_ordered_document_contents(), } def _cache_metadata_path(self) -> Optional[Path]: @@ -1000,13 +1057,7 @@ def _has_full_embeddings_cache(self) -> bool: doc_path = cache_dir / DOCUMENT_EMBEDDINGS_FNAME if not query_path.exists() or not doc_path.exists(): return False - try: - query_shape = _load_npz_array(query_path).shape - doc_shape = _load_npz_array(doc_path).shape - except Exception: - logger.warning("Ignoring unreadable embedding cache under %s", cache_dir) - return False - return self._cache_metadata_matches(self._load_cache_metadata(), query_shape, doc_shape) + 
return self._cache_metadata_matches(self._load_cache_metadata()) def _save_embeddings_to_cache( self, @@ -1206,6 +1257,7 @@ def _mine_hard_negatives( # Extract positive scores and mask positives min_pos_scores = [] # Minimum positive score per query (for margin filtering) + apply_margin = [] batch_pos_scores = [] for i, pos_indices in enumerate(batch_pos_indices): @@ -1224,10 +1276,13 @@ def _mine_hard_negatives( batch_pos_scores.append(query_pos_scores) # Track minimum positive score for margin filtering - if query_pos_scores: - min_pos_scores.append(min(scores_by_idx.values())) + finite_pos_scores = [score for score in scores_by_idx.values() if math.isfinite(score)] + if finite_pos_scores: + min_pos_scores.append(min(finite_pos_scores)) + apply_margin.append(True) else: min_pos_scores.append(0.0) # Handle edge case of no positives + apply_margin.append(False) # Vectorized margin filtering if hard_neg_margin is not None: @@ -1242,6 +1297,8 @@ def _mine_hard_negatives( if threshold is not None: downscore_mask = batch_scores > threshold + apply_margin_tensor = torch.tensor(apply_margin, dtype=torch.bool, device=device).unsqueeze(1) + downscore_mask = downscore_mask & apply_margin_tensor batch_scores[downscore_mask] = float("-inf") # Batch-level top-k selection diff --git a/tests/unit_tests/_transformers/test_retrieval.py b/tests/unit_tests/_transformers/test_retrieval.py index 17ff8e0f5f..9116cc8fd7 100644 --- a/tests/unit_tests/_transformers/test_retrieval.py +++ b/tests/unit_tests/_transformers/test_retrieval.py @@ -15,6 +15,7 @@ """Functional tests for retrieval backbone extraction.""" import json +import logging import pytest import torch @@ -243,6 +244,27 @@ def test_load_encoder_config_merges_v5_retrieval_metadata(tmp_path): assert config.nemo_retrieval == {"task": "embedding", "pooling": "last", "l2_normalize": False} +def test_load_encoder_config_merges_local_subfolder_v5_retrieval_metadata(tmp_path): + """Local exports with a subfolder keep metadata beside 
the loaded config.""" + from nemo_automodel._transformers.retrieval import _load_encoder_config + + model_subfolder = tmp_path / "model" + model_subfolder.mkdir() + (model_subfolder / "config.json").write_text(json.dumps({"model_type": "bert"})) + (model_subfolder / "config.v5.json").write_text( + json.dumps( + { + "model_type": "bert", + "nemo_retrieval": {"task": "embedding", "pooling": "weighted_avg", "l2_normalize": False}, + } + ) + ) + + config = _load_encoder_config(str(tmp_path), subfolder="model") + + assert config.nemo_retrieval == {"task": "embedding", "pooling": "weighted_avg", "l2_normalize": False} + + def test_load_encoder_config_merges_hub_v5_retrieval_metadata(monkeypatch, tmp_path): """Hub exports can recover AutoModel metadata from config.v5.json sidecars.""" from nemo_automodel._transformers import retrieval @@ -291,6 +313,23 @@ def fake_hf_hub_download(repo_id, filename, **kwargs): assert config.nemo_retrieval == {"task": "embedding", "pooling": "cls", "l2_normalize": True} +def test_load_encoder_config_warns_on_unexpected_hub_sidecar_failure(monkeypatch, caplog): + """Transient Hub/cache failures should not silently fall back to defaults.""" + from nemo_automodel._transformers import retrieval + + class FakeConfig: + model_type = "bert" + + monkeypatch.setattr(retrieval.AutoConfig, "from_pretrained", lambda *_, **__: FakeConfig()) + monkeypatch.setattr(retrieval, "hf_hub_download", lambda *_, **__: (_ for _ in ()).throw(RuntimeError("boom"))) + + caplog.set_level(logging.WARNING, logger="nemo_automodel._transformers.retrieval") + config = retrieval._load_encoder_config("nvidia/example-retriever") + + assert not hasattr(config, "nemo_retrieval") + assert "Unable to load config.v5.json" in caplog.text + + def test_nemo_auto_biencoder_defaults_do_not_override_saved_metadata(monkeypatch): """The public AutoModel entry point should defer pooling/l2 defaults to the saved config.""" from nemo_automodel._transformers import auto_model diff --git 
a/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py b/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py index cea41cdfdf..1bb87f43eb 100644 --- a/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py +++ b/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py @@ -50,6 +50,7 @@ def test_audit_mined_negatives_reports_and_cleans_invalid_rows(): "duplicate_negative": 1, "missing_negative_score": 1, "non_finite_negative_score": 1, + "rows_with_too_few_negatives": 0, "dropped_negatives": 4, "total_findings": 4, } @@ -146,3 +147,60 @@ def test_audit_cli_exits_nonzero_when_findings_remain(tmp_path, monkeypatch): monkeypatch.setattr(sys, "argv", ["audit_mined_negatives.py", str(input_file)]) assert audit_main() == 1 + + +def test_audit_cli_allow_findings_exits_zero(tmp_path, monkeypatch): + input_file = tmp_path / "mined.json" + input_file.write_text( + json.dumps( + { + "data": [ + { + "question_id": "q0", + "pos_doc": [{"id": "1"}], + "neg_doc": [{"id": "1", "score": 1.0}], + } + ] + } + ) + ) + monkeypatch.setattr(sys, "argv", ["audit_mined_negatives.py", str(input_file), "--allow-findings"]) + + assert audit_main() == 0 + + +def test_audit_cli_min_negatives_catches_rows_cleaned_to_empty(tmp_path, monkeypatch, capsys): + input_file = tmp_path / "mined.json" + output_file = tmp_path / "cleaned.json" + input_file.write_text( + json.dumps( + { + "data": [ + { + "question_id": "q0", + "pos_doc": [{"id": "1"}], + "neg_doc": [{"id": "1", "score": 1.0}], + } + ] + } + ) + ) + monkeypatch.setattr( + sys, + "argv", + [ + "audit_mined_negatives.py", + str(input_file), + "--drop-invalid-negatives", + "--output", + str(output_file), + "--min-negatives", + "1", + ], + ) + + assert audit_main() == 1 + report = json.loads(capsys.readouterr().out) + + assert report["remaining_summary"]["rows_with_too_few_negatives"] == 1 + assert report["remaining_findings"][0]["issue"] == "too_few_negatives" diff --git 
a/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py b/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py new file mode 100644 index 0000000000..783a11033c --- /dev/null +++ b/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py @@ -0,0 +1,68 @@ +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import sys + +from examples.retrieval.data_utils import materialize_hf_retrieval_subset + + +class _FakeCorpusInfo: + metadata = {"corpus_id": "demo", "query_instruction": "query:"} + corpus_id = "demo" + + def get_all_ids(self): + return ["d0"] + + def get_document_by_id(self, doc_id): + return {"text": f"document {doc_id}"} + + +def test_materialize_hf_retrieval_subset_writes_local_corpus_json(tmp_path, monkeypatch): + data_list = [ + { + "question_id": "q0", + "original_question_id": "q0", + "question": "What is the document?", + "corpus_id": "demo", + "pos_doc": [{"id": "d0"}], + "neg_doc": [{"id": "d1"}], + } + ] + monkeypatch.setattr( + materialize_hf_retrieval_subset, + "_load_hf_subset", + lambda repo_id, subset: (data_list, _FakeCorpusInfo()), + ) + monkeypatch.setattr( + sys, + "argv", + [ + "materialize_hf_retrieval_subset.py", + "nvidia/embed-nemotron-dataset-v1", + "FEVER", + str(tmp_path), + ], + ) + + assert materialize_hf_retrieval_subset.main() == 0 + + train_json = json.loads((tmp_path / "train.json").read_text()) + metadata = 
json.loads((tmp_path / "FEVER_corpus" / "merlin_metadata.json").read_text()) + + assert train_json["corpus"] == [{"path": "./FEVER_corpus"}] + assert train_json["data"] == data_list + assert metadata["class"] == "TextQADataset" + assert metadata["corpus_id"] == "demo" + assert (tmp_path / "FEVER_corpus" / "train.parquet").exists() diff --git a/tests/unit_tests/recipes/test_mine_hard_negatives.py b/tests/unit_tests/recipes/test_mine_hard_negatives.py index 1ae4596c1c..7ecc5703ef 100644 --- a/tests/unit_tests/recipes/test_mine_hard_negatives.py +++ b/tests/unit_tests/recipes/test_mine_hard_negatives.py @@ -42,6 +42,16 @@ } +class _FakeDocumentsDataset: + path = "/fake/corpus" + + def __init__(self, text="NVLink is a high-bandwidth GPU interconnect."): + self.text = text + + def get_document_by_id(self, doc_id): + return {"text": self.text, "image": "", "nr_ocr": ""} + + def _make_recipe(mining_overrides=None): """Create a MineHardNegativesRecipe with a real ConfigNode config. @@ -64,6 +74,7 @@ def _make_cache_ready_recipe(tmp_path): recipe.model_name_or_path = "/fake/model" recipe.tokenizer_name_or_path = "/fake/tokenizer" recipe.train_qa_file_path = "/fake/input.json" + recipe.corpus_id = "demo" recipe.corpus_path = "/fake/corpus" recipe.query_prefix = "query: " recipe.passage_prefix = "passage: " @@ -75,6 +86,7 @@ def _make_cache_ready_recipe(tmp_path): recipe.questions = ["what is nvlink?"] recipe.question_ids = ["q0"] recipe.idx_to_doc = {0: "d0"} + recipe.documents_dataset = _FakeDocumentsDataset() recipe.dist_env = SimpleNamespace(world_size=1, rank=0, is_main=True, device=torch.device("cpu")) recipe.model = SimpleNamespace(pooling="avg", l2_normalize=True) return recipe @@ -202,6 +214,24 @@ def test_setup_with_attn_implementation(attn_impl): assert kwargs["use_sdpa_patching"] is True +def test_setup_rejects_colbert_pooling(): + """The public setup path should reject token-level ColBERT embeddings before mining.""" + cfg = ConfigNode({"mining": 
dict(_BASE_MINING)}) + recipe = MineHardNegativesRecipe(cfg) + mock_model = MagicMock(pooling="colbert") + mock_model.to.return_value = mock_model + + with ( + patch("nemo_automodel.recipes.retrieval.mine_hard_negatives.build_distributed") as mock_dist, + patch("nemo_automodel.recipes.retrieval.mine_hard_negatives.NeMoAutoModelBiEncoder") as mock_auto, + pytest.raises(ValueError, match="ColBERT pooling"), + ): + mock_dist.return_value = MagicMock(device="cpu") + mock_auto.from_pretrained.return_value = mock_model + + recipe.setup() + + def test_write_output_preserves_original_question_id(tmp_path): """Mined outputs should keep query lineage added by unroll_pos_docs.py.""" input_file = tmp_path / "input.json" @@ -284,6 +314,16 @@ def test_full_embeddings_cache_requires_matching_metadata(tmp_path): assert recipe._load_embeddings_from_cache() == (None, None) +def test_full_embeddings_cache_rejects_changed_document_content(tmp_path): + recipe = _make_cache_ready_recipe(tmp_path) + recipe._save_embeddings_to_cache(np.ones((1, 2), dtype=np.float32), np.ones((1, 2), dtype=np.float32)) + + recipe.documents_dataset.text = "The corpus text changed under the same document ID." 
+ + assert not recipe._has_full_embeddings_cache() + assert recipe._load_embeddings_from_cache() == (None, None) + + def test_full_embeddings_cache_rejects_shape_mismatch(tmp_path): recipe = _make_cache_ready_recipe(tmp_path) query_embeddings = np.ones((1, 2), dtype=np.float32) @@ -291,7 +331,7 @@ def test_full_embeddings_cache_rejects_shape_mismatch(tmp_path): recipe._save_embeddings_to_cache(query_embeddings, document_embeddings) np.savez(tmp_path / QUERY_EMBEDDINGS_FNAME, np.ones((2, 2), dtype=np.float32)) - assert not recipe._has_full_embeddings_cache() + assert recipe._has_full_embeddings_cache() assert recipe._load_embeddings_from_cache() == (None, None) @@ -393,6 +433,33 @@ def test_mine_hard_negatives_drops_raw_non_finite_scores(): assert pos_scores[0][0] == pytest.approx(1.0) +def test_mine_hard_negatives_skips_margin_when_positive_score_is_non_finite(): + recipe = _make_recipe() + recipe.dist_env = MagicMock(device=torch.device("cpu")) + query_embeddings = np.array([[1.0, 0.0]], dtype=np.float32) + document_embeddings = np.array( + [ + [np.nan, 0.0], + [0.2, 0.0], + ], + dtype=np.float32, + ) + + neg_indices, neg_scores, pos_scores = recipe._mine_hard_negatives( + query_embeddings=query_embeddings, + document_embeddings=document_embeddings, + pos_doc_indices=[[0]], + batch_size=1, + num_negs=1, + hard_neg_margin=0.95, + hard_neg_margin_type="perc", + ) + + assert neg_indices == [[1]] + assert neg_scores[0][0] == pytest.approx(0.2) + assert pos_scores[0][0] == float("-inf") + + def test_mine_hard_negatives_rejects_token_level_embeddings(): recipe = _make_recipe() recipe.dist_env = MagicMock(device=torch.device("cpu")) From c2985547aa060c6cbc3ca625e54c68cea125a1a9 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 21:00:17 +0100 Subject: [PATCH 22/25] fix(retrieval): recover from stale mining caches Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- 
docs/guides/llm/retrieval-dataset.md | 2 +- docs/guides/llm/retrieval-finetuning.md | 67 ++++++------- .../data_utils/audit_mined_negatives.py | 32 ++++++- .../materialize_hf_retrieval_subset.py | 13 ++- .../retrieval/data_utils/mining_config.yaml | 4 +- .../recipes/retrieval/mine_hard_negatives.py | 96 ++++++++++++++----- .../_transformers/test_retrieval.py | 38 ++++++++ .../llm/test_audit_mined_negatives.py | 36 +++++-- .../test_materialize_hf_retrieval_subset.py | 53 ++++++++++ .../recipes/test_mine_hard_negatives.py | 55 ++++++++++- 10 files changed, 324 insertions(+), 72 deletions(-) diff --git a/docs/guides/llm/retrieval-dataset.md b/docs/guides/llm/retrieval-dataset.md index 39d48ed6e0..c6847dd1f5 100644 --- a/docs/guides/llm/retrieval-dataset.md +++ b/docs/guides/llm/retrieval-dataset.md @@ -50,7 +50,7 @@ records before training: 1. Put every passage in a corpus split with stable `id` and `text` values. 2. For each query, write one or more training records with `question_id`, `question`, `corpus_id`, `pos_doc`, and - `neg_doc`. + `neg_doc`. Use unique `question_id` values within each mining file; hard-negative mining writes results back by ID. 3. For training, use the first relevant document in each record as `pos_doc[0]`; expand multi-positive queries into multiple records if you want every positive to become a supervised positive. 4. For hard-negative mining, include all known positive document IDs for that query in the row's `pos_doc`. The miner diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 7c75bb7c6e..06c364d828 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -526,14 +526,38 @@ memory and want maximum adaptation. ## Mine Hard Negatives -After an initial bi-encoder run, mine harder negatives with the consolidated encoder checkpoint. 
This single-node -example uses `--standalone`: +After an initial bi-encoder run, mine harder negatives with the consolidated encoder checkpoint. Hard-negative mining +expects the corpus ID-based retrieval JSON format described in the dataset guide, not the inline JSONL shortcut. The +input must reference one corpus so the miner can build a passage embedding cache, retrieve candidates, and write mined +negatives back to each query. Each mining row must have a unique `question_id`; duplicate IDs make the output mapping +ambiguous and are rejected. + +The quickstart configs use `hf://` sources for the first train/eval path. The miner currently reads a local +corpus-backed retrieval JSON file instead of `hf://` URIs directly. For a train -> mine -> retrain loop, first +materialize or preprocess your selected HF subset into the corpus ID JSON schema from +[Retrieval Dataset](retrieval-dataset.md), then set `--mining.train_qa_file_path` to that local JSON file. + +For an AutoModel-schema HF subset such as `FEVER`, materialize one corpus-backed mining input with: + +```bash +uv run python examples/retrieval/data_utils/materialize_hf_retrieval_subset.py \ + nvidia/embed-nemotron-dataset-v1 \ + FEVER \ + /path/to/retrieval-data/fever-mining +``` + +This writes `/path/to/retrieval-data/fever-mining/train.json` and a local `FEVER_corpus/` directory. Run the command +once per subset/corpus that you want to mine; the helper intentionally processes one corpus per run and refuses to write +into a non-empty output directory unless you pass `--overwrite`. The mining examples below use that local +`train.json` path. 
+ +This single-node example uses `--standalone`: ```bash uv run torchrun --standalone --nproc_per_node=8 examples/retrieval/data_utils/mine_hard_negatives.py \ --config examples/retrieval/data_utils/mining_config.yaml \ --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_499/model/consolidated \ - --mining.train_qa_file_path /path/to/input.json \ + --mining.train_qa_file_path /path/to/retrieval-data/fever-mining/train.json \ --mining.train_file_output_path /path/to/output.json \ --mining.cache_embeddings_dir /shared/path/to/cache/llama3_2_1b_fever_mine_v1 \ --mining.query_prefix "query: " \ @@ -555,7 +579,7 @@ uv run torchrun \ examples/retrieval/data_utils/mine_hard_negatives.py \ --config examples/retrieval/data_utils/mining_config.yaml \ --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_499/model/consolidated \ - --mining.train_qa_file_path /path/to/input.json \ + --mining.train_qa_file_path /path/to/retrieval-data/fever-mining/train.json \ --mining.train_file_output_path /path/to/output.json \ --mining.cache_embeddings_dir /shared/path/to/cache/llama3_2_1b_fever_mine_v1 \ --mining.query_prefix "query: " \ @@ -569,28 +593,6 @@ Replace `epoch_0_step_499` with the explicit checkpoint directory that you want `LATEST.txt`, read it first and substitute the resolved `epoch_*_step_*` directory; the mining script loads the Hugging Face export directly and does not apply AutoModel's checkpoint resolver. -Hard-negative mining expects the corpus ID-based retrieval JSON format described in the dataset guide, not the inline -JSONL shortcut. The input must reference one corpus so the miner can build a passage embedding cache, retrieve -candidates, and write mined negatives back to each query. - -The quickstart configs use `hf://` sources for the first train/eval path. The miner currently reads a local -corpus-backed retrieval JSON file instead of `hf://` URIs directly. 
For a train -> mine -> retrain loop, first -materialize or preprocess your selected HF subset into the corpus ID JSON schema from -[Retrieval Dataset](retrieval-dataset.md), then set `--mining.train_qa_file_path` to that local JSON file. The mining -commands below assume that local corpus-backed input. - -For an AutoModel-schema HF subset such as `FEVER`, materialize one corpus-backed mining input with: - -```bash -uv run python examples/retrieval/data_utils/materialize_hf_retrieval_subset.py \ - nvidia/embed-nemotron-dataset-v1 \ - FEVER \ - /path/to/retrieval-data/fever-mining -``` - -This writes `/path/to/retrieval-data/fever-mining/train.json` and a local `FEVER_corpus/` directory. Run the command -once per subset/corpus that you want to mine; the mining helper intentionally processes one corpus per run. - Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: - `hard_negatives_to_mine`: target number of negatives to add per query. The miner can return fewer when the corpus has @@ -613,10 +615,10 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: assembles the final embedding cache and score outputs, so plan memory and local disk accordingly. In multi-node mining, this must be a shared writable path mounted at the same location on every node; node-local cache paths leave rank `0` unable to read remote-rank shards. Use a fresh cache directory for each model, dataset, prefix, sequence - length, and world-size combination. The miner validates a cache fingerprint that includes the mining input file, - local model/tokenizer path state, ordered document IDs/content, and embedding settings before reusing consolidated - caches. Fresh run-specific paths are still easier to reason about, especially for mutable Hub IDs or paths that are - overwritten in place. Set `load_embeddings_from_cache: true` only when you intentionally want to reuse every cached + length, and world-size combination. 
The miner validates cache metadata and loads the consolidated arrays to verify + fingerprint, shape, and readability before reusing a consolidated cache. The fingerprint includes the mining input + file, local model/tokenizer path state, ordered document IDs/content, and embedding settings. Fresh run-specific + paths are still easier to reason about, especially for mutable Hub IDs or paths that are overwritten in place. Set `load_embeddings_from_cache: true` only when you intentionally want to reuse every cached query shard, corpus chunk, and consolidated embedding file from the same model/input/prefix/length/world-size run. `pooling` and `l2_normalize` are saved bi-encoder wrapper metadata, not `mining.*` config fields. Do not pass @@ -668,8 +670,9 @@ uv run python examples/retrieval/data_utils/audit_mined_negatives.py \ ``` With `--drop-invalid-negatives --output`, the command exits successfully when the cleaned output has no remaining audit -findings and still satisfies `--min-negatives` if you set it. The audit flags and drops negatives whose IDs also appear -in the row's `pos_doc`, duplicate negative IDs in the same row, missing negative scores, and non-finite negative scores. +findings and still satisfies `--min-negatives` if you set it. The audit flags missing or non-finite positive scores, and +flags and drops negatives whose IDs also appear in the row's `pos_doc`, duplicate negative IDs in the same row, missing +negative scores, and non-finite negative scores. The cleaned output preserves query lineage fields such as `original_question_id`, so unrolled examples remain traceable to their source question. 
diff --git a/examples/retrieval/data_utils/audit_mined_negatives.py b/examples/retrieval/data_utils/audit_mined_negatives.py index 0f4d8e24a0..17af86263f 100644 --- a/examples/retrieval/data_utils/audit_mined_negatives.py +++ b/examples/retrieval/data_utils/audit_mined_negatives.py @@ -30,7 +30,7 @@ def _doc_id(doc: Any) -> str: def _score_state(doc: Any) -> str: - """Classify a mined negative score as finite, missing, or non-finite.""" + """Classify a mined document score as finite, missing, or non-finite.""" if not isinstance(doc, dict) or "score" not in doc: return "missing" try: @@ -54,6 +54,7 @@ def audit_records( drop_invalid_negatives: Drop negatives that duplicate positives, duplicate another negative in the same row, or have a missing/non-finite score. max_findings: Maximum example findings to return. + min_negatives: Minimum number of negatives each row must retain after optional cleanup. Returns: A tuple of ``(summary, cleaned_records, finding_examples)``. @@ -62,6 +63,8 @@ def audit_records( "records": len(records), "negatives": 0, "rows_with_findings": 0, + "missing_positive_score": 0, + "non_finite_positive_score": 0, "negative_is_known_positive": 0, "duplicate_negative": 0, "missing_negative_score": 0, @@ -79,6 +82,22 @@ def audit_records( cleaned_negatives = [] row_has_findings = False + for pos_doc in record.get("pos_doc", []): + pos_id = _doc_id(pos_doc) + score_state = _score_state(pos_doc) + if score_state == "missing": + summary["missing_positive_score"] += 1 + row_has_findings = True + _append_finding( + findings, max_findings, row_idx, record, pos_id, "missing_positive_score", doc_role="positive" + ) + elif score_state == "non_finite": + summary["non_finite_positive_score"] += 1 + row_has_findings = True + _append_finding( + findings, max_findings, row_idx, record, pos_id, "non_finite_positive_score", doc_role="positive" + ) + for neg_doc in record.get("neg_doc", []): summary["negatives"] += 1 neg_id = _doc_id(neg_doc) @@ -126,7 +145,9 @@ def 
audit_records( cleaned_records.append(cleaned_record) summary["total_findings"] = ( - summary["negative_is_known_positive"] + summary["missing_positive_score"] + + summary["non_finite_positive_score"] + + summary["negative_is_known_positive"] + summary["duplicate_negative"] + summary["missing_negative_score"] + summary["non_finite_negative_score"] @@ -140,16 +161,19 @@ def _append_finding( max_findings: int, row_idx: int, record: dict[str, Any], - neg_id: str | None, + doc_id: str | None, issue: str, + *, + doc_role: str = "negative", ) -> None: """Append a compact finding example up to the configured limit.""" if len(findings) >= max_findings: return + id_key = "positive_id" if doc_role == "positive" else "negative_id" finding = { "row": row_idx, "question_id": record.get("question_id"), - "negative_id": neg_id, + id_key: doc_id, "issue": issue, } if "original_question_id" in record: diff --git a/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py b/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py index af004712fc..9cd189254f 100644 --- a/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py +++ b/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py @@ -38,12 +38,23 @@ def main() -> int: parser.add_argument( "output_dir", type=str, help="Directory where train.json and the corpus directory will be written" ) + parser.add_argument( + "--overwrite", + action="store_true", + help="Allow writing into a non-empty output directory.", + ) args = parser.parse_args() logging.basicConfig(level=logging.INFO) output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) + if output_dir.exists(): + if not output_dir.is_dir(): + parser.error(f"output_dir must be a directory: {output_dir}") + if any(output_dir.iterdir()) and not args.overwrite: + parser.error(f"output_dir is not empty: {output_dir}. 
Pass --overwrite to replace existing files.") + else: + output_dir.mkdir(parents=True) corpus_dir = output_dir / f"{args.subset}_corpus" corpus_dir.mkdir(parents=True, exist_ok=True) diff --git a/examples/retrieval/data_utils/mining_config.yaml b/examples/retrieval/data_utils/mining_config.yaml index 97048b1301..935091645e 100644 --- a/examples/retrieval/data_utils/mining_config.yaml +++ b/examples/retrieval/data_utils/mining_config.yaml @@ -65,8 +65,8 @@ mining: # Caching (required for multi-rank torchrun; use a shared writable path for multi-node). # Use a fresh directory per model, dataset, prefix, sequence-length, and world-size combination. - # The miner validates cache metadata before reuse, but fresh run-specific paths are still clearest, - # especially for mutable Hub IDs or local paths that are overwritten in place. + # The miner validates cache metadata plus consolidated array shape/readability before reuse. + # Fresh directories are still clearest for mutable Hub IDs or overwritten local paths. # cache_embeddings_dir: /path/to/cache/ # Set true only when every cached shard/chunk came from the same model/input/prefix/length/world-size run. load_embeddings_from_cache: false diff --git a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py index a4e9ea82a9..385e07b05e 100644 --- a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py +++ b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py @@ -104,8 +104,10 @@ def _load_npz_array(path: Path) -> np.ndarray: Returns: Loaded numpy array. 
""" - cached = np.load(path) - return cached[cached.files[0]] + with np.load(path) as cached: + if not cached.files: + raise ValueError(f"NPZ archive contains no arrays: {path}") + return cached[cached.files[0]] def _compute_rank_partition(total_size: int, world_size: int, rank: int) -> Tuple[int, int]: @@ -475,6 +477,18 @@ def _prepare_data(self): supplied_negative_document_indices.append(neg_indices) assert len(questions) == len(question_ids) + seen_question_ids = set() + duplicate_question_ids = [] + for question_id in question_ids: + if question_id in seen_question_ids: + duplicate_question_ids.append(question_id) + seen_question_ids.add(question_id) + if duplicate_question_ids: + duplicate_examples = ", ".join(str(question_id) for question_id in duplicate_question_ids[:5]) + raise ValueError( + "Mining requires unique question_id values so mined negatives can be written back unambiguously. " + f"Duplicate question_id values include: {duplicate_examples}" + ) self.questions = questions self.question_ids = question_ids @@ -965,6 +979,7 @@ def _cache_metadata_matches( metadata: Optional[Dict[str, Any]], query_shape: Optional[Tuple[int, ...]] = None, document_shape: Optional[Tuple[int, ...]] = None, + expected_fingerprint: Optional[Dict[str, Any]] = None, ) -> bool: """Check whether cache metadata matches the current mining run.""" if metadata is None: @@ -973,7 +988,9 @@ def _cache_metadata_matches( if metadata.get("version") != EMBEDDINGS_CACHE_METADATA_VERSION: logger.info("Embedding cache metadata version changed; recomputing embeddings.") return False - if metadata.get("fingerprint") != self._build_cache_fingerprint(): + if expected_fingerprint is None: + expected_fingerprint = self._build_cache_fingerprint() + if metadata.get("fingerprint") != expected_fingerprint: logger.info("Embedding cache fingerprint does not match this mining run; recomputing embeddings.") return False @@ -1024,7 +1041,10 @@ def _should_load_partial_cache(self, cache_path: 
Optional[Path]) -> bool: and cache_path.exists() ) - def _load_embeddings_from_cache(self) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]: + def _load_embeddings_from_cache( + self, + expected_fingerprint: Optional[Dict[str, Any]] = None, + ) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]: """Load query and document embeddings from cache. Returns: @@ -1040,10 +1060,25 @@ def _load_embeddings_from_cache(self) -> Tuple[Optional[np.ndarray], Optional[np if not query_path.exists() or not doc_path.exists(): return None, None - query_embeddings = _load_npz_array(query_path) - doc_embeddings = _load_npz_array(doc_path) + if expected_fingerprint is None: + expected_fingerprint = self._build_cache_fingerprint() metadata = self._load_cache_metadata() - if not self._cache_metadata_matches(metadata, query_embeddings.shape, doc_embeddings.shape): + if not self._cache_metadata_matches(metadata, expected_fingerprint=expected_fingerprint): + return None, None + + try: + query_embeddings = _load_npz_array(query_path) + doc_embeddings = _load_npz_array(doc_path) + except Exception as exc: + logger.warning("Ignoring unreadable embedding cache in %s: %s", cache_dir, exc) + return None, None + + if not self._cache_metadata_matches( + metadata, + query_embeddings.shape, + doc_embeddings.shape, + expected_fingerprint=expected_fingerprint, + ): return None, None return query_embeddings, doc_embeddings @@ -1057,7 +1092,8 @@ def _has_full_embeddings_cache(self) -> bool: doc_path = cache_dir / DOCUMENT_EMBEDDINGS_FNAME if not query_path.exists() or not doc_path.exists(): return False - return self._cache_metadata_matches(self._load_cache_metadata()) + query_embeddings, doc_embeddings = self._load_embeddings_from_cache() + return query_embeddings is not None and doc_embeddings is not None def _save_embeddings_to_cache( self, @@ -1093,28 +1129,31 @@ def _generate_embeddings(self) -> Tuple[np.ndarray, np.ndarray]: # Try loading from cache first. 
# # In distributed runs, only rank0 needs the consolidated embeddings for mining. - # To avoid redundant IO, rank0 checks for cache presence and broadcasts a cache_hit flag; - # only rank0 reads the cache files. + # To avoid redundant IO, rank0 validates the consolidated cache before broadcasting a hit; + # non-main ranks only proceed with dummy arrays when rank0 has already loaded a valid cache. self._reuse_partial_embedding_cache = self.load_embeddings_from_cache if self.load_embeddings_from_cache and self.cache_embeddings_dir: cache_hit = False + query_embeddings = None + document_embeddings = None + expected_fingerprint = None if self.dist_env.world_size > 1 and torch.distributed.is_initialized(): if self.dist_env.is_main: - cache_hit = self._has_full_embeddings_cache() + expected_fingerprint = self._build_cache_fingerprint() + query_embeddings, document_embeddings = self._load_embeddings_from_cache(expected_fingerprint) + cache_hit = query_embeddings is not None and document_embeddings is not None # Broadcast decision from rank0 to all ranks (NCCL requires CUDA tensors). 
flag_device = self.dist_env.device if self.dist_env.device.type == "cuda" else torch.device("cpu") flag = torch.tensor([1 if cache_hit else 0], dtype=torch.int64, device=flag_device) torch.distributed.broadcast(flag, src=0) cache_hit = bool(flag.item()) else: - cache_hit = self._has_full_embeddings_cache() + expected_fingerprint = self._build_cache_fingerprint() + query_embeddings, document_embeddings = self._load_embeddings_from_cache(expected_fingerprint) + cache_hit = query_embeddings is not None and document_embeddings is not None if cache_hit: if self.dist_env.is_main: - logger.info("Loading embeddings from cache (rank0 only)...") - query_embeddings, document_embeddings = self._load_embeddings_from_cache() - if query_embeddings is None or document_embeddings is None: - raise RuntimeError("Embedding cache was marked reusable but failed validation during load.") logger.info( f"Loaded embeddings from cache: queries={query_embeddings.shape}, " f"documents={document_embeddings.shape}" @@ -1125,7 +1164,13 @@ def _generate_embeddings(self) -> Tuple[np.ndarray, np.ndarray]: if self.dist_env.is_main: logger.info("Cache not found or incomplete, generating embeddings...") - self._reuse_partial_embedding_cache = False + metadata = self._load_cache_metadata() + if expected_fingerprint is None: + expected_fingerprint = self._build_cache_fingerprint() + self._reuse_partial_embedding_cache = self._cache_metadata_matches( + metadata, + expected_fingerprint=expected_fingerprint, + ) # Generate query embeddings (shard across ranks if distributed) query_embeddings = None @@ -1201,7 +1246,7 @@ def _mine_hard_negatives( num_negs: int, hard_neg_margin: Optional[float] = None, hard_neg_margin_type: Optional[str] = None, - ) -> Tuple[List[List[int]], List[List[float]], List[List[float]]]: + ) -> Tuple[List[List[int]], List[List[float]], List[List[float | None]]]: """Mine hard negatives for each query. 
This implementation uses the following key behaviors: @@ -1223,7 +1268,7 @@ def _mine_hard_negatives( Tuple of: - neg_indices: List of hard negative indices per query - neg_scores: Similarity scores for each hard negative - - pos_scores: Similarity scores for each positive document + - pos_scores: Similarity scores for each positive document; None when the raw score is non-finite """ if query_embeddings.ndim != 2 or document_embeddings.ndim != 2: raise ValueError( @@ -1272,7 +1317,10 @@ def _mine_hard_negatives( batch_scores[i, pos_idx] = float("-inf") # Reconstruct scores in original order (preserves pos_doc order for output) - query_pos_scores = [scores_by_idx[pos_idx] for pos_idx in pos_indices] + query_pos_scores = [ + score if math.isfinite(score) else None + for score in (scores_by_idx[pos_idx] for pos_idx in pos_indices) + ] batch_pos_scores.append(query_pos_scores) # Track minimum positive score for margin filtering @@ -1397,10 +1445,10 @@ def _build_negative_docs_by_question_id(self) -> Dict[str, List[Dict[str, Any]]] return negative_docs_by_question_id - def _build_positive_scores_by_question_id(self) -> Dict[str, List[float]]: + def _build_positive_scores_by_question_id(self) -> Dict[str, List[float | None]]: """Build mapping from question_id to positive document scores. - Scores are in the same order as pos_doc in the original data. + Scores are in the same order as pos_doc in the original data. Non-finite scores are stored as None. Returns: Dict mapping question_id to list of positive scores. 
@@ -1444,7 +1492,9 @@ def _write_output(self) -> None: pos_scores = pos_scores_by_qid.get(question_id, []) for j, pos_doc in enumerate(row.get("pos_doc", [])): if j < len(pos_scores): - pos_doc["score"] = pos_scores[j] + score = pos_scores[j] + if score is not None and math.isfinite(float(score)): + pos_doc["score"] = score # Remove legacy score fields if present if "pos_score" in row: diff --git a/tests/unit_tests/_transformers/test_retrieval.py b/tests/unit_tests/_transformers/test_retrieval.py index 9116cc8fd7..3bd22d54a5 100644 --- a/tests/unit_tests/_transformers/test_retrieval.py +++ b/tests/unit_tests/_transformers/test_retrieval.py @@ -352,3 +352,41 @@ def fake_base_from_pretrained(cls, pretrained_model_name_or_path, **kwargs): assert captured["pretrained_model_name_or_path"] == "saved-export" assert captured["kwargs"]["pooling"] is None assert captured["kwargs"]["l2_normalize"] is None + + +def test_biencoder_build_applies_saved_retrieval_metadata(monkeypatch): + """BiEncoderModel.build should use pooling/l2 metadata loaded from the saved config.""" + from nemo_automodel._transformers import retrieval + + class FakeConfig: + name_or_path = "saved-export" + nemo_retrieval = {"task": "embedding", "pooling": "last", "l2_normalize": False} + + class FakeModel: + config = FakeConfig() + + config = FakeConfig() + captured = {} + + def fake_build_encoder_backbone(model_name_or_path, task, **kwargs): + captured["model_name_or_path"] = model_name_or_path + captured["task"] = task + captured["pooling"] = kwargs.get("pooling") + captured["loaded_config"] = kwargs.get("loaded_config") + return FakeModel() + + monkeypatch.setattr(retrieval, "_load_encoder_config", lambda *_, **__: config) + monkeypatch.setattr(retrieval, "build_encoder_backbone", fake_build_encoder_backbone) + + model = retrieval.BiEncoderModel.build("saved-export") + + assert model.pooling == "last" + assert model.l2_normalize is False + assert captured == { + "model_name_or_path": "saved-export", + 
"task": "embedding", + "pooling": "last", + "loaded_config": config, + } + assert model.config.nemo_retrieval["pooling"] == "last" + assert model.config.nemo_retrieval["l2_normalize"] is False diff --git a/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py b/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py index 1bb87f43eb..e11334044a 100644 --- a/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py +++ b/tests/unit_tests/datasets/llm/test_audit_mined_negatives.py @@ -28,7 +28,7 @@ def test_audit_mined_negatives_reports_and_cleans_invalid_rows(): "original_question_id": "q0", "question": "Which document is positive?", "corpus_id": "demo", - "pos_doc": [{"id": 1}], + "pos_doc": [{"id": 1, "score": 0.8}], "neg_doc": [ {"id": "1", "score": 0.9}, {"id": "2", "score": 0.4}, @@ -46,6 +46,8 @@ def test_audit_mined_negatives_reports_and_cleans_invalid_rows(): "records": 1, "negatives": 5, "rows_with_findings": 1, + "missing_positive_score": 0, + "non_finite_positive_score": 0, "negative_is_known_positive": 1, "duplicate_negative": 1, "missing_negative_score": 1, @@ -61,13 +63,33 @@ def test_audit_mined_negatives_reports_and_cleans_invalid_rows(): "original_question_id": "q0", "question": "Which document is positive?", "corpus_id": "demo", - "pos_doc": [{"id": 1}], + "pos_doc": [{"id": 1, "score": 0.8}], "neg_doc": [{"id": "2", "score": 0.4}], } ] assert findings[0]["original_question_id"] == "q0" +def test_audit_mined_negatives_reports_positive_score_findings(): + training_data = { + "data": [ + { + "question_id": "q0", + "pos_doc": [{"id": "1"}, {"id": "2", "score": float("nan")}], + "neg_doc": [{"id": "3", "score": 0.1}], + } + ] + } + + summary, _, findings = audit_training_data(training_data) + + assert summary["missing_positive_score"] == 1 + assert summary["non_finite_positive_score"] == 1 + assert summary["total_findings"] == 2 + assert findings[0]["positive_id"] == "1" + assert findings[1]["positive_id"] == "2" + + def 
test_audit_mined_negatives_preserves_records_without_findings(): training_data = { "corpus": {"path": "/corpus"}, @@ -76,7 +98,7 @@ def test_audit_mined_negatives_preserves_records_without_findings(): "question_id": "q0", "question": "Which document is positive?", "corpus_id": "demo", - "pos_doc": [{"id": "1"}], + "pos_doc": [{"id": "1", "score": 0.8}], "neg_doc": [{"id": "2", "score": 0.2}], } ], @@ -101,7 +123,7 @@ def test_audit_cli_writes_cleaned_output_and_exits_zero(tmp_path, monkeypatch, c "question_id": "q0", "question": "Which document is positive?", "corpus_id": "demo", - "pos_doc": [{"id": "1"}], + "pos_doc": [{"id": "1", "score": 0.8}], "neg_doc": [{"id": "1", "score": 1.0}, {"id": "2"}], } ], @@ -137,7 +159,7 @@ def test_audit_cli_exits_nonzero_when_findings_remain(tmp_path, monkeypatch): "data": [ { "question_id": "q0", - "pos_doc": [{"id": "1"}], + "pos_doc": [{"id": "1", "score": 0.8}], "neg_doc": [{"id": "1", "score": 1.0}], } ] @@ -157,7 +179,7 @@ def test_audit_cli_allow_findings_exits_zero(tmp_path, monkeypatch): "data": [ { "question_id": "q0", - "pos_doc": [{"id": "1"}], + "pos_doc": [{"id": "1", "score": 0.8}], "neg_doc": [{"id": "1", "score": 1.0}], } ] @@ -178,7 +200,7 @@ def test_audit_cli_min_negatives_catches_rows_cleaned_to_empty(tmp_path, monkeyp "data": [ { "question_id": "q0", - "pos_doc": [{"id": "1"}], + "pos_doc": [{"id": "1", "score": 0.8}], "neg_doc": [{"id": "1", "score": 1.0}], } ] diff --git a/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py b/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py index 783a11033c..0c663fe2e5 100644 --- a/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py +++ b/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py @@ -15,6 +15,8 @@ import json import sys +import pytest + from examples.retrieval.data_utils import materialize_hf_retrieval_subset @@ -66,3 +68,54 @@ def 
test_materialize_hf_retrieval_subset_writes_local_corpus_json(tmp_path, monk assert metadata["class"] == "TextQADataset" assert metadata["corpus_id"] == "demo" assert (tmp_path / "FEVER_corpus" / "train.parquet").exists() + + +def test_materialize_hf_retrieval_subset_rejects_non_empty_output_dir(tmp_path, monkeypatch): + (tmp_path / "existing.txt").write_text("keep me") + monkeypatch.setattr( + sys, + "argv", + [ + "materialize_hf_retrieval_subset.py", + "nvidia/embed-nemotron-dataset-v1", + "FEVER", + str(tmp_path), + ], + ) + + with pytest.raises(SystemExit) as exc_info: + materialize_hf_retrieval_subset.main() + + assert exc_info.value.code == 2 + + +def test_materialize_hf_retrieval_subset_overwrites_when_requested(tmp_path, monkeypatch): + (tmp_path / "existing.txt").write_text("replace allowed") + data_list = [ + { + "question_id": "q0", + "question": "What is the document?", + "corpus_id": "demo", + "pos_doc": [{"id": "d0"}], + "neg_doc": [], + } + ] + monkeypatch.setattr( + materialize_hf_retrieval_subset, + "_load_hf_subset", + lambda repo_id, subset: (data_list, _FakeCorpusInfo()), + ) + monkeypatch.setattr( + sys, + "argv", + [ + "materialize_hf_retrieval_subset.py", + "nvidia/embed-nemotron-dataset-v1", + "FEVER", + str(tmp_path), + "--overwrite", + ], + ) + + assert materialize_hf_retrieval_subset.main() == 0 + assert (tmp_path / "train.json").exists() diff --git a/tests/unit_tests/recipes/test_mine_hard_negatives.py b/tests/unit_tests/recipes/test_mine_hard_negatives.py index 7ecc5703ef..d9bc2894c2 100644 --- a/tests/unit_tests/recipes/test_mine_hard_negatives.py +++ b/tests/unit_tests/recipes/test_mine_hard_negatives.py @@ -331,10 +331,61 @@ def test_full_embeddings_cache_rejects_shape_mismatch(tmp_path): recipe._save_embeddings_to_cache(query_embeddings, document_embeddings) np.savez(tmp_path / QUERY_EMBEDDINGS_FNAME, np.ones((2, 2), dtype=np.float32)) - assert recipe._has_full_embeddings_cache() + assert not recipe._has_full_embeddings_cache() 
assert recipe._load_embeddings_from_cache() == (None, None) +def test_generate_embeddings_recomputes_when_full_cache_payload_is_invalid(tmp_path): + recipe = _make_cache_ready_recipe(tmp_path) + recipe._save_embeddings_to_cache(np.ones((1, 2), dtype=np.float32), np.ones((1, 2), dtype=np.float32)) + np.savez(tmp_path / QUERY_EMBEDDINGS_FNAME, np.ones((2, 2), dtype=np.float32)) + recomputed_query = np.full((1, 2), 2.0, dtype=np.float32) + recomputed_documents = np.full((1, 2), 3.0, dtype=np.float32) + recipe._encode_queries = lambda: recomputed_query + recipe._encode_all_documents = lambda: recomputed_documents + + query_embeddings, document_embeddings = recipe._generate_embeddings() + + np.testing.assert_array_equal(query_embeddings, recomputed_query) + np.testing.assert_array_equal(document_embeddings, recomputed_documents) + + +def test_generate_embeddings_reuses_partial_cache_when_metadata_matches(tmp_path): + recipe = _make_cache_ready_recipe(tmp_path) + expected_query = np.ones((1, 2), dtype=np.float32) + expected_documents = np.ones((1, 2), dtype=np.float32) + recipe._write_cache_metadata(expected_query, expected_documents) + + def encode_queries(): + assert recipe._reuse_partial_embedding_cache + return expected_query + + def encode_documents(): + assert recipe._reuse_partial_embedding_cache + return expected_documents + + recipe._encode_queries = encode_queries + recipe._encode_all_documents = encode_documents + + query_embeddings, document_embeddings = recipe._generate_embeddings() + + np.testing.assert_array_equal(query_embeddings, expected_query) + np.testing.assert_array_equal(document_embeddings, expected_documents) + + +def test_prepare_data_rejects_duplicate_question_ids(): + recipe = _make_recipe() + recipe.questions_dataset = [ + {"question_id": "q0", "question": "first", "corpus_id": "demo", "pos_doc": [{"id": "d0"}]}, + {"question_id": "q0", "question": "second", "corpus_id": "demo", "pos_doc": [{"id": "d0"}]}, + ] + recipe.doc_to_idx = {"d0": 0} 
+ recipe.use_negatives_from_file = False + + with pytest.raises(ValueError, match="unique question_id"): + recipe._prepare_data() + + def test_encode_queries_sharded_handles_empty_rank_shard(tmp_path): recipe = _make_recipe({"cache_embeddings_dir": str(tmp_path), "load_embeddings_from_cache": False}) recipe.cache_embeddings_dir = str(tmp_path) @@ -457,7 +508,7 @@ def test_mine_hard_negatives_skips_margin_when_positive_score_is_non_finite(): assert neg_indices == [[1]] assert neg_scores[0][0] == pytest.approx(0.2) - assert pos_scores[0][0] == float("-inf") + assert pos_scores[0][0] is None def test_mine_hard_negatives_rejects_token_level_embeddings(): From 46aeba1723f9b1649e45c4982e255022f982a8d2 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 21:10:20 +0100 Subject: [PATCH 23/25] fix(retrieval): validate partial mining caches Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- .../materialize_hf_retrieval_subset.py | 3 + .../recipes/retrieval/mine_hard_negatives.py | 49 +++++++++++----- .../test_materialize_hf_retrieval_subset.py | 5 ++ .../recipes/test_mine_hard_negatives.py | 57 +++++++++++++++++++ 4 files changed, 99 insertions(+), 15 deletions(-) diff --git a/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py b/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py index 9cd189254f..2000694ec6 100644 --- a/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py +++ b/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py @@ -17,6 +17,7 @@ import argparse import json import logging +import shutil from pathlib import Path from datasets import Dataset @@ -56,6 +57,8 @@ def main() -> int: else: output_dir.mkdir(parents=True) corpus_dir = output_dir / f"{args.subset}_corpus" + if args.overwrite and corpus_dir.exists(): + shutil.rmtree(corpus_dir) corpus_dir.mkdir(parents=True, exist_ok=True) data_list, corpus_info = 
_load_hf_subset(args.repo_id, args.subset) diff --git a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py index 385e07b05e..5a2911df0d 100644 --- a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py +++ b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py @@ -655,25 +655,29 @@ def _encode_queries_sharded(self) -> np.ndarray: return np.concatenate(parts, axis=0) if parts else np.empty((0, 0), dtype=np.float32) - def _load_cached_chunk(self, cache_path: Path) -> Optional[np.ndarray]: - """Load a fully-assembled chunk cache if it exists. - - In distributed mode, only rank0 loads the cache to avoid redundant IO. + def _load_cached_chunk(self, cache_path: Path, expected_size: Optional[int] = None) -> Optional[np.ndarray]: + """Load a fully-assembled chunk cache if it exists and matches the current chunk.""" + if not self._should_load_partial_cache(cache_path): + return None - Args: - cache_path: Path to cached chunk file. + try: + cached_chunk = _load_npz_array(cache_path) + except Exception as exc: + logger.warning("Ignoring unreadable document chunk cache %s: %s", cache_path, exc) + return None - Returns: - Cached embeddings array, or None if cache doesn't exist. 
- """ - if not self._should_load_partial_cache(cache_path): + if expected_size is not None and cached_chunk.shape[0] != expected_size: + logger.info( + "Cached document chunk %s has %s rows, expected %s; recomputing chunk.", + cache_path, + cached_chunk.shape[0], + expected_size, + ) return None - # In distributed runs, only rank0 needs the assembled chunk if self.dist_env.world_size > 1 and not self.dist_env.is_main: return np.empty((0, 0), dtype=np.float32) - - return _load_npz_array(cache_path) + return cached_chunk def _encode_chunk_distributed( self, @@ -703,9 +707,19 @@ def _encode_chunk_distributed( rank_cache_path = cache_path.parent / f"{cache_path.stem}_rank{r:04d}{cache_path.suffix}" # Compute or load this rank's slice + local_expected = local_end - local_start + local_embeds = None if self._should_load_partial_cache(rank_cache_path): local_embeds = _load_npz_array(rank_cache_path) - else: + if local_embeds.shape[0] != local_expected: + logger.info( + "Cached rank shard %s has %s rows, expected %s; recomputing shard.", + rank_cache_path, + local_embeds.shape[0], + local_expected, + ) + local_embeds = None + if local_embeds is None: local_texts = texts[local_start:local_end] local_embeds = self._encode_texts( texts=local_texts, @@ -780,7 +794,7 @@ def _encode_documents_chunk( numpy array of document embeddings [num_docs, embedding_dim]. 
""" # Fast path: load from cache if available - cached_result = self._load_cached_chunk(cache_path) + cached_result = self._load_cached_chunk(cache_path, expected_size=len(doc_indices)) if cached_result is not None: return cached_result @@ -941,6 +955,7 @@ def _build_cache_fingerprint(self) -> Dict[str, Any]: "passage_prefix": self.passage_prefix, "query_max_length": self.query_max_length, "passage_max_length": self.passage_max_length, + "corpus_chunk_size": self.corpus_chunk_size, "add_bos_token": self.add_bos_token, "add_eos_token": self.add_eos_token, "attn_implementation": self.attn_implementation, @@ -1495,6 +1510,10 @@ def _write_output(self) -> None: score = pos_scores[j] if score is not None and math.isfinite(float(score)): pos_doc["score"] = score + else: + pos_doc.pop("score", None) + else: + pos_doc.pop("score", None) # Remove legacy score fields if present if "pos_score" in row: diff --git a/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py b/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py index 0c663fe2e5..f798c6667a 100644 --- a/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py +++ b/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py @@ -91,6 +91,9 @@ def test_materialize_hf_retrieval_subset_rejects_non_empty_output_dir(tmp_path, def test_materialize_hf_retrieval_subset_overwrites_when_requested(tmp_path, monkeypatch): (tmp_path / "existing.txt").write_text("replace allowed") + stale_corpus = tmp_path / "FEVER_corpus" + stale_corpus.mkdir() + (stale_corpus / "train-00000-of-00001.parquet").write_text("stale") data_list = [ { "question_id": "q0", @@ -119,3 +122,5 @@ def test_materialize_hf_retrieval_subset_overwrites_when_requested(tmp_path, mon assert materialize_hf_retrieval_subset.main() == 0 assert (tmp_path / "train.json").exists() + assert not (tmp_path / "FEVER_corpus" / "train-00000-of-00001.parquet").exists() + assert (tmp_path / "FEVER_corpus" / 
"train.parquet").exists() diff --git a/tests/unit_tests/recipes/test_mine_hard_negatives.py b/tests/unit_tests/recipes/test_mine_hard_negatives.py index d9bc2894c2..bfda6326b2 100644 --- a/tests/unit_tests/recipes/test_mine_hard_negatives.py +++ b/tests/unit_tests/recipes/test_mine_hard_negatives.py @@ -279,6 +279,43 @@ def test_write_output_preserves_original_question_id(tmp_path): ] +def test_write_output_removes_stale_positive_score_for_non_finite_current_score(tmp_path): + input_file = tmp_path / "input.json" + output_file = tmp_path / "output.json" + input_file.write_text( + json.dumps( + { + "data": [ + { + "question_id": "q0", + "question": "Which doc is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": "p1", "score": 0.9}], + "neg_doc": [], + } + ] + } + ) + ) + recipe = _make_recipe( + { + "train_qa_file_path": str(input_file), + "train_file_output_path": str(output_file), + } + ) + recipe.train_qa_file_path = str(input_file) + recipe.train_file_output_path = str(output_file) + recipe.questions_dataset = json.loads(input_file.read_text())["data"] + recipe._build_negative_docs_by_question_id = lambda: {"q0": []} + recipe._build_positive_scores_by_question_id = lambda: {"q0": [None]} + recipe._get_mining_args_dict = lambda: {} + + recipe._write_output() + + output = json.loads(output_file.read_text()) + assert output["data"][0]["pos_doc"] == [{"id": "p1"}] + + def test_encode_texts_empty_input_returns_empty_array(): recipe = _make_recipe() @@ -297,6 +334,26 @@ def test_load_cached_chunk_ignored_when_cache_loading_disabled(tmp_path): assert recipe._load_cached_chunk(cache_path) is None +def test_load_cached_chunk_rejects_shape_mismatch(tmp_path): + recipe = _make_cache_ready_recipe(tmp_path) + recipe._reuse_partial_embedding_cache = True + cache_path = tmp_path / "chunk_0000.npz" + np.savez(cache_path, np.ones((2, 2), dtype=np.float32)) + + assert recipe._load_cached_chunk(cache_path, expected_size=1) is None + + +def 
test_cache_fingerprint_includes_corpus_chunk_size(tmp_path): + recipe = _make_cache_ready_recipe(tmp_path) + recipe.corpus_chunk_size = 2 + first_fingerprint = recipe._build_cache_fingerprint() + + recipe.corpus_chunk_size = 3 + + assert first_fingerprint["corpus_chunk_size"] == 2 + assert recipe._build_cache_fingerprint()["corpus_chunk_size"] == 3 + + def test_full_embeddings_cache_requires_matching_metadata(tmp_path): recipe = _make_cache_ready_recipe(tmp_path) query_embeddings = np.ones((1, 2), dtype=np.float32) From f983d557371f36833ee536bc4baed72959cb1f36 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 May 2026 21:39:17 +0100 Subject: [PATCH 24/25] fix(retrieval): close mining review gaps Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/llm/retrieval-finetuning.md | 97 +++++--- .../llama_embed_nemotron_8b/README.md | 9 +- .../materialize_hf_retrieval_subset.py | 68 ++++-- .../retrieval/data_utils/mining_config.yaml | 17 +- .../recipes/retrieval/mine_hard_negatives.py | 88 +++++--- .../test_materialize_hf_retrieval_subset.py | 30 +++ .../recipes/test_mine_hard_negatives.py | 207 ++++++++++++++++++ 7 files changed, 430 insertions(+), 86 deletions(-) diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 06c364d828..1bc0ac435e 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -50,10 +50,13 @@ Before running the examples: The commands below use `automodel`; if you are running from a source checkout, prefix them with `uv run`. For direct `torchrun` commands, use `uv run torchrun ...` from a source checkout, or activate an installed environment first. -Start with a one-GPU smoke test: +Start with a one-GPU smoke test. 
The timestamped checkpoint directory keeps this first command from silently resuming +or appending metrics to an older run: ```bash +RUN_ID=$(date +%Y%m%d-%H%M%S) automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 1 \ + --checkpoint.checkpoint_dir ./output/retrieval_smoke_${RUN_ID}/checkpoints \ --dist_env.timeout_minutes 30 \ --step_scheduler.global_batch_size 4 \ --step_scheduler.local_batch_size 1 \ @@ -61,22 +64,33 @@ automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 1 \ --dataloader.dataset.max_train_samples 40 ``` +`max_train_samples` shortens the training rows after the configured `hf://` split and corpus are loaded. This is a +training-step smoke test, not a cheap data-loading smoke test; first-run downloads and corpus loading can still take +time and disk. For a tiny data-loading test, point the config at a small local retrieval JSON/JSONL sample. + The first artifact to check is `training.jsonl` under `checkpoint.checkpoint_dir`. JSONL metrics are buffered, so stdout/stderr are still the best live signal during a very short run. Scale the Llama 3.2 1B bi-encoder example to the GPUs on your machine: ```bash -automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 8 +RUN_ID=$(date +%Y%m%d-%H%M%S) +automodel examples/retrieval/bi_encoder/llama3_2_1b.yaml --nproc-per-node 8 \ + --checkpoint.checkpoint_dir ./output/llama3_2_1b_encoder_${RUN_ID}/checkpoints ``` Run the matching cross-encoder example: ```bash -automodel examples/retrieval/cross_encoder/llama3_2_1b.yaml --nproc-per-node 8 +RUN_ID=$(date +%Y%m%d-%H%M%S) +automodel examples/retrieval/cross_encoder/llama3_2_1b.yaml --nproc-per-node 8 \ + --checkpoint.checkpoint_dir ./output/llama3_2_1b_cross_encoder_${RUN_ID}/checkpoints ``` -Adjust `--nproc-per-node` to the number of GPUs on your machine. The examples use FSDP2 and bfloat16 by default. +Adjust `--nproc-per-node` to the number of GPUs on your machine. 
The examples use FSDP2 and bfloat16 by default. The +example scheduler uses `global_batch_size: 128` and `local_batch_size: 4`, so GPU counts that do not divide `32` need an +explicit `--step_scheduler.global_batch_size` override. For example, 6 GPUs can use +`--step_scheduler.global_batch_size 120` or another multiple of `4 * 6`. ## Choose a Recipe @@ -113,6 +127,11 @@ cross-encoder scoring recipe. If you are extracting a text tower from a parent c types use registered retrieval classes and unsupported extracted types can fall back to Hugging Face sequence classification for cross-encoder scoring. +Treat unregistered decoder-only fallback models as an architecture experiment, not just a drop-in model swap. Registered +retrieval backbones such as the Llama bidirectional path use retrieval-specific attention behavior; a vanilla +Hugging Face `AutoModel` fallback can keep the source model's original causal behavior, which may be lower quality for +symmetric embedding retrieval. + ## Prepare Data Use the retrieval dataset format described in [Retrieval Dataset](retrieval-dataset.md). 
Choose the data path that @@ -216,7 +235,7 @@ tokenizer: dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader dataset: - _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset + _target_: nemo_automodel.components.datasets.llm.retrieval_dataset.make_retrieval_dataset model_type: bi_encoder data_dir_list: - /path/to/train.jsonl @@ -288,7 +307,7 @@ tokenizer: dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader dataset: - _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset + _target_: nemo_automodel.components.datasets.llm.retrieval_dataset.make_retrieval_dataset model_type: bi_encoder data_dir_list: - /path/to/train.jsonl @@ -426,16 +445,20 @@ HF data on each node or use a shared cache, and budget CPU RAM and local disk pe ## Add Validation -Both examples include a commented `validation_dataloader` block. Enable it when you have a held-out retrieval file: +Both examples include a commented `validation_dataloader` block. Enable it when you have a held-out retrieval file. +Use the same dataset family as the validation source: +`nemo_automodel.components.datasets.llm.retrieval_dataset.make_retrieval_dataset` for `hf://` or corpus ID-based JSON, +and `retrieval_dataset_inline.make_retrieval_dataset` only for inline JSONL. This corpus-backed example mirrors the +shipped configs: ```yaml validation_dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader dataset: - _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset + _target_: nemo_automodel.components.datasets.llm.retrieval_dataset.make_retrieval_dataset model_type: bi_encoder data_dir_list: - - /path/to/validation.jsonl + - /path/to/validation.json data_type: eval n_passages: 5 seed: 42 @@ -454,7 +477,10 @@ validation_dataloader: Validation logs `val_loss`, `val_acc1`, and `val_mrr` to `validation.jsonl` under `checkpoint.checkpoint_dir`. 
These metrics measure ranking within each candidate group in the validation file; they are not full-corpus Recall@K or nDCG -metrics. For cross-encoder validation, use `model_type: cross_encoder` and `CrossEncoderCollator` instead. +metrics. For cross-encoder validation, use `model_type: cross_encoder` and `CrossEncoderCollator` instead. In multi-rank +runs, validation uses the same distributed sampler path as training and can drop tail examples to keep rank shapes even; +make the validation set divisible by the data-parallel world size or run validation on one GPU when you need every +candidate group included. ```bash tail -n 5 ./output/llama3_2_1b_encoder/checkpoints/validation.jsonl @@ -474,7 +500,9 @@ candidate generation, evaluate against a fixed held-out corpus and qrels: AutoModel does not currently provide a one-command full-corpus retrieval evaluator in this guide. Use your existing IR evaluation stack or a small script around the consolidated checkpoint and report enough run details to make the result repeatable: query count, corpus size, qrels source, judged/unjudged handling, exact versus ANN search settings, K -values, baseline checkpoint, and whether confidence intervals or significance tests were used. +values, baseline checkpoint, and whether confidence intervals or significance tests were used. At minimum, make the +script inputs explicit: a consolidated bi-encoder checkpoint, a corpus table with stable document IDs, a query table with +stable query IDs, qrels keyed by those IDs, the query/passage prefixes and max lengths, and the K values to report. For cross-encoders, freeze a first-stage retriever, rerank its top-K candidates, and report reranking metrics on that same candidate set. Also report first-stage candidate recall or coverage: if a query's positive document is missing @@ -529,8 +557,8 @@ memory and want maximum adaptation. After an initial bi-encoder run, mine harder negatives with the consolidated encoder checkpoint. 
Hard-negative mining expects the corpus ID-based retrieval JSON format described in the dataset guide, not the inline JSONL shortcut. The input must reference one corpus so the miner can build a passage embedding cache, retrieve candidates, and write mined -negatives back to each query. Each mining row must have a unique `question_id`; duplicate IDs make the output mapping -ambiguous and are rejected. +negatives back to each query. Every row's `corpus_id` must match that single loaded corpus, and each mining row must have +a unique `question_id`; mismatched corpus IDs and duplicate question IDs are rejected before mining. The quickstart configs use `hf://` sources for the first train/eval path. The miner currently reads a local corpus-backed retrieval JSON file instead of `hf://` URIs directly. For a train -> mine -> retrain loop, first @@ -548,8 +576,14 @@ uv run python examples/retrieval/data_utils/materialize_hf_retrieval_subset.py \ This writes `/path/to/retrieval-data/fever-mining/train.json` and a local `FEVER_corpus/` directory. Run the command once per subset/corpus that you want to mine; the helper intentionally processes one corpus per run and refuses to write -into a non-empty output directory unless you pass `--overwrite`. The mining examples below use that local -`train.json` path. +into a non-empty output directory unless you pass `--overwrite`. With `--overwrite`, it writes replacement files in +temporary paths and swaps them in only after the new subset has loaded and serialized successfully. The mining examples +below use that local `train.json` path. + +The default bi-encoder example trains from both `FEVER` and `SyntheticClassificationData`. For a full train -> mine -> +retrain loop, materialize and mine each subset separately, give each mined output and embedding cache its own run-specific +path, then list all mined JSON files in the next config's `data_dir_list`. 
Replacing a multi-source config with only one +mined file intentionally trains on that subset only. This single-node example uses `--standalone`: @@ -558,7 +592,7 @@ uv run torchrun --standalone --nproc_per_node=8 examples/retrieval/data_utils/mi --config examples/retrieval/data_utils/mining_config.yaml \ --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_499/model/consolidated \ --mining.train_qa_file_path /path/to/retrieval-data/fever-mining/train.json \ - --mining.train_file_output_path /path/to/output.json \ + --mining.train_file_output_path /path/to/retrieval-data/fever-mined/train.json \ --mining.cache_embeddings_dir /shared/path/to/cache/llama3_2_1b_fever_mine_v1 \ --mining.query_prefix "query: " \ --mining.passage_prefix "passage: " \ @@ -580,7 +614,7 @@ uv run torchrun \ --config examples/retrieval/data_utils/mining_config.yaml \ --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_499/model/consolidated \ --mining.train_qa_file_path /path/to/retrieval-data/fever-mining/train.json \ - --mining.train_file_output_path /path/to/output.json \ + --mining.train_file_output_path /path/to/retrieval-data/fever-mined/train.json \ --mining.cache_embeddings_dir /shared/path/to/cache/llama3_2_1b_fever_mine_v1 \ --mining.query_prefix "query: " \ --mining.passage_prefix "passage: " \ @@ -591,7 +625,9 @@ uv run torchrun \ Replace `epoch_0_step_499` with the explicit checkpoint directory that you want to mine from. If you only have `LATEST.txt`, read it first and substitute the resolved `epoch_*_step_*` directory; the mining script loads the -Hugging Face export directly and does not apply AutoModel's checkpoint resolver. +Hugging Face export directly and does not apply AutoModel's checkpoint resolver. The miner refuses to overwrite an +existing `train_file_output_path` by default. Choose a new output path for each mining run, or pass +`--mining.overwrite_output true` only when replacing that file is intentional. 
Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: @@ -611,15 +647,20 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: to tokenizer defaults, which can differ from the training config. - `use_negatives_from_file`: include existing negatives from the input file when mining. Existing negatives are prepended to the output and mined negatives are appended, so deduplicate and audit the output before using it for training. +- `overwrite_output`: defaults to `false`. Set it to `true` only when you intentionally want to replace an existing + mined output file; the input and output paths must still be different. +- `attn_implementation`: optional model-loading escape hatch for mining exports that need `sdpa`, `eager`, or + `flash_attention_2` pinned. - `cache_embeddings_dir`: required for distributed mining so ranks can share cached passage embeddings. Rank `0` assembles the final embedding cache and score outputs, so plan memory and local disk accordingly. In multi-node mining, this must be a shared writable path mounted at the same location on every node; node-local cache paths leave rank `0` unable to read remote-rank shards. Use a fresh cache directory for each model, dataset, prefix, sequence - length, and world-size combination. The miner validates cache metadata and loads the consolidated arrays to verify - fingerprint, shape, and readability before reusing a consolidated cache. The fingerprint includes the mining input + length, `corpus_chunk_size`, and world-size combination. The miner validates cache metadata and loads the + consolidated arrays to verify fingerprint, shape, and readability before reusing a consolidated cache. The fingerprint includes the mining input file, local model/tokenizer path state, ordered document IDs/content, and embedding settings. Fresh run-specific paths are still easier to reason about, especially for mutable Hub IDs or paths that are overwritten in place. 
Set `load_embeddings_from_cache: true` only when you intentionally want to reuse every cached - query shard, corpus chunk, and consolidated embedding file from the same model/input/prefix/length/world-size run. + query shard, corpus chunk, and consolidated embedding file from the same + model/input/prefix/length/`corpus_chunk_size`/world-size run. `pooling` and `l2_normalize` are saved bi-encoder wrapper metadata, not `mining.*` config fields. Do not pass `--mining.pooling` or `--mining.l2_normalize`; the miner rejects unknown mining keys. Mine from a saved bi-encoder export @@ -638,12 +679,14 @@ Hard-negative mining parallelizes embedding generation across ranks, but the fin rank `0` and materializes the full document embedding matrix there. For very large corpora, use a smaller mining slice or a custom ANN/blockwise mining workflow instead of expecting this helper to scale to web-scale indexing. -Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. Hard -negative mining excludes document IDs listed in each input row's `pos_doc`, but it cannot read an external qrels file or -know every semantically relevant duplicate. Put all known positive IDs for the query in the mining input, deduplicate the -corpus, inspect mined samples, filter duplicate IDs and non-finite scores such as `-inf` from mined outputs, and avoid -mining from validation or test corpora. If you unroll multi-positive training data, mine from rows that still carry every -known positive in `pos_doc`; otherwise sibling positives can be mined as false negatives. +Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. If +the previous run used multiple sources, list the mined file for each source. Hard negative mining excludes document IDs +listed in each input row's `pos_doc`, but it cannot read an external qrels file or know every semantically relevant +duplicate. 
Put all known positive IDs for the query in the mining input, deduplicate the corpus, inspect mined samples, +filter duplicate IDs and non-finite scores such as `-inf` from mined outputs, and avoid mining from validation or test +corpora. If you unroll multi-positive training data, mine from rows that still carry every known positive in `pos_doc`; +otherwise sibling positives can be mined as false negatives. Custom row-level metadata from the input JSON is preserved +in the mined output, while `neg_doc` and positive-document scores are refreshed. Run the audit utility before reusing mined output: diff --git a/examples/retrieval/bi_encoder/llama_embed_nemotron_8b/README.md b/examples/retrieval/bi_encoder/llama_embed_nemotron_8b/README.md index c93debfd69..d6716ecf09 100644 --- a/examples/retrieval/bi_encoder/llama_embed_nemotron_8b/README.md +++ b/examples/retrieval/bi_encoder/llama_embed_nemotron_8b/README.md @@ -12,19 +12,20 @@ This guide provides step-by-step instructions to reproduce the training pipeline Download and prepare the `nvidia/embed-nemotron-dataset-v1` dataset from [Hugging Face](https://huggingface.co/datasets/nvidia/embed-nemotron-dataset-v1). This dataset is a selected subset of the fine-tuning data used for training the `llama-embed-nemotron-8b` model: -```python -python examples/retrieval/bi_encoder/llama_embed_nemotron_8b/data_preparation.py \ +```bash +uv run python examples/retrieval/bi_encoder/llama_embed_nemotron_8b/data_preparation.py \ --download-path ./embed_nemotron_dataset_v1 ``` -This script will download the dataset and prepare it for training. +Run this command from the repository root, or update the relative paths in the YAML. This script will download the dataset and prepare it for training. + ### 2. 
Run Model Finetuning Run the model finetuning with the specified configuration using 8 GPUs: ```bash -automodel examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml --nproc-per-node 8 +uv run automodel examples/retrieval/bi_encoder/llama_embed_nemotron_8b/llama_embed_nemotron_8b.yaml --nproc-per-node 8 ``` The final model checkpoint in Hugging Face format will be stored in `output/llama_embed_nemotron_8b/epoch_0_step_28614/model/consolidated` diff --git a/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py b/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py index 2000694ec6..08de08224e 100644 --- a/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py +++ b/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py @@ -18,6 +18,7 @@ import json import logging import shutil +import tempfile from pathlib import Path from datasets import Dataset @@ -56,32 +57,55 @@ def main() -> int: parser.error(f"output_dir is not empty: {output_dir}. 
Pass --overwrite to replace existing files.") else: output_dir.mkdir(parents=True) + data_list, corpus_info = _load_hf_subset(args.repo_id, args.subset) corpus_dir = output_dir / f"{args.subset}_corpus" - if args.overwrite and corpus_dir.exists(): - shutil.rmtree(corpus_dir) - corpus_dir.mkdir(parents=True, exist_ok=True) + tmp_corpus_dir = Path(tempfile.mkdtemp(prefix=f".{args.subset}_corpus.", dir=output_dir)) + tmp_train_path = None + backup_corpus_dir = None + try: + doc_rows = [] + for doc_id in corpus_info.get_all_ids(): + document = corpus_info.get_document_by_id(doc_id) + doc_rows.append({"id": str(doc_id), "text": document.get("text", "")}) - data_list, corpus_info = _load_hf_subset(args.repo_id, args.subset) - doc_rows = [] - for doc_id in corpus_info.get_all_ids(): - document = corpus_info.get_document_by_id(doc_id) - doc_rows.append({"id": str(doc_id), "text": document.get("text", "")}) + Dataset.from_list(doc_rows).to_parquet(str(tmp_corpus_dir / "train.parquet")) + metadata = { + **corpus_info.metadata, + "class": "TextQADataset", + "corpus_id": corpus_info.corpus_id, + } + with open(tmp_corpus_dir / "merlin_metadata.json", "w") as f: + json.dump(metadata, f, indent=2, sort_keys=True) - Dataset.from_list(doc_rows).to_parquet(str(corpus_dir / "train.parquet")) - metadata = { - **corpus_info.metadata, - "class": "TextQADataset", - "corpus_id": corpus_info.corpus_id, - } - with open(corpus_dir / "merlin_metadata.json", "w") as f: - json.dump(metadata, f, indent=2, sort_keys=True) + train_json = { + "corpus": [{"path": f"./{corpus_dir.name}"}], + "data": data_list, + } + with tempfile.NamedTemporaryFile("w", dir=output_dir, prefix=".train.", suffix=".json", delete=False) as f: + tmp_train_path = Path(f.name) + json.dump(train_json, f, indent=2) - train_json = { - "corpus": [{"path": f"./{corpus_dir.name}"}], - "data": data_list, - } - with open(output_dir / "train.json", "w") as f: - json.dump(train_json, f, indent=2) + if corpus_dir.exists(): + 
backup_corpus_dir = Path(tempfile.mkdtemp(prefix=f".{corpus_dir.name}.backup.", dir=output_dir)) + backup_corpus_dir.rmdir() + shutil.move(str(corpus_dir), str(backup_corpus_dir)) + shutil.move(str(tmp_corpus_dir), str(corpus_dir)) + tmp_corpus_dir = None + tmp_train_path.replace(output_dir / "train.json") + tmp_train_path = None + except Exception: + if backup_corpus_dir is not None and backup_corpus_dir.exists(): + if corpus_dir.exists(): + shutil.rmtree(corpus_dir) + shutil.move(str(backup_corpus_dir), str(corpus_dir)) + raise + finally: + if tmp_corpus_dir is not None and tmp_corpus_dir.exists(): + shutil.rmtree(tmp_corpus_dir) + if tmp_train_path is not None and tmp_train_path.exists(): + tmp_train_path.unlink() + if backup_corpus_dir is not None and backup_corpus_dir.exists(): + shutil.rmtree(backup_corpus_dir) logger.info("Wrote %s records and %s corpus documents to %s", len(data_list), len(doc_rows), output_dir) return 0 diff --git a/examples/retrieval/data_utils/mining_config.yaml b/examples/retrieval/data_utils/mining_config.yaml index 935091645e..b4fd59700b 100644 --- a/examples/retrieval/data_utils/mining_config.yaml +++ b/examples/retrieval/data_utils/mining_config.yaml @@ -21,7 +21,7 @@ # --config examples/retrieval/data_utils/mining_config.yaml \ # --mining.model_name_or_path ./output/llama3_2_1b_encoder/checkpoints/epoch_0_step_499/model/consolidated \ # --mining.train_qa_file_path /path/to/input.json \ -# --mining.train_file_output_path /path/to/output.json \ +# --mining.train_file_output_path /path/to/mined.json \ # --mining.cache_embeddings_dir /shared/path/to/cache/ \ # --mining.query_prefix "query: " \ # --mining.passage_prefix "passage: " \ @@ -43,7 +43,7 @@ dist_env: # mining.train_qa_file_path: Input QA file # mining.train_file_output_path: Output file with mined negatives # mining.cache_embeddings_dir: Required for multi-rank torchrun; optional only for single-process mining. 
-# Use a fresh path per model/dataset/prefix/length/world-size combination. +# Use a fresh path per model/dataset/prefix/length/corpus_chunk_size/world-size combination. mining: # Model path - REQUIRED (override via --mining.model_name_or_path) @@ -52,6 +52,9 @@ mining: # Tokenizer path - defaults to model_name_or_path if not specified # tokenizer_name_or_path: /path/to/tokenizer + # Optional attention implementation for model loading: sdpa, eager, or flash_attention_2. + # attn_implementation: sdpa + # Tokenizer special token behavior. Match the tokenizer settings used during training. # If left commented, mining falls back to tokenizer defaults, which may differ from # the fine-tuning config. @@ -61,14 +64,16 @@ mining: # Input/Output - REQUIRED # train_qa_file_path: /path/to/input.json - # train_file_output_path: /path/to/output.json + # train_file_output_path: /path/to/mined.json + # Existing output files are rejected unless overwrite_output is true. + overwrite_output: false # Caching (required for multi-rank torchrun; use a shared writable path for multi-node). - # Use a fresh directory per model, dataset, prefix, sequence-length, and world-size combination. + # Use a fresh directory per model, dataset, prefix, sequence-length, corpus_chunk_size, and world-size combination. # The miner validates cache metadata plus consolidated array shape/readability before reuse. # Fresh directories are still clearest for mutable Hub IDs or overwritten local paths. # cache_embeddings_dir: /path/to/cache/ - # Set true only when every cached shard/chunk came from the same model/input/prefix/length/world-size run. + # Set true only when every cached shard/chunk came from the same model/input/prefix/length/corpus_chunk_size/world-size run. load_embeddings_from_cache: false # Mining parameters @@ -98,5 +103,5 @@ mining: # Whether to include negatives from the input file. 
Existing negatives are prepended to the mined output; # deduplicate/audit the final file before training: - # uv run python examples/retrieval/data_utils/audit_mined_negatives.py /path/to/output.json --min-negatives 1 --allow-findings + # uv run python examples/retrieval/data_utils/audit_mined_negatives.py /path/to/mined.json --min-negatives 1 --allow-findings use_negatives_from_file: false diff --git a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py index 5a2911df0d..2ace77e6e9 100644 --- a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py +++ b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py @@ -59,6 +59,7 @@ "corpus_chunk_size": 50000, "load_embeddings_from_cache": False, "use_negatives_from_file": False, + "overwrite_output": False, # Prefix and length configuration for embedding generation "query_prefix": "", "passage_prefix": "", @@ -180,6 +181,7 @@ def __init__(self, cfg): self.corpus_chunk_size = None self.load_embeddings_from_cache = None self.use_negatives_from_file = None + self.overwrite_output = None # Model loading parameters (populated in setup) self.model_name_or_path = None @@ -320,6 +322,7 @@ def _extract_mining_params(self): self.corpus_chunk_size = self._get_mining_param("corpus_chunk_size") self.load_embeddings_from_cache = self._get_mining_param("load_embeddings_from_cache") self.use_negatives_from_file = self._get_mining_param("use_negatives_from_file") + self.overwrite_output = self._get_mining_param("overwrite_output") # Model loading: tokenizer defaults to model path if not specified self.tokenizer_name_or_path = self._get_mining_param("tokenizer_name_or_path") @@ -352,6 +355,16 @@ def _validate_mining_params(self): if self.model_name_or_path is None: raise ValueError("Missing required parameter: --mining.model_name_or_path") + input_path = Path(self.train_qa_file_path).expanduser().resolve(strict=False) + output_path = 
Path(self.train_file_output_path).expanduser().resolve(strict=False) + if output_path == input_path: + raise ValueError("train_file_output_path must be different from train_qa_file_path") + if output_path.exists() and not self.overwrite_output: + raise ValueError( + f"Output file already exists: {output_path}. " + "Choose a new --mining.train_file_output_path or pass --mining.overwrite_output true." + ) + # Validate margin type if margin is specified if self.hard_neg_margin is not None: valid_types = ["perc", "abs"] @@ -464,7 +477,13 @@ def _prepare_data(self): questions.append(question_text) question_ids.append(row["question_id"]) - corpus_ids.append(row["corpus_id"]) + corpus_id = row["corpus_id"] + if corpus_id != self.corpus_id: + raise ValueError( + f"Mining input row {row['question_id']} references corpus_id {corpus_id!r}, " + f"but this mining run loaded corpus_id {self.corpus_id!r}. Run one corpus per mining job." + ) + corpus_ids.append(corpus_id) # Map positive doc IDs to indices pos_indices = [self.doc_to_idx[doc["id"]] for doc in row["pos_doc"]] @@ -621,9 +640,9 @@ def _encode_queries_sharded(self) -> np.ndarray: shard_path = shard_dir / f"queries_rank{r:04d}.npz" # Compute or load this rank's shard - if self._should_load_partial_cache(shard_path): - local_embeds = _load_npz_array(shard_path) - else: + local_expected = local_end - local_start + local_embeds = self._load_partial_cache_shard(shard_path, local_expected, "query shard") + if local_embeds is None: local_texts = self.questions[local_start:local_end] local_embeds = self._encode_texts( texts=local_texts, @@ -655,26 +674,39 @@ def _encode_queries_sharded(self) -> np.ndarray: return np.concatenate(parts, axis=0) if parts else np.empty((0, 0), dtype=np.float32) - def _load_cached_chunk(self, cache_path: Path, expected_size: Optional[int] = None) -> Optional[np.ndarray]: - """Load a fully-assembled chunk cache if it exists and matches the current chunk.""" + def _load_partial_cache_shard( + self, + 
cache_path: Path, + expected_size: Optional[int] = None, + label: str = "embedding shard", + ) -> Optional[np.ndarray]: + """Load a reusable partial cache shard, validating readability and row count.""" if not self._should_load_partial_cache(cache_path): return None try: - cached_chunk = _load_npz_array(cache_path) + cached_shard = _load_npz_array(cache_path) except Exception as exc: - logger.warning("Ignoring unreadable document chunk cache %s: %s", cache_path, exc) + logger.warning("Ignoring unreadable %s cache %s: %s", label, cache_path, exc) return None - if expected_size is not None and cached_chunk.shape[0] != expected_size: + if expected_size is not None and cached_shard.shape[0] != expected_size: logger.info( - "Cached document chunk %s has %s rows, expected %s; recomputing chunk.", + "Cached %s %s has %s rows, expected %s; recomputing shard.", + label, cache_path, - cached_chunk.shape[0], + cached_shard.shape[0], expected_size, ) return None + return cached_shard + + def _load_cached_chunk(self, cache_path: Path, expected_size: Optional[int] = None) -> Optional[np.ndarray]: + """Load a fully-assembled chunk cache if it exists and matches the current chunk.""" + cached_chunk = self._load_partial_cache_shard(cache_path, expected_size, "document chunk") + if cached_chunk is None: + return None if self.dist_env.world_size > 1 and not self.dist_env.is_main: return np.empty((0, 0), dtype=np.float32) return cached_chunk @@ -708,17 +740,7 @@ def _encode_chunk_distributed( # Compute or load this rank's slice local_expected = local_end - local_start - local_embeds = None - if self._should_load_partial_cache(rank_cache_path): - local_embeds = _load_npz_array(rank_cache_path) - if local_embeds.shape[0] != local_expected: - logger.info( - "Cached rank shard %s has %s rows, expected %s; recomputing shard.", - rank_cache_path, - local_embeds.shape[0], - local_expected, - ) - local_embeds = None + local_embeds = self._load_partial_cache_shard(rank_cache_path, 
local_expected, "document rank shard") if local_embeds is None: local_texts = texts[local_start:local_end] local_embeds = self._encode_texts( @@ -1413,6 +1435,7 @@ def _get_mining_args_dict(self) -> Dict[str, Any]: "document_embedding_batch_size": self.document_embedding_batch_size, "corpus_chunk_size": self.corpus_chunk_size, "use_negatives_from_file": self.use_negatives_from_file, + "overwrite_output": self.overwrite_output, "query_prefix": self.query_prefix, "passage_prefix": self.passage_prefix, "query_max_length": self.query_max_length, @@ -1493,11 +1516,11 @@ def _write_output(self) -> None: # Add mining metadata output["mining"] = {"args": self._get_mining_args_dict()} - # Clear data and rebuild with enriched rows + # Iterate through original rows when available so custom row metadata round-trips. + source_rows = output.get("data") or self.questions_dataset output["data"] = [] - - # Iterate through original dataset and enrich with mining results - for row in self.questions_dataset: + for source_row in source_rows: + row = dict(source_row) question_id = row["question_id"] # Replace neg_doc with mined negatives @@ -1505,7 +1528,8 @@ def _write_output(self) -> None: # Add scores to positive docs pos_scores = pos_scores_by_qid.get(question_id, []) - for j, pos_doc in enumerate(row.get("pos_doc", [])): + row["pos_doc"] = [dict(pos_doc) for pos_doc in row.get("pos_doc", [])] + for j, pos_doc in enumerate(row["pos_doc"]): if j < len(pos_scores): score = pos_scores[j] if score is not None and math.isfinite(float(score)): @@ -1525,6 +1549,15 @@ def _write_output(self) -> None: # Ensure output directory exists output_path = Path(self.train_file_output_path) + input_path = Path(self.train_qa_file_path).expanduser().resolve(strict=False) + resolved_output_path = output_path.expanduser().resolve(strict=False) + if resolved_output_path == input_path: + raise ValueError("train_file_output_path must be different from train_qa_file_path") + if resolved_output_path.exists() 
and not self.overwrite_output: + raise ValueError( + f"Output file already exists: {resolved_output_path}. " + "Choose a new --mining.train_file_output_path or pass --mining.overwrite_output true." + ) output_path.parent.mkdir(parents=True, exist_ok=True) # Write output file with formatting @@ -1612,6 +1645,7 @@ def _print_configuration(self): print(f" corpus_chunk_size: {self.corpus_chunk_size}") print(f" load_embeddings_from_cache: {self.load_embeddings_from_cache}") print(f" use_negatives_from_file: {self.use_negatives_from_file}") + print(f" overwrite_output: {self.overwrite_output}") print("\nEmbedding configuration:") print(f" query_prefix: '{self.query_prefix}'") print(f" passage_prefix: '{self.passage_prefix}'") diff --git a/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py b/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py index f798c6667a..1d31a9fddc 100644 --- a/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py +++ b/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py @@ -124,3 +124,33 @@ def test_materialize_hf_retrieval_subset_overwrites_when_requested(tmp_path, mon assert (tmp_path / "train.json").exists() assert not (tmp_path / "FEVER_corpus" / "train-00000-of-00001.parquet").exists() assert (tmp_path / "FEVER_corpus" / "train.parquet").exists() + + +def test_materialize_hf_retrieval_subset_overwrite_keeps_existing_output_when_load_fails(tmp_path, monkeypatch): + existing_train_json = tmp_path / "train.json" + existing_train_json.write_text('{"corpus": [{"path": "./FEVER_corpus"}], "data": []}') + stale_corpus = tmp_path / "FEVER_corpus" + stale_corpus.mkdir() + (stale_corpus / "train.parquet").write_text("still valid") + + def raise_load_error(repo_id, subset): + raise RuntimeError("temporary load failure") + + monkeypatch.setattr(materialize_hf_retrieval_subset, "_load_hf_subset", raise_load_error) + monkeypatch.setattr( + sys, + "argv", + [ + 
"materialize_hf_retrieval_subset.py", + "nvidia/embed-nemotron-dataset-v1", + "FEVER", + str(tmp_path), + "--overwrite", + ], + ) + + with pytest.raises(RuntimeError, match="temporary load failure"): + materialize_hf_retrieval_subset.main() + + assert existing_train_json.read_text() == '{"corpus": [{"path": "./FEVER_corpus"}], "data": []}' + assert (stale_corpus / "train.parquet").read_text() == "still valid" diff --git a/tests/unit_tests/recipes/test_mine_hard_negatives.py b/tests/unit_tests/recipes/test_mine_hard_negatives.py index bfda6326b2..9e1c74b1c5 100644 --- a/tests/unit_tests/recipes/test_mine_hard_negatives.py +++ b/tests/unit_tests/recipes/test_mine_hard_negatives.py @@ -279,6 +279,54 @@ def test_write_output_preserves_original_question_id(tmp_path): ] +def test_write_output_preserves_custom_row_metadata(tmp_path): + input_file = tmp_path / "input.json" + output_file = tmp_path / "output.json" + input_file.write_text( + json.dumps( + { + "data": [ + { + "question_id": "q0", + "question": "Which doc is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": "p1"}], + "neg_doc": [{"id": "old"}], + "source_split": "curated-dev", + } + ] + } + ) + ) + recipe = _make_recipe( + { + "train_qa_file_path": str(input_file), + "train_file_output_path": str(output_file), + } + ) + recipe.train_qa_file_path = str(input_file) + recipe.train_file_output_path = str(output_file) + recipe.questions_dataset = [ + { + "question_id": "q0", + "question": "Which doc is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": "p1"}], + "neg_doc": [], + } + ] + recipe._build_negative_docs_by_question_id = lambda: {"q0": [{"id": "n1", "score": 0.2}]} + recipe._build_positive_scores_by_question_id = lambda: {"q0": [0.9]} + recipe._get_mining_args_dict = lambda: {} + + recipe._write_output() + + output = json.loads(output_file.read_text()) + assert output["data"][0]["source_split"] == "curated-dev" + assert output["data"][0]["neg_doc"] == [{"id": "n1", "score": 0.2}] + assert 
output["data"][0]["pos_doc"] == [{"id": "p1", "score": 0.9}] + + def test_write_output_removes_stale_positive_score_for_non_finite_current_score(tmp_path): input_file = tmp_path / "input.json" output_file = tmp_path / "output.json" @@ -316,6 +364,56 @@ def test_write_output_removes_stale_positive_score_for_non_finite_current_score( assert output["data"][0]["pos_doc"] == [{"id": "p1"}] +def test_validate_mining_params_rejects_existing_output_without_overwrite(tmp_path): + input_file = tmp_path / "input.json" + output_file = tmp_path / "output.json" + input_file.write_text("{}") + output_file.write_text("existing") + recipe = _make_recipe( + { + "train_qa_file_path": str(input_file), + "train_file_output_path": str(output_file), + } + ) + recipe._extract_mining_params() + + with pytest.raises(ValueError, match="Output file already exists"): + recipe._validate_mining_params() + + +def test_validate_mining_params_allows_existing_output_with_overwrite(tmp_path): + input_file = tmp_path / "input.json" + output_file = tmp_path / "output.json" + input_file.write_text("{}") + output_file.write_text("existing") + recipe = _make_recipe( + { + "train_qa_file_path": str(input_file), + "train_file_output_path": str(output_file), + "overwrite_output": True, + } + ) + recipe._extract_mining_params() + + recipe._validate_mining_params() + + +def test_validate_mining_params_rejects_input_output_same_path(tmp_path): + input_file = tmp_path / "input.json" + input_file.write_text("{}") + recipe = _make_recipe( + { + "train_qa_file_path": str(input_file), + "train_file_output_path": str(input_file), + "overwrite_output": True, + } + ) + recipe._extract_mining_params() + + with pytest.raises(ValueError, match="must be different"): + recipe._validate_mining_params() + + def test_encode_texts_empty_input_returns_empty_array(): recipe = _make_recipe() @@ -354,6 +452,25 @@ def test_cache_fingerprint_includes_corpus_chunk_size(tmp_path): assert 
recipe._build_cache_fingerprint()["corpus_chunk_size"] == 3 +def test_load_partial_cache_shard_rejects_shape_mismatch(tmp_path): + recipe = _make_cache_ready_recipe(tmp_path) + recipe._reuse_partial_embedding_cache = True + cache_path = tmp_path / "query_shards" / "queries_rank0000.npz" + cache_path.parent.mkdir() + np.savez(cache_path, np.ones((2, 2), dtype=np.float32)) + + assert recipe._load_partial_cache_shard(cache_path, expected_size=1, label="query shard") is None + + +def test_load_partial_cache_shard_rejects_unreadable_npz(tmp_path): + recipe = _make_cache_ready_recipe(tmp_path) + recipe._reuse_partial_embedding_cache = True + cache_path = tmp_path / "chunk_0000_rank0000.npz" + cache_path.write_text("not an npz") + + assert recipe._load_partial_cache_shard(cache_path, expected_size=1, label="document rank shard") is None + + def test_full_embeddings_cache_requires_matching_metadata(tmp_path): recipe = _make_cache_ready_recipe(tmp_path) query_embeddings = np.ones((1, 2), dtype=np.float32) @@ -437,12 +554,26 @@ def test_prepare_data_rejects_duplicate_question_ids(): {"question_id": "q0", "question": "second", "corpus_id": "demo", "pos_doc": [{"id": "d0"}]}, ] recipe.doc_to_idx = {"d0": 0} + recipe.corpus_id = "demo" recipe.use_negatives_from_file = False with pytest.raises(ValueError, match="unique question_id"): recipe._prepare_data() +def test_prepare_data_rejects_mismatched_corpus_id(): + recipe = _make_recipe() + recipe.corpus_id = "expected" + recipe.questions_dataset = [ + {"question_id": "q0", "question": "first", "corpus_id": "other", "pos_doc": [{"id": "d0"}]}, + ] + recipe.doc_to_idx = {"d0": 0} + recipe.use_negatives_from_file = False + + with pytest.raises(ValueError, match="Run one corpus per mining job"): + recipe._prepare_data() + + def test_encode_queries_sharded_handles_empty_rank_shard(tmp_path): recipe = _make_recipe({"cache_embeddings_dir": str(tmp_path), "load_embeddings_from_cache": False}) recipe.cache_embeddings_dir = str(tmp_path) @@ 
-465,6 +596,58 @@ def write_empty_peer_shard(): assert query_embeddings.shape == (1, 2) +def test_encode_queries_sharded_recomputes_stale_local_query_shard(tmp_path): + recipe = _make_recipe({"cache_embeddings_dir": str(tmp_path), "load_embeddings_from_cache": True}) + recipe.cache_embeddings_dir = str(tmp_path) + recipe.load_embeddings_from_cache = True + recipe._reuse_partial_embedding_cache = True + recipe.query_embedding_batch_size = 2 + recipe.query_max_length = 16 + recipe.query_prefix = "query: " + recipe.questions = ["what is nvlink?"] + recipe.dist_env = SimpleNamespace(world_size=2, rank=0, is_main=True, device=torch.device("cpu")) + shard_dir = tmp_path / "query_shards" + shard_dir.mkdir() + np.savez(shard_dir / "queries_rank0000.npz", np.ones((2, 2), dtype=np.float32)) + recomputed = np.full((1, 2), 4.0, dtype=np.float32) + recipe._encode_texts = lambda **_: recomputed + + def write_empty_peer_shard(): + np.savez(shard_dir / "queries_rank0001.npz", np.empty((0, 0), dtype=np.float32)) + + recipe._synchronize_ranks = write_empty_peer_shard + + query_embeddings = recipe._encode_queries_sharded() + + np.testing.assert_array_equal(query_embeddings, recomputed) + + +def test_encode_queries_sharded_recomputes_corrupt_local_query_shard(tmp_path): + recipe = _make_recipe({"cache_embeddings_dir": str(tmp_path), "load_embeddings_from_cache": True}) + recipe.cache_embeddings_dir = str(tmp_path) + recipe.load_embeddings_from_cache = True + recipe._reuse_partial_embedding_cache = True + recipe.query_embedding_batch_size = 2 + recipe.query_max_length = 16 + recipe.query_prefix = "query: " + recipe.questions = ["what is nvlink?"] + recipe.dist_env = SimpleNamespace(world_size=2, rank=0, is_main=True, device=torch.device("cpu")) + shard_dir = tmp_path / "query_shards" + shard_dir.mkdir() + (shard_dir / "queries_rank0000.npz").write_text("corrupt") + recomputed = np.full((1, 2), 5.0, dtype=np.float32) + recipe._encode_texts = lambda **_: recomputed + + def 
write_empty_peer_shard(): + np.savez(shard_dir / "queries_rank0001.npz", np.empty((0, 0), dtype=np.float32)) + + recipe._synchronize_ranks = write_empty_peer_shard + + query_embeddings = recipe._encode_queries_sharded() + + np.testing.assert_array_equal(query_embeddings, recomputed) + + def test_encode_chunk_distributed_handles_empty_rank_shard(tmp_path): recipe = _make_recipe({"cache_embeddings_dir": str(tmp_path), "load_embeddings_from_cache": False}) recipe.cache_embeddings_dir = str(tmp_path) @@ -487,6 +670,30 @@ def write_empty_peer_shard(): assert (tmp_path / DOCUMENT_EMBEDDINGS_FNAME).exists() is False +def test_encode_chunk_distributed_recomputes_corrupt_local_rank_shard(tmp_path): + recipe = _make_recipe({"cache_embeddings_dir": str(tmp_path), "load_embeddings_from_cache": True}) + recipe.cache_embeddings_dir = str(tmp_path) + recipe.load_embeddings_from_cache = True + recipe._reuse_partial_embedding_cache = True + recipe.document_embedding_batch_size = 2 + recipe.passage_max_length = 16 + recipe.passage_prefix = "passage: " + recipe.dist_env = SimpleNamespace(world_size=2, rank=0, is_main=True, device=torch.device("cpu")) + cache_path = tmp_path / "chunk_0000.npz" + (tmp_path / "chunk_0000_rank0000.npz").write_text("corrupt") + recomputed = np.full((1, 2), 6.0, dtype=np.float32) + recipe._encode_texts = lambda **_: recomputed + + def write_empty_peer_shard(): + np.savez(tmp_path / "chunk_0000_rank0001.npz", np.empty((0, 0), dtype=np.float32)) + + recipe._synchronize_ranks = write_empty_peer_shard + + document_embeddings = recipe._encode_chunk_distributed(["NVLink is fast."], cache_path) + + np.testing.assert_array_equal(document_embeddings, recomputed) + + def test_mine_hard_negatives_drops_margin_filtered_candidates(): recipe = _make_recipe() recipe.dist_env = MagicMock(device=torch.device("cpu")) From 5a41c642485918c66916d0b51a41d0ab3ac35fc9 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 22 
May 2026 21:56:53 +0100 Subject: [PATCH 25/25] fix(retrieval): harden mining output handoff Signed-off-by: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> --- docs/guides/llm/retrieval-finetuning.md | 34 ++-- .../materialize_hf_retrieval_subset.py | 10 +- .../recipes/retrieval/mine_hard_negatives.py | 129 +++++++++++++-- .../test_materialize_hf_retrieval_subset.py | 48 ++++++ .../recipes/test_mine_hard_negatives.py | 151 ++++++++++++++++++ 5 files changed, 339 insertions(+), 33 deletions(-) diff --git a/docs/guides/llm/retrieval-finetuning.md b/docs/guides/llm/retrieval-finetuning.md index 1bc0ac435e..0724639267 100644 --- a/docs/guides/llm/retrieval-finetuning.md +++ b/docs/guides/llm/retrieval-finetuning.md @@ -238,7 +238,7 @@ dataloader: _target_: nemo_automodel.components.datasets.llm.retrieval_dataset.make_retrieval_dataset model_type: bi_encoder data_dir_list: - - /path/to/train.jsonl + - /path/to/train.json data_type: train n_passages: 5 seed: 42 @@ -310,7 +310,7 @@ dataloader: _target_: nemo_automodel.components.datasets.llm.retrieval_dataset.make_retrieval_dataset model_type: bi_encoder data_dir_list: - - /path/to/train.jsonl + - /path/to/train.json data_type: train n_passages: 5 seed: 42 @@ -372,10 +372,10 @@ tokenizer: dataloader: _target_: torchdata.stateful_dataloader.StatefulDataLoader dataset: - _target_: nemo_automodel.components.datasets.llm.retrieval_dataset_inline.make_retrieval_dataset + _target_: nemo_automodel.components.datasets.llm.retrieval_dataset.make_retrieval_dataset model_type: cross_encoder data_dir_list: - - /path/to/train.jsonl + - /path/to/train.json data_type: train n_passages: 5 seed: 42 @@ -601,7 +601,9 @@ uv run torchrun --standalone --nproc_per_node=8 examples/retrieval/data_utils/mi --mining.add_eos_token false ``` -For multi-node mining, replace `--standalone` with the same explicit rendezvous flags you use for multi-node training: +For multi-node mining, replace `--standalone` with the same explicit 
rendezvous flags you use for multi-node training. +Every rank must be able to read `model_name_or_path`, `tokenizer_name_or_path` if set, `train_qa_file_path`, and the +corpus path referenced by that JSON at the same filesystem paths: ```bash uv run torchrun \ @@ -627,7 +629,9 @@ Replace `epoch_0_step_499` with the explicit checkpoint directory that you want `LATEST.txt`, read it first and substitute the resolved `epoch_*_step_*` directory; the mining script loads the Hugging Face export directly and does not apply AutoModel's checkpoint resolver. The miner refuses to overwrite an existing `train_file_output_path` by default. Choose a new output path for each mining run, or pass -`--mining.overwrite_output true` only when replacing that file is intentional. +`--mining.overwrite_output true` only when replacing that file is intentional. If the output JSON is written to a +different directory from the input JSON, the miner rewrites relative `corpus` paths so retraining still resolves the +original corpus. Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: @@ -654,13 +658,15 @@ Key mining settings in `examples/retrieval/data_utils/mining_config.yaml`: - `cache_embeddings_dir`: required for distributed mining so ranks can share cached passage embeddings. Rank `0` assembles the final embedding cache and score outputs, so plan memory and local disk accordingly. In multi-node mining, this must be a shared writable path mounted at the same location on every node; node-local cache paths leave - rank `0` unable to read remote-rank shards. Use a fresh cache directory for each model, dataset, prefix, sequence - length, `corpus_chunk_size`, and world-size combination. The miner validates cache metadata and loads the - consolidated arrays to verify fingerprint, shape, and readability before reusing a consolidated cache. The fingerprint includes the mining input - file, local model/tokenizer path state, ordered document IDs/content, and embedding settings. 
Fresh run-specific - paths are still easier to reason about, especially for mutable Hub IDs or paths that are overwritten in place. Set `load_embeddings_from_cache: true` only when you intentionally want to reuse every cached - query shard, corpus chunk, and consolidated embedding file from the same - model/input/prefix/length/`corpus_chunk_size`/world-size run. + rank `0` unable to read remote-rank shards. +- Cache reuse: use a fresh cache directory for each model, dataset, prefix, sequence length, `corpus_chunk_size`, and + world-size combination. The miner validates cache metadata and loads the consolidated arrays to verify fingerprint, + shape, and readability before reusing a consolidated cache. The fingerprint includes the mining input file, local + model/tokenizer path state, ordered document IDs/content, and embedding settings. +- `load_embeddings_from_cache`: set this to `true` only when you intentionally want to reuse every cached query shard, + corpus chunk, and consolidated embedding file from the same + model/input/prefix/length/`corpus_chunk_size`/world-size run. Fresh run-specific paths are still easier to reason + about, especially for mutable Hub IDs or paths that are overwritten in place. `pooling` and `l2_normalize` are saved bi-encoder wrapper metadata, not `mining.*` config fields. Do not pass `--mining.pooling` or `--mining.l2_normalize`; the miner rejects unknown mining keys. Mine from a saved bi-encoder export @@ -679,7 +685,7 @@ Hard-negative mining parallelizes embedding generation across ranks, but the fin rank `0` and materializes the full document embedding matrix there. For very large corpora, use a smaller mining slice or a custom ANN/blockwise mining workflow instead of expecting this helper to scale to web-scale indexing. -Use the mined output as the next `data_dir_list` source for another bi-encoder pass or for cross-encoder training. 
If +Use the mined output as the next corpus-backed `data_dir_list` source for another bi-encoder pass or for cross-encoder training. If the previous run used multiple sources, list the mined file for each source. Hard negative mining excludes document IDs listed in each input row's `pos_doc`, but it cannot read an external qrels file or know every semantically relevant duplicate. Put all known positive IDs for the query in the mining input, deduplicate the corpus, inspect mined samples, diff --git a/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py b/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py index 08de08224e..97670b3277 100644 --- a/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py +++ b/examples/retrieval/data_utils/materialize_hf_retrieval_subset.py @@ -62,6 +62,7 @@ def main() -> int: tmp_corpus_dir = Path(tempfile.mkdtemp(prefix=f".{args.subset}_corpus.", dir=output_dir)) tmp_train_path = None backup_corpus_dir = None + committed = False try: doc_rows = [] for doc_id in corpus_info.get_all_ids(): @@ -93,18 +94,17 @@ def main() -> int: tmp_corpus_dir = None tmp_train_path.replace(output_dir / "train.json") tmp_train_path = None - except Exception: - if backup_corpus_dir is not None and backup_corpus_dir.exists(): + committed = True + finally: + if not committed and backup_corpus_dir is not None and backup_corpus_dir.exists(): if corpus_dir.exists(): shutil.rmtree(corpus_dir) shutil.move(str(backup_corpus_dir), str(corpus_dir)) - raise - finally: if tmp_corpus_dir is not None and tmp_corpus_dir.exists(): shutil.rmtree(tmp_corpus_dir) if tmp_train_path is not None and tmp_train_path.exists(): tmp_train_path.unlink() - if backup_corpus_dir is not None and backup_corpus_dir.exists(): + if committed and backup_corpus_dir is not None and backup_corpus_dir.exists(): shutil.rmtree(backup_corpus_dir) logger.info("Wrote %s records and %s corpus documents to %s", len(data_list), len(doc_rows), output_dir) diff --git 
a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py index 2ace77e6e9..d8b647915b 100644 --- a/nemo_automodel/recipes/retrieval/mine_hard_negatives.py +++ b/nemo_automodel/recipes/retrieval/mine_hard_negatives.py @@ -20,6 +20,8 @@ import json import logging import math +import os +import tempfile from pathlib import Path from typing import Any, Dict, List, Optional, Tuple @@ -641,7 +643,9 @@ def _encode_queries_sharded(self) -> np.ndarray: # Compute or load this rank's shard local_expected = local_end - local_start - local_embeds = self._load_partial_cache_shard(shard_path, local_expected, "query shard") + local_embeds = self._load_partial_cache_shard( + shard_path, local_expected, "query shard", self._get_cached_embedding_width("query_shape") + ) if local_embeds is None: local_texts = self.questions[local_start:local_end] local_embeds = self._encode_texts( @@ -679,8 +683,9 @@ def _load_partial_cache_shard( cache_path: Path, expected_size: Optional[int] = None, label: str = "embedding shard", + expected_width: Optional[int] = None, ) -> Optional[np.ndarray]: - """Load a reusable partial cache shard, validating readability and row count.""" + """Load a reusable partial cache shard, validating readability and shape.""" if not self._should_load_partial_cache(cache_path): return None @@ -690,6 +695,10 @@ def _load_partial_cache_shard( logger.warning("Ignoring unreadable %s cache %s: %s", label, cache_path, exc) return None + if cached_shard.ndim != 2: + logger.info("Cached %s %s is %sD, expected 2D; recomputing shard.", label, cache_path, cached_shard.ndim) + return None + if expected_size is not None and cached_shard.shape[0] != expected_size: logger.info( "Cached %s %s has %s rows, expected %s; recomputing shard.", @@ -700,11 +709,34 @@ def _load_partial_cache_shard( ) return None + if expected_width is not None and expected_size != 0 and cached_shard.shape[1] != expected_width: + logger.info( + "Cached 
%s %s has embedding width %s, expected %s; recomputing shard.", + label, + cache_path, + cached_shard.shape[1], + expected_width, + ) + return None + return cached_shard + def _get_cached_embedding_width(self, shape_key: str) -> Optional[int]: + """Return expected embedding width from matching cache metadata when available.""" + metadata = self._load_cache_metadata() + if metadata is None or metadata.get("version") != EMBEDDINGS_CACHE_METADATA_VERSION: + return None + shape = metadata.get(shape_key) + if not isinstance(shape, list) or len(shape) != 2: + return None + width = shape[1] + return width if isinstance(width, int) and width > 0 else None + def _load_cached_chunk(self, cache_path: Path, expected_size: Optional[int] = None) -> Optional[np.ndarray]: """Load a fully-assembled chunk cache if it exists and matches the current chunk.""" - cached_chunk = self._load_partial_cache_shard(cache_path, expected_size, "document chunk") + cached_chunk = self._load_partial_cache_shard( + cache_path, expected_size, "document chunk", self._get_cached_embedding_width("document_shape") + ) if cached_chunk is None: return None if self.dist_env.world_size > 1 and not self.dist_env.is_main: @@ -740,7 +772,9 @@ def _encode_chunk_distributed( # Compute or load this rank's slice local_expected = local_end - local_start - local_embeds = self._load_partial_cache_shard(rank_cache_path, local_expected, "document rank shard") + local_embeds = self._load_partial_cache_shard( + rank_cache_path, local_expected, "document rank shard", self._get_cached_embedding_width("document_shape") + ) if local_embeds is None: local_texts = texts[local_start:local_end] local_embeds = self._encode_texts( @@ -815,10 +849,26 @@ def _encode_documents_chunk( Returns: numpy array of document embeddings [num_docs, embedding_dim]. 
""" - # Fast path: load from cache if available - cached_result = self._load_cached_chunk(cache_path, expected_size=len(doc_indices)) - if cached_result is not None: - return cached_result + # Fast path: load from cache if available. In initialized distributed runs, rank0 decides whether the + # assembled chunk cache is reusable and broadcasts that decision so every rank takes the same branch. + cached_result = None + if self.dist_env.world_size > 1 and torch.distributed.is_initialized(): + cache_hit = False + if self.dist_env.is_main: + cached_result = self._load_cached_chunk(cache_path, expected_size=len(doc_indices)) + cache_hit = cached_result is not None + flag_device = self.dist_env.device if self.dist_env.device.type == "cuda" else torch.device("cpu") + flag = torch.tensor([1 if cache_hit else 0], dtype=torch.int64, device=flag_device) + torch.distributed.broadcast(flag, src=0) + cache_hit = bool(flag.item()) + if cache_hit: + if self.dist_env.is_main: + return cached_result + return np.empty((0, 0), dtype=np.float32) + else: + cached_result = self._load_cached_chunk(cache_path, expected_size=len(doc_indices)) + if cached_result is not None: + return cached_result # Fetch document texts doc_ids = [self.idx_to_doc[idx] for idx in doc_indices] @@ -1386,7 +1436,9 @@ def _mine_hard_negatives( downscore_mask = downscore_mask & apply_margin_tensor batch_scores[downscore_mask] = float("-inf") - # Batch-level top-k selection + # Batch-level top-k selection. Positives, non-finite scores, and margin-filtered candidates are already + # set to -inf, so the initial buffer should normally contain enough valid negatives. The per-query fallback + # below expands to all documents only when the initial window is too small. 
k = min(num_negs * TOPK_BUFFER_MULTIPLIER, batch_scores.shape[1]) topk = batch_scores.topk(k=k, dim=1) topk_indices = topk.indices.tolist() @@ -1405,6 +1457,18 @@ def _mine_hard_negatives( hard_neg_candidates.append(idx) hard_neg_scores.append(score) + if len(hard_neg_candidates) < num_negs and k < batch_scores.shape[1]: + expanded_topk = batch_scores[i].topk(k=batch_scores.shape[1], dim=0) + hard_neg_candidates = [] + hard_neg_scores = [] + for idx, score in zip(expanded_topk.indices.tolist(), expanded_topk.values.tolist()): + if idx in pos_set or not math.isfinite(score): + continue + hard_neg_candidates.append(idx) + hard_neg_scores.append(score) + if len(hard_neg_candidates) == num_negs: + break + # Limit to num_negs neg_indices_all.append(hard_neg_candidates[:num_negs]) neg_scores_all.append(hard_neg_scores[:num_negs]) @@ -1493,6 +1557,25 @@ def _build_positive_scores_by_question_id(self) -> Dict[str, List[float | None]] """ return {question_id: scores for question_id, scores in zip(self.question_ids, self.pos_scores)} + def _rewrite_corpus_paths_for_output(self, corpus_config: Any, output_path: Path) -> Any: + """Rewrite relative corpus paths so they still point to the input corpus from the output JSON location.""" + input_dir = Path(self.train_qa_file_path).expanduser().resolve(strict=False).parent + output_dir = output_path.expanduser().resolve(strict=False).parent + + def rewrite_entry(entry: Any) -> Any: + if not isinstance(entry, dict): + return entry + rewritten = dict(entry) + corpus_path = rewritten.get("path") + if isinstance(corpus_path, str) and corpus_path and not os.path.isabs(corpus_path): + source_corpus_path = (input_dir / corpus_path).resolve(strict=False) + rewritten["path"] = os.path.relpath(source_corpus_path, output_dir) + return rewritten + + if isinstance(corpus_config, list): + return [rewrite_entry(entry) for entry in corpus_config] + return rewrite_entry(corpus_config) + def _write_output(self) -> None: """Write the output JSON file 
with mined hard negatives. @@ -1505,6 +1588,8 @@ def _write_output(self) -> None: """ import json + output_path = Path(self.train_file_output_path) + # Load original input file (preserves all top-level keys like corpus) with open(self.train_qa_file_path, "r") as f: output = json.load(f) @@ -1513,8 +1598,10 @@ def _write_output(self) -> None: neg_docs_by_qid = self._build_negative_docs_by_question_id() pos_scores_by_qid = self._build_positive_scores_by_question_id() - # Add mining metadata + # Add mining metadata and keep relative corpus references valid from the output JSON location. output["mining"] = {"args": self._get_mining_args_dict()} + if "corpus" in output: + output["corpus"] = self._rewrite_corpus_paths_for_output(output["corpus"], output_path) # Iterate through original rows when available so custom row metadata round-trips. source_rows = output.get("data") or self.questions_dataset @@ -1548,7 +1635,6 @@ def _write_output(self) -> None: output["data"].append(row) # Ensure output directory exists - output_path = Path(self.train_file_output_path) input_path = Path(self.train_qa_file_path).expanduser().resolve(strict=False) resolved_output_path = output_path.expanduser().resolve(strict=False) if resolved_output_path == input_path: @@ -1560,9 +1646,24 @@ def _write_output(self) -> None: ) output_path.parent.mkdir(parents=True, exist_ok=True) - # Write output file with formatting - with open(output_path, "w") as f: - json.dump(output, f, indent=4, ensure_ascii=False) + # Write output file with formatting. Write through a same-directory temp file so overwrite runs keep the + # previous mined dataset intact if serialization or the filesystem fails mid-write. 
+ tmp_output_path = None + try: + with tempfile.NamedTemporaryFile( + "w", + dir=output_path.parent, + prefix=f".{output_path.name}.", + suffix=".tmp", + delete=False, + ) as f: + tmp_output_path = Path(f.name) + json.dump(output, f, indent=4, ensure_ascii=False) + tmp_output_path.replace(output_path) + tmp_output_path = None + finally: + if tmp_output_path is not None and tmp_output_path.exists(): + tmp_output_path.unlink() logger.info(f"Output written to {output_path}") diff --git a/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py b/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py index 1d31a9fddc..94850f8d30 100644 --- a/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py +++ b/tests/unit_tests/datasets/llm/test_materialize_hf_retrieval_subset.py @@ -154,3 +154,51 @@ def raise_load_error(repo_id, subset): assert existing_train_json.read_text() == '{"corpus": [{"path": "./FEVER_corpus"}], "data": []}' assert (stale_corpus / "train.parquet").read_text() == "still valid" + + +def test_materialize_hf_retrieval_subset_overwrite_restores_existing_corpus_on_interruption(tmp_path, monkeypatch): + existing_train_json = tmp_path / "train.json" + existing_train_json.write_text('{"corpus": [{"path": "./FEVER_corpus"}], "data": []}') + stale_corpus = tmp_path / "FEVER_corpus" + stale_corpus.mkdir() + (stale_corpus / "train.parquet").write_text("still valid") + data_list = [ + { + "question_id": "q0", + "question": "What is the document?", + "corpus_id": "demo", + "pos_doc": [{"id": "d0"}], + "neg_doc": [], + } + ] + monkeypatch.setattr( + materialize_hf_retrieval_subset, + "_load_hf_subset", + lambda repo_id, subset: (data_list, _FakeCorpusInfo()), + ) + original_replace = materialize_hf_retrieval_subset.Path.replace + + def interrupt_train_replace(self, target): + if self.name.startswith(".train."): + raise KeyboardInterrupt("stop during commit") + return original_replace(self, target) + + 
monkeypatch.setattr(materialize_hf_retrieval_subset.Path, "replace", interrupt_train_replace) + monkeypatch.setattr( + sys, + "argv", + [ + "materialize_hf_retrieval_subset.py", + "nvidia/embed-nemotron-dataset-v1", + "FEVER", + str(tmp_path), + "--overwrite", + ], + ) + + with pytest.raises(KeyboardInterrupt, match="stop during commit"): + materialize_hf_retrieval_subset.main() + + assert existing_train_json.read_text() == '{"corpus": [{"path": "./FEVER_corpus"}], "data": []}' + assert (stale_corpus / "train.parquet").read_text() == "still valid" + assert not list(tmp_path.glob(".FEVER_corpus.backup.*")) diff --git a/tests/unit_tests/recipes/test_mine_hard_negatives.py b/tests/unit_tests/recipes/test_mine_hard_negatives.py index 9e1c74b1c5..111fb0c40a 100644 --- a/tests/unit_tests/recipes/test_mine_hard_negatives.py +++ b/tests/unit_tests/recipes/test_mine_hard_negatives.py @@ -15,6 +15,7 @@ """Unit tests for MineHardNegativesRecipe — attn_implementation support.""" import json +import os from types import SimpleNamespace from unittest.mock import MagicMock, patch @@ -327,6 +328,102 @@ def test_write_output_preserves_custom_row_metadata(tmp_path): assert output["data"][0]["pos_doc"] == [{"id": "p1", "score": 0.9}] +def test_write_output_rewrites_relative_corpus_path_for_new_output_dir(tmp_path): + input_dir = tmp_path / "source" + output_dir = tmp_path / "mined" + input_dir.mkdir() + output_dir.mkdir() + (input_dir / "FEVER_corpus").mkdir() + input_file = input_dir / "train.json" + output_file = output_dir / "train.json" + input_file.write_text( + json.dumps( + { + "corpus": [{"path": "./FEVER_corpus"}], + "data": [ + { + "question_id": "q0", + "question": "Which doc is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": "p1"}], + "neg_doc": [], + } + ], + } + ) + ) + recipe = _make_recipe( + { + "train_qa_file_path": str(input_file), + "train_file_output_path": str(output_file), + } + ) + recipe.train_qa_file_path = str(input_file) + 
recipe.train_file_output_path = str(output_file) + recipe.questions_dataset = json.loads(input_file.read_text())["data"] + recipe._build_negative_docs_by_question_id = lambda: {"q0": []} + recipe._build_positive_scores_by_question_id = lambda: {"q0": [0.9]} + recipe._get_mining_args_dict = lambda: {} + + recipe._write_output() + + output = json.loads(output_file.read_text()) + corpus_path = output["corpus"][0]["path"] + assert os.path.normpath(corpus_path) == os.path.normpath("../source/FEVER_corpus") + assert (output_dir / corpus_path).resolve() == (input_dir / "FEVER_corpus").resolve() + + +def test_write_output_uses_atomic_replace_on_dump_failure(tmp_path, monkeypatch): + input_file = tmp_path / "input.json" + output_file = tmp_path / "output.json" + input_file.write_text( + json.dumps( + { + "data": [ + { + "question_id": "q0", + "question": "Which doc is positive?", + "corpus_id": "demo", + "pos_doc": [{"id": "p1"}], + "neg_doc": [], + } + ] + } + ) + ) + output_file.write_text("existing mined data") + recipe = _make_recipe( + { + "train_qa_file_path": str(input_file), + "train_file_output_path": str(output_file), + "overwrite_output": True, + } + ) + recipe.train_qa_file_path = str(input_file) + recipe.train_file_output_path = str(output_file) + recipe.overwrite_output = True + recipe.questions_dataset = json.loads(input_file.read_text())["data"] + recipe._build_negative_docs_by_question_id = lambda: {"q0": []} + recipe._build_positive_scores_by_question_id = lambda: {"q0": [0.9]} + recipe._get_mining_args_dict = lambda: {} + + original_dump = json.dump + + def failing_dump(obj, fp, *args, **kwargs): + if isinstance(obj, dict) and "mining" in obj: + fp.write("partial") + raise OSError("disk full") + return original_dump(obj, fp, *args, **kwargs) + + monkeypatch.setattr(json, "dump", failing_dump) + + with pytest.raises(OSError, match="disk full"): + recipe._write_output() + + assert output_file.read_text() == "existing mined data" + assert not 
list(tmp_path.glob(".output.json.*.tmp")) + + def test_write_output_removes_stale_positive_score_for_non_finite_current_score(tmp_path): input_file = tmp_path / "input.json" output_file = tmp_path / "output.json" @@ -471,6 +568,36 @@ def test_load_partial_cache_shard_rejects_unreadable_npz(tmp_path): assert recipe._load_partial_cache_shard(cache_path, expected_size=1, label="document rank shard") is None +def test_load_partial_cache_shard_rejects_wrong_ndim(tmp_path): + recipe = _make_cache_ready_recipe(tmp_path) + recipe._reuse_partial_embedding_cache = True + cache_path = tmp_path / "queries_rank0000.npz" + np.savez(cache_path, np.ones((1, 2, 3), dtype=np.float32)) + + assert recipe._load_partial_cache_shard(cache_path, expected_size=1, label="query shard") is None + + +def test_load_partial_cache_shard_rejects_width_mismatch_from_metadata(tmp_path): + recipe = _make_cache_ready_recipe(tmp_path) + recipe._reuse_partial_embedding_cache = True + cache_path = tmp_path / "queries_rank0000.npz" + np.savez(cache_path, np.ones((1, 2), dtype=np.float32)) + recipe._write_cache_metadata( + query_embeddings=np.ones((1, 3), dtype=np.float32), + document_embeddings=np.ones((1, 3), dtype=np.float32), + ) + + assert ( + recipe._load_partial_cache_shard( + cache_path, + expected_size=1, + label="query shard", + expected_width=recipe._get_cached_embedding_width("query_shape"), + ) + is None + ) + + def test_full_embeddings_cache_requires_matching_metadata(tmp_path): recipe = _make_cache_ready_recipe(tmp_path) query_embeddings = np.ones((1, 2), dtype=np.float32) @@ -694,6 +821,30 @@ def write_empty_peer_shard(): np.testing.assert_array_equal(document_embeddings, recomputed) +def test_encode_documents_chunk_uses_broadcast_cache_hit_on_non_main_rank(tmp_path, monkeypatch): + recipe = _make_cache_ready_recipe(tmp_path) + recipe.dist_env = SimpleNamespace(world_size=2, rank=1, is_main=False, device=torch.device("cpu")) + cache_path = tmp_path / "corpus_chunks" / "chunk_0000.npz" + 
cache_path.parent.mkdir() + np.savez(cache_path, np.ones((1, 2), dtype=np.float32)) + recipe._write_cache_metadata( + query_embeddings=np.ones((1, 2), dtype=np.float32), + document_embeddings=np.ones((1, 2), dtype=np.float32), + ) + recipe._encode_chunk_distributed = MagicMock(side_effect=AssertionError("non-main rank should follow cache hit")) + monkeypatch.setattr(torch.distributed, "is_initialized", lambda: True) + + def broadcast_cache_hit(flag, src): + flag.fill_(1) + + monkeypatch.setattr(torch.distributed, "broadcast", broadcast_cache_hit) + + embeddings = recipe._encode_documents_chunk([0], cache_path) + + assert embeddings.shape == (0, 0) + recipe._encode_chunk_distributed.assert_not_called() + + def test_mine_hard_negatives_drops_margin_filtered_candidates(): recipe = _make_recipe() recipe.dist_env = MagicMock(device=torch.device("cpu"))