NVIDIA-NeMo · memadi-nv · May 29, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
@@ -22,8 +22,8 @@ export NVIDIA_API_KEY="your-nvidia-api-key"
 | Alias | Model | Used by |
 |-------|-------|---------|
 | `gliner-pii-detector` | [`nvidia/gliner-pii`](https://build.nvidia.com/nvidia/gliner-pii) | Entity detection (NER) |
-| `gpt-oss-120b` | [`openai/gpt-oss-120b`](https://build.nvidia.com/openai/gpt-oss-120b) | Detection validation & augmentation, replacement, rewriting |
-| `nemotron-30b-thinking` | [`nvidia/nemotron-3-nano-30b-a3b`](https://build.nvidia.com/nvidia/nemotron-3-nano-30b-a3b) | Latent detection, evaluation, final judge |
+| `gpt-oss-120b` | [`openai/gpt-oss-120b`](https://build.nvidia.com/openai/gpt-oss-120b) | Detection validation & augmentation, replacement, replace evaluation, rewriting |
+| `nemotron-30b-thinking` | [`nvidia/nemotron-3-nano-30b-a3b`](https://build.nvidia.com/nvidia/nemotron-3-nano-30b-a3b) | Latent detection, rewrite evaluation, final judge |
 
 Each pipeline stage has a **role** mapped to one of these aliases. See the full role list in the default configs: [`detection.yaml`](https://github.com/NVIDIA-NeMo/Anonymizer/blob/main/src/anonymizer/config/default_model_configs/detection.yaml), [`replace.yaml`](https://github.com/NVIDIA-NeMo/Anonymizer/blob/main/src/anonymizer/config/default_model_configs/replace.yaml), [`rewrite.yaml`](https://github.com/NVIDIA-NeMo/Anonymizer/blob/main/src/anonymizer/config/default_model_configs/rewrite.yaml).
 

@@ -115,6 +115,17 @@
 # %%
 result.dataframe.head()
 
+# %% [markdown]
+# ## 📊 (Optional) Evaluate replacement quality
+#
+# - `evaluate()` is a separate, opt-in step that scores the output with LLM-as-judge metrics.
+# - For Substitute, all four metrics run: **Detection Validity**, **Type Fidelity**, **Relational Consistency**, **Attribute Fidelity**.
+# - Skip it for routine runs; call it when you want LLM-side confidence on the output. Evaluation costs LLM calls per record, so try it on `preview` first.
+
+# %%
+evaluated = anonymizer.evaluate(preview)
+evaluated.display_record(0)
+
 # %% [markdown]
 # ## ⏭️ Next steps
 #

@@ -188,6 +188,16 @@
 else:
     print("No failed records.")
 
+# %% [markdown]
+# ## 📊 (Optional) Score the detections with an LLM judge
+#
+# - `evaluate()` is a separate, opt-in step that runs LLM-as-judge metrics on the output.
+# - This notebook uses Annotate, so only **Detection Validity** runs — it flags entities the detector got wrong (false positives, mislabels, boundary errors). Substitute would also enable Type Fidelity, Relational Consistency, and Attribute Fidelity.
+
+# %%
+evaluated = anonymizer.evaluate(result)
+evaluated.display_record(0)
+
 # %% [markdown]
 # ## ⏭️ Next steps
 #

@@ -218,6 +218,19 @@
 hash_custom_preview.display_record(0)
 
 
+# %% [markdown]
+# ## 📊 (Optional) Evaluate each strategy
+#
+# - `evaluate()` is a separate, opt-in step that scores the output with LLM-as-judge metrics. Which metrics fire depends on the strategy:
+#   - **Substitute** → 4 metrics (Detection Validity + Type Fidelity + Relational Consistency + Attribute Fidelity).
+#   - **Redact / Annotate / Hash** → Detection Validity only (no replacement map to score type/relational/attribute against).
+# - The cell below runs it on the Substitute preview to surface all four metrics; the same call works on `redact_preview`, `annotate_preview`, or `hash_preview`.
+
+# %%
+substitute_evaluated = anonymizer.evaluate(substitute_preview)
+substitute_evaluated.display_record(0)
+
+
 # %% [markdown]
 # ## ⏭️ Next steps
 #

@@ -9,7 +9,14 @@
 
 from data_designer.config.models import ModelProvider as ModelProvider
 
-from anonymizer.config.anonymizer_config import AnonymizerConfig, AnonymizerInput, Detect, Rewrite, RiskTolerance
+from anonymizer.config.anonymizer_config import (
+    AnonymizerConfig,
+    AnonymizerInput,
+    Detect,
+    EvaluateConfig,
+    Rewrite,
+    RiskTolerance,
+)
 from anonymizer.config.replace_strategies import Annotate, Hash, Redact, Substitute
 from anonymizer.config.rewrite import PrivacyGoal
 from anonymizer.engine.constants import DEFAULT_ENTITY_LABELS as _DEFAULT_ENTITY_LABELS
@@ -35,6 +42,7 @@
     "Annotate",
     "DEFAULT_ENTITY_LABELS",
     "Detect",
+    "EvaluateConfig",
     "Hash",
     "InvalidConfigError",
     "InvalidInputError",

@@ -197,3 +197,20 @@ def validate_exactly_one_mode(self) -> AnonymizerConfig:
                 " Use replace=Redact() for entity replacement, or rewrite=Rewrite() for LLM rewriting."
             )
         return self
+
+
+class EvaluateConfig(BaseModel):
+    """Optional knobs for :meth:`Anonymizer.evaluate`.
+
+    Reserved for genuinely evaluation-specific configuration — metric selection,
+    per-judge model/prompt overrides, scoring thresholds, etc. The anonymization
+    mode is **not** here: it travels on the ``AnonymizerResult`` /
+    ``PreviewResult`` produced by ``run()`` / ``preview()`` and is read directly
+    by ``evaluate()``, so users don't restate it and can't mis-state it.
+
+    Today this is an empty placeholder; fields will be added as evaluation
+    knobs are introduced.
+    """
+
+    # Intentionally empty for now. New fields land here as evaluation
+    # configurability is introduced.
@@ -0,0 +1,12 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Model aliases for the LLM-as-judge evaluation step (Anonymizer.evaluate).
+# These roles are NOT needed at anonymization time — preview() / run() do not
+# consume them. They are only resolved when the user opts into evaluation.
+
+selected_models:
+  detection_validity_judge: gpt-oss-120b
+  replace_type_fidelity_judge: gpt-oss-120b
+  replace_relational_consistency_judge: gpt-oss-120b
+  replace_attribute_fidelity_judge: gpt-oss-120b
@@ -93,9 +93,25 @@ class RewriteModelSelection(BaseModel):
     judge: str
 
 
+class EvaluateModelSelection(BaseModel):
+    """Model aliases for the LLM-as-judge evaluation step.
+
+    These roles are only consumed by :meth:`Anonymizer.evaluate` — they are not
+    needed at anonymization time. Keeping them in their own section lets
+    ``preview()`` / ``run()`` validate only the roles that produce anonymized
+    output, while ``evaluate(...)`` validates the roles that score it.
+    """
+
+    detection_validity_judge: str
+    replace_type_fidelity_judge: str
+    replace_relational_consistency_judge: str
+    replace_attribute_fidelity_judge: str
+
+
 class ModelSelection(BaseModel):
     """Model alias selections for all pipelines, loaded from YAML defaults via ``load_default_model_selection()``."""
 
     detection: DetectionModelSelection
     replace: ReplaceModelSelection
     rewrite: RewriteModelSelection
+    evaluate: EvaluateModelSelection
@@ -59,6 +59,28 @@
 # Final output
 COL_FINAL_ENTITIES = "final_entities"
 
+# Replace evaluation: detection-validity judge
+COL_DETECTION_JUDGE = "_detection_judge"  # raw judge output, internal
+COL_DETECTION_VALID = "detection_valid"  # user-facing bool (None if judge unavailable)
+COL_DETECTION_INVALID_ENTITIES = "detection_invalid_entities"  # user-facing list of {value, label, reasoning}
+
+# Replace evaluation: type-fidelity judge (Substitute only)
+COL_TYPE_FIDELITY_JUDGE = "_type_fidelity_judge"  # raw judge output, internal
+COL_TYPE_FIDELITY_VALID = "type_fidelity_valid"  # user-facing bool (None if judge unavailable)
+COL_TYPE_FIDELITY_INVALID_REPLACEMENTS = (
+    "type_fidelity_invalid_replacements"  # list of {original, label, synthetic, reasoning}
+)
+
+# Replace evaluation: relational-consistency judge (Substitute only)
+COL_RELATIONAL_CONSISTENCY_JUDGE = "_relational_consistency_judge"  # raw judge output (kept for display denominator)
+COL_RELATIONAL_CONSISTENCY_VALID = "relational_consistency_valid"  # user-facing bool (None if judge unavailable)
+COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS = "relational_consistency_invalid_relations"  # list of failing relations
+
+# Replace evaluation: attribute-fidelity judge (Substitute only)
+COL_ATTRIBUTE_FIDELITY_JUDGE = "_attribute_fidelity_judge"  # raw judge output (kept for display denominator)
+COL_ATTRIBUTE_FIDELITY_VALID = "attribute_fidelity_valid"  # user-facing bool (None if judge unavailable)
+COL_ATTRIBUTE_FIDELITY_INVALID_ENTITIES = "attribute_fidelity_invalid_entities"  # list of failing per-entity checks
+
 # ---------------------------------------------------------------------------
 # Rewrite pipeline
 # ---------------------------------------------------------------------------

@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
		# SPDX-License-Identifier: Apache-2.0