\n",
@@ -1150,14 +1150,14 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "[13:21:43] [INFO] 👀 Preview mode: 📂 Loaded 3 records from https://raw.githubusercontent.com/NVIDIA-NeMo/Anonymizer/refs/heads/main/docs/data/NVIDIA_synthetic_biographies.csv (column: 'biography')\n"
+ "[13:21:43] [INFO] \ud83d\udc40 Preview mode: \ud83d\udcc2 Loaded 3 records from https://raw.githubusercontent.com/NVIDIA-NeMo/Anonymizer/refs/heads/main/docs/data/NVIDIA_synthetic_biographies.csv (column: 'biography')\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "[13:21:43] [INFO] 🔍 Running entity detection on 3 records\n"
+ "[13:21:43] [INFO] \ud83d\udd0d Running entity detection on 3 records\n"
]
},
{
@@ -1171,7 +1171,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "[13:22:18] [INFO] |-- 📋 Detection complete — 76 entities found across 3 records (0 failed) [34.9s]\n"
+ "[13:22:18] [INFO] |-- \ud83d\udccb Detection complete \u2014 76 entities found across 3 records (0 failed) [34.9s]\n"
]
},
{
@@ -1185,21 +1185,21 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "[13:22:18] [INFO] 🔄 Running Hash replacement\n"
+ "[13:22:18] [INFO] \ud83d\udd04 Running Hash replacement\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "[13:22:18] [INFO] |-- 📋 Replacement complete (0 failed) [0.0s]\n"
+ "[13:22:18] [INFO] |-- \ud83d\udccb Replacement complete (0 failed) [0.0s]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
- "[13:22:18] [INFO] 🎉 Pipeline complete — 3 records processed, 0 total failures\n"
+ "[13:22:18] [INFO] \ud83c\udf89 Pipeline complete \u2014 3 records processed, 0 total failures\n"
]
},
{
@@ -1213,15 +1213,15 @@
"
+ {detection_judge_html}
+ {type_fidelity_section}
+ {attribute_fidelity_section}
+ {relational_consistency_section}
Replacement Map
diff --git a/src/anonymizer/interface/results.py b/src/anonymizer/interface/results.py
index 9a0259f4..fa97c983 100644
--- a/src/anonymizer/interface/results.py
+++ b/src/anonymizer/interface/results.py
@@ -7,6 +7,7 @@
import pandas as pd
+from anonymizer.config.replace_strategies import ReplaceMethod
from anonymizer.engine.ndd.adapter import FailedRecord
from anonymizer.interface.display import render_record_html
@@ -54,12 +55,17 @@ class AnonymizerResult(_DisplayMixin):
to avoid colliding with an Anonymizer output column, in which case
it is the post-rename identifier (e.g. ``"final_entities__input"``).
failed_records: Records that failed during pipeline processing.
+ replace_method: The replace strategy that produced this result. Set by
+ ``run()`` / ``preview()``; consumed by ``evaluate()`` to dispatch the
+ right judges. ``None`` on results that were constructed by hand or
+ loaded from a pre-strategy-tracking format.
"""
dataframe: pd.DataFrame
trace_dataframe: pd.DataFrame
resolved_text_column: str
failed_records: list[FailedRecord]
+ replace_method: ReplaceMethod | None = None
_display_cycle_index: int = field(default=0, init=False, repr=False)
def __repr__(self) -> str:
@@ -86,6 +92,10 @@ class PreviewResult(_DisplayMixin):
it is the post-rename identifier (e.g. ``"final_entities__input"``).
failed_records: Records that failed during pipeline processing.
preview_num_records: Number of records requested for the preview.
+ replace_method: The replace strategy that produced this preview. Set by
+ ``preview()``; consumed by ``evaluate()`` to dispatch the right
+ judges. ``None`` on results that were constructed by hand or loaded
+ from a pre-strategy-tracking format.
"""
dataframe: pd.DataFrame
@@ -93,6 +103,7 @@ class PreviewResult(_DisplayMixin):
resolved_text_column: str
failed_records: list[FailedRecord]
preview_num_records: int
+ replace_method: ReplaceMethod | None = None
_display_cycle_index: int = field(default=0, init=False, repr=False)
def __repr__(self) -> str:
diff --git a/tests/conftest.py b/tests/conftest.py
index 879cc98a..8d374c95 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -13,6 +13,7 @@
from anonymizer.config.anonymizer_config import AnonymizerConfig
from anonymizer.config.models import (
DetectionModelSelection,
+ EvaluateModelSelection,
ModelSelection,
ReplaceModelSelection,
RewriteModelSelection,
@@ -79,6 +80,11 @@ def stub_rewrite_model_selection() -> RewriteModelSelection:
return load_default_model_selection().rewrite
+@pytest.fixture
+def stub_evaluate_model_selection() -> EvaluateModelSelection:
+ return load_default_model_selection().evaluate
+
+
@pytest.fixture
def stub_slim_model_selection() -> ModelSelection:
"""Selection model where every role points to the same known alias."""
@@ -100,6 +106,12 @@ def stub_slim_model_selection() -> ModelSelection:
repairer="known",
judge="known",
),
+ evaluate=EvaluateModelSelection(
+ detection_validity_judge="known",
+ replace_type_fidelity_judge="known",
+ replace_relational_consistency_judge="known",
+ replace_attribute_fidelity_judge="known",
+ ),
)
diff --git a/tests/engine/evaluation/__init__.py b/tests/engine/evaluation/__init__.py
new file mode 100644
index 00000000..1a8431c3
--- /dev/null
+++ b/tests/engine/evaluation/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/engine/evaluation/replace/__init__.py b/tests/engine/evaluation/replace/__init__.py
new file mode 100644
index 00000000..1a8431c3
--- /dev/null
+++ b/tests/engine/evaluation/replace/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/engine/evaluation/replace/test_attribute_fidelity_judge.py b/tests/engine/evaluation/replace/test_attribute_fidelity_judge.py
new file mode 100644
index 00000000..78902f9d
--- /dev/null
+++ b/tests/engine/evaluation/replace/test_attribute_fidelity_judge.py
@@ -0,0 +1,269 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pandas as pd
+from data_designer.config.column_configs import LLMStructuredColumnConfig
+
+from anonymizer.config.models import EvaluateModelSelection
+from anonymizer.engine.constants import (
+ COL_ATTRIBUTE_FIDELITY_INVALID_ENTITIES,
+ COL_ATTRIBUTE_FIDELITY_JUDGE,
+ COL_ATTRIBUTE_FIDELITY_VALID,
+ COL_REPLACEMENT_MAP,
+)
+from anonymizer.engine.evaluation.replace.attribute_fidelity_judge import (
+ AttributeFidelityJudgeWorkflow,
+ AttributeFidelityJudgmentSchema,
+ _flatten_judgment,
+ _judge_prompt,
+ _replacements_for_judge,
+)
+
+# ---------------------------------------------------------------------------
+# Tests: _judge_prompt
+# ---------------------------------------------------------------------------
+
+
+def test_judge_prompt_uses_xml_sections() -> None:
+    prompt = _judge_prompt()
+    for tag in ("scope", "replacements", "task", "salient_attributes_by_label", "rules"):
+        assert f"<{tag}>" in prompt  # every section must be opened ...
+        assert f"</{tag}>" in prompt  # ... and closed; a bare f"{tag}>" check is implied by the opening-tag assert
+
+
+def test_judge_prompt_iterates_replacement_triples() -> None:
+ prompt = _judge_prompt()
+ assert "for entry in _replacements_for_attribute_fidelity_judge" in prompt
+ assert "entry.original" in prompt
+ assert "entry.label" in prompt
+ assert "entry.synthetic" in prompt
+
+
+def test_judge_prompt_carves_out_neighbouring_metrics() -> None:
+ """Prompt must explicitly declare type fidelity and cross-entity coherence as out of scope."""
+ prompt = _judge_prompt()
+ assert "DIFFERENT metric" in prompt
+ assert "city <-> state" in prompt # mentions cross-entity case as out of scope
+
+
+def test_judge_prompt_scopes_to_gender_and_age_bucket_only() -> None:
+ """Prompt must restrict checks to the two designated attributes and skip everything else."""
+ prompt = _judge_prompt()
+ assert "GENDER OF NAME" in prompt
+ assert "AGE BUCKET" in prompt
+ assert "ALL OTHER LABELS — SKIP" in prompt
+
+
+# ---------------------------------------------------------------------------
+# Tests: helpers
+# ---------------------------------------------------------------------------
+
+
+def test_replacements_for_judge_flattens_dict_form() -> None:
+ raw = {
+ "replacements": [
+ {"original": "Sarah", "label": "first_name", "synthetic": "Michael"},
+ {"original": "Tokyo", "label": "city", "synthetic": "Paris"},
+ ]
+ }
+ assert _replacements_for_judge(raw) == [
+ {"original": "Sarah", "label": "first_name", "synthetic": "Michael"},
+ {"original": "Tokyo", "label": "city", "synthetic": "Paris"},
+ ]
+
+
+def test_replacements_for_judge_returns_empty_for_malformed() -> None:
+ assert _replacements_for_judge(None) == []
+ assert _replacements_for_judge("not json") == []
+ assert _replacements_for_judge(42) == []
+
+
+# ---------------------------------------------------------------------------
+# Tests: _flatten_judgment
+# ---------------------------------------------------------------------------
+
+
+def test_flatten_judgment_all_valid_keeps_invalid_empty() -> None:
+ raw = {
+ "all_valid": True,
+ "entities": [
+ {
+ "original": "Sarah",
+ "label": "first_name",
+ "synthetic": "Maria",
+ "attributes_checked": ["gender"],
+ "passes": True,
+ "reasoning": "Both names imply feminine gender.",
+ }
+ ],
+ }
+ valid, invalid = _flatten_judgment(raw)
+ assert valid is True
+ assert invalid == []
+
+
+def test_flatten_judgment_extracts_failing_entries_only() -> None:
+ raw = {
+ "all_valid": False,
+ "entities": [
+ {
+ "original": "Sarah",
+ "label": "first_name",
+ "synthetic": "Maria",
+ "attributes_checked": ["gender"],
+ "passes": True,
+ "reasoning": "Both feminine.",
+ },
+ {
+ "original": "40",
+ "label": "age",
+ "synthetic": "12",
+ "attributes_checked": ["age_bucket"],
+ "passes": False,
+ "reasoning": "Adult bucket changed to child.",
+ },
+ ],
+ }
+ valid, invalid = _flatten_judgment(raw)
+ assert valid is False
+ assert len(invalid) == 1
+ assert invalid[0]["original"] == "40"
+ assert invalid[0]["passes"] is False
+
+
+def test_flatten_judgment_accepts_pydantic_model() -> None:
+ payload = AttributeFidelityJudgmentSchema(all_valid=True, entities=[])
+ valid, invalid = _flatten_judgment(payload)
+ assert valid is True
+ assert invalid == []
+
+
+def test_flatten_judgment_none_returns_unavailable_sentinel() -> None:
+ assert _flatten_judgment(None) == (None, [])
+
+
+def test_flatten_judgment_malformed_returns_unavailable_sentinel() -> None:
+ assert _flatten_judgment("not json") == (None, [])
+ assert _flatten_judgment(42) == (None, [])
+ assert _flatten_judgment({"missing": True}) == (None, [])
+
+
+# ---------------------------------------------------------------------------
+# Tests: AttributeFidelityJudgeWorkflow.evaluate
+# ---------------------------------------------------------------------------
+
+
+def _map_payload(items: list[dict]) -> dict:
+ return {"replacements": items}
+
+
+def test_evaluate_short_circuits_when_no_replacements(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ df = pd.DataFrame({COL_REPLACEMENT_MAP: [_map_payload([])]})
+
+ class _UnusedAdapter:
+ def run_workflow(self, *args, **kwargs): # pragma: no cover - should not be called
+ raise AssertionError("run_workflow should not be called when there are no replacements")
+
+ wf = AttributeFidelityJudgeWorkflow(adapter=_UnusedAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+ assert result.failed_records == []
+ assert bool(result.dataframe[COL_ATTRIBUTE_FIDELITY_VALID].iloc[0]) is True
+ assert result.dataframe[COL_ATTRIBUTE_FIDELITY_INVALID_ENTITIES].iloc[0] == []
+
+
+def test_evaluate_invokes_adapter_with_correct_alias_and_schema(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ df = pd.DataFrame(
+ {
+ COL_REPLACEMENT_MAP: [
+ _map_payload(
+ [
+ {"original": "Sarah", "label": "first_name", "synthetic": "Michael"},
+ {"original": "40", "label": "age", "synthetic": "12"},
+ ]
+ )
+ ]
+ }
+ )
+
+ captured: dict = {}
+
+ class _StubAdapter:
+ def run_workflow(self, frame, *, model_configs, columns, workflow_name, preview_num_records=None):
+ captured["columns"] = columns
+ captured["workflow_name"] = workflow_name
+ out = frame.copy()
+ out[COL_ATTRIBUTE_FIDELITY_JUDGE] = [
+ {
+ "all_valid": False,
+ "entities": [
+ {
+ "original": "Sarah",
+ "label": "first_name",
+ "synthetic": "Michael",
+ "attributes_checked": ["gender"],
+ "passes": False,
+ "reasoning": "Feminine -> masculine.",
+ },
+ {
+ "original": "40",
+ "label": "age",
+ "synthetic": "12",
+ "attributes_checked": ["age_bucket"],
+ "passes": False,
+ "reasoning": "Adult -> child.",
+ },
+ ],
+ }
+ ]
+
+ class _Result:
+ dataframe = out
+ failed_records: list = []
+
+ return _Result()
+
+ wf = AttributeFidelityJudgeWorkflow(adapter=_StubAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+
+ assert captured["workflow_name"] == "replace-attribute-fidelity-judge"
+ col = captured["columns"][0]
+ assert isinstance(col, LLMStructuredColumnConfig)
+ assert col.name == COL_ATTRIBUTE_FIDELITY_JUDGE
+ assert col.model_alias == stub_evaluate_model_selection.replace_attribute_fidelity_judge
+ assert col.output_format == AttributeFidelityJudgmentSchema.model_json_schema()
+
+ assert bool(result.dataframe[COL_ATTRIBUTE_FIDELITY_VALID].iloc[0]) is False
+ invalid = result.dataframe[COL_ATTRIBUTE_FIDELITY_INVALID_ENTITIES].iloc[0]
+ assert len(invalid) == 2
+ assert {item["original"] for item in invalid} == {"Sarah", "40"}
+
+
+def test_evaluate_marks_unavailable_for_malformed_payload(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ df = pd.DataFrame(
+ {COL_REPLACEMENT_MAP: [_map_payload([{"original": "Sarah", "label": "first_name", "synthetic": "Maria"}])]}
+ )
+
+ class _StubAdapter:
+ def run_workflow(self, frame, *, model_configs, columns, workflow_name, preview_num_records=None):
+ out = frame.copy()
+ out[COL_ATTRIBUTE_FIDELITY_JUDGE] = ["not json"]
+
+ class _Result:
+ dataframe = out
+ failed_records: list = []
+
+ return _Result()
+
+ wf = AttributeFidelityJudgeWorkflow(adapter=_StubAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+
+ assert result.dataframe[COL_ATTRIBUTE_FIDELITY_VALID].iloc[0] is None
+ assert result.dataframe[COL_ATTRIBUTE_FIDELITY_INVALID_ENTITIES].iloc[0] == []
diff --git a/tests/engine/evaluation/replace/test_relational_consistency_judge.py b/tests/engine/evaluation/replace/test_relational_consistency_judge.py
new file mode 100644
index 00000000..d4208181
--- /dev/null
+++ b/tests/engine/evaluation/replace/test_relational_consistency_judge.py
@@ -0,0 +1,348 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pandas as pd
+from data_designer.config.column_configs import LLMStructuredColumnConfig
+
+from anonymizer.config.models import EvaluateModelSelection
+from anonymizer.engine.constants import (
+ COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS,
+ COL_RELATIONAL_CONSISTENCY_JUDGE,
+ COL_RELATIONAL_CONSISTENCY_VALID,
+ COL_REPLACED_TEXT,
+ COL_REPLACEMENT_MAP,
+)
+from anonymizer.engine.evaluation.replace.relational_consistency_judge import (
+ RelationalConsistencyJudgeWorkflow,
+ RelationalConsistencyJudgmentSchema,
+ _flatten_judgment,
+ _judge_prompt,
+ _replacements_for_judge,
+)
+
+# ---------------------------------------------------------------------------
+# Tests: _judge_prompt
+# ---------------------------------------------------------------------------
+
+
+def test_judge_prompt_uses_xml_sections() -> None:
+    prompt = _judge_prompt()
+    for tag in ("scope", "replaced_text", "replacements", "task", "relations_to_inspect", "rules", "edge_cases"):
+        assert f"<{tag}>" in prompt  # every section must be opened ...
+        assert f"</{tag}>" in prompt  # ... and closed; a bare f"{tag}>" check is implied by the opening-tag assert
+
+
+def test_judge_prompt_references_replaced_text_column() -> None:
+ prompt = _judge_prompt()
+ assert COL_REPLACED_TEXT in prompt
+
+
+def test_judge_prompt_iterates_replacement_triples() -> None:
+ prompt = _judge_prompt()
+ assert "for entry in _replacements_for_relational_consistency_judge" in prompt
+ assert "entry.original" in prompt
+ assert "entry.label" in prompt
+ assert "entry.synthetic" in prompt
+
+
+def test_judge_prompt_disambiguates_from_neighbouring_metrics() -> None:
+ """Prompt must call out that type/format and semantic-attribute checks are out of scope."""
+ prompt = _judge_prompt()
+ assert "DIFFERENT metric" in prompt
+
+
+def test_judge_prompt_requires_passing_relations_in_output() -> None:
+ """Denominator depends on the judge listing passes AND fails."""
+ prompt = _judge_prompt()
+ assert "denominator" in prompt.lower()
+
+
+def test_judge_prompt_blocks_generic_date_as_date_of_birth() -> None:
+ """The judge must not treat a generic `date` (career year, etc.) as a `date_of_birth`.
+
+ Regression guard for a real failure observed on the biographies dataset, where a
+ sentence like "returning home in 2012" caused the judge to pair the `date` entity
+ with `age` and compute `current_year - 2012 != age`, producing a false negative.
+ """
+ prompt = _judge_prompt()
+ assert "literally `date_of_birth`" in prompt
+ assert "generic `date`" in prompt
+ assert "SKIP the temporal relation" in prompt
+
+
+def test_judge_prompt_requires_literal_label_matching() -> None:
+ """Relations are matched by the literal label field, not by inferring from the value's surface form."""
+ prompt = _judge_prompt()
+ assert "LITERAL `label` field" in prompt
+ assert "Do NOT infer a label from" in prompt
+
+
+# ---------------------------------------------------------------------------
+# Tests: helpers
+# ---------------------------------------------------------------------------
+
+
+def test_replacements_for_judge_flattens_dict_form() -> None:
+ raw = {
+ "replacements": [
+ {"original": "Austin", "label": "city", "synthetic": "Portland"},
+ {"original": "TX", "label": "state", "synthetic": "OR"},
+ ]
+ }
+ assert _replacements_for_judge(raw) == [
+ {"original": "Austin", "label": "city", "synthetic": "Portland"},
+ {"original": "TX", "label": "state", "synthetic": "OR"},
+ ]
+
+
+def test_replacements_for_judge_returns_empty_for_malformed() -> None:
+ assert _replacements_for_judge(None) == []
+ assert _replacements_for_judge("not json") == []
+ assert _replacements_for_judge(42) == []
+
+
+# ---------------------------------------------------------------------------
+# Tests: _flatten_judgment
+# ---------------------------------------------------------------------------
+
+
+def test_flatten_judgment_all_consistent_keeps_invalid_empty() -> None:
+ raw = {
+ "all_consistent": True,
+ "relations": [
+ {
+ "description": "city <-> state",
+ "entities": [
+ "Austin (city) -> Portland",
+ "TX (state) -> OR",
+ ],
+ "passes": True,
+ "reasoning": "Portland is in Oregon.",
+ }
+ ],
+ }
+ valid, invalid = _flatten_judgment(raw)
+ assert valid is True
+ assert invalid == []
+
+
+def test_flatten_judgment_extracts_failing_relations_only() -> None:
+ raw = {
+ "all_consistent": False,
+ "relations": [
+ {
+ "description": "city <-> state",
+ "entities": [],
+ "passes": True,
+ "reasoning": "ok",
+ },
+ {
+ "description": "date_of_birth <-> age",
+ "entities": [],
+ "passes": False,
+ "reasoning": "DOB 1990 vs age 12 is impossible.",
+ },
+ ],
+ }
+ valid, invalid = _flatten_judgment(raw)
+ assert valid is False
+ assert len(invalid) == 1
+ assert invalid[0]["description"] == "date_of_birth <-> age"
+ assert invalid[0]["passes"] is False
+
+
+def test_flatten_judgment_accepts_pydantic_model() -> None:
+ payload = RelationalConsistencyJudgmentSchema(all_consistent=True, relations=[])
+ valid, invalid = _flatten_judgment(payload)
+ assert valid is True
+ assert invalid == []
+
+
+def test_flatten_judgment_none_returns_unavailable_sentinel() -> None:
+ assert _flatten_judgment(None) == (None, [])
+
+
+def test_flatten_judgment_malformed_returns_unavailable_sentinel() -> None:
+ assert _flatten_judgment("not json") == (None, [])
+ assert _flatten_judgment(42) == (None, [])
+ assert _flatten_judgment({"missing_top_level": True}) == (None, [])
+
+
+# ---------------------------------------------------------------------------
+# Tests: RelationalConsistencyJudgeWorkflow.evaluate
+# ---------------------------------------------------------------------------
+
+
+def _map_payload(items: list[dict]) -> dict:
+ return {"replacements": items}
+
+
+def test_evaluate_short_circuits_when_fewer_than_two_replacements(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ df = pd.DataFrame(
+ {
+ COL_REPLACED_TEXT: ["Alice"],
+ COL_REPLACEMENT_MAP: [_map_payload([{"original": "Alice", "label": "first_name", "synthetic": "Maya"}])],
+ }
+ )
+
+ class _UnusedAdapter:
+ def run_workflow(self, *args, **kwargs): # pragma: no cover - should not be called
+ raise AssertionError("run_workflow should not be called when there are <2 replacements")
+
+ wf = RelationalConsistencyJudgeWorkflow(adapter=_UnusedAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+ assert result.failed_records == []
+ assert bool(result.dataframe[COL_RELATIONAL_CONSISTENCY_VALID].iloc[0]) is True
+ assert result.dataframe[COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS].iloc[0] == []
+
+
+def test_evaluate_invokes_adapter_with_correct_alias_and_schema(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ df = pd.DataFrame(
+ {
+ COL_REPLACED_TEXT: ["Maya works in Portland, OR"],
+ COL_REPLACEMENT_MAP: [
+ _map_payload(
+ [
+ {"original": "Alice", "label": "first_name", "synthetic": "Maya"},
+ {"original": "Austin", "label": "city", "synthetic": "Portland"},
+ {"original": "TX", "label": "state", "synthetic": "OR"},
+ ]
+ )
+ ],
+ }
+ )
+
+ captured: dict = {}
+
+ class _StubAdapter:
+ def run_workflow(self, frame, *, model_configs, columns, workflow_name, preview_num_records=None):
+ captured["columns"] = columns
+ captured["workflow_name"] = workflow_name
+ out = frame.copy()
+ out[COL_RELATIONAL_CONSISTENCY_JUDGE] = [
+ {
+ "all_consistent": True,
+ "relations": [
+ {
+ "description": "city <-> state",
+ "entities": [
+ "Austin (city) -> Portland",
+ "TX (state) -> OR",
+ ],
+ "passes": True,
+ "reasoning": "Portland is in Oregon.",
+ }
+ ],
+ }
+ ]
+
+ class _Result:
+ dataframe = out
+ failed_records: list = []
+
+ return _Result()
+
+ wf = RelationalConsistencyJudgeWorkflow(adapter=_StubAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+
+ assert captured["workflow_name"] == "replace-relational-consistency-judge"
+ col = captured["columns"][0]
+ assert isinstance(col, LLMStructuredColumnConfig)
+ assert col.name == COL_RELATIONAL_CONSISTENCY_JUDGE
+ assert col.model_alias == stub_evaluate_model_selection.replace_relational_consistency_judge
+ assert col.output_format == RelationalConsistencyJudgmentSchema.model_json_schema()
+
+ assert bool(result.dataframe[COL_RELATIONAL_CONSISTENCY_VALID].iloc[0]) is True
+ assert result.dataframe[COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS].iloc[0] == []
+
+
+def test_evaluate_marks_unavailable_for_malformed_payload(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ df = pd.DataFrame(
+ {
+ COL_REPLACED_TEXT: ["Maya works in Portland, OR"],
+ COL_REPLACEMENT_MAP: [
+ _map_payload(
+ [
+ {"original": "Austin", "label": "city", "synthetic": "Portland"},
+ {"original": "TX", "label": "state", "synthetic": "OR"},
+ ]
+ )
+ ],
+ }
+ )
+
+ class _StubAdapter:
+ def run_workflow(self, frame, *, model_configs, columns, workflow_name, preview_num_records=None):
+ out = frame.copy()
+ out[COL_RELATIONAL_CONSISTENCY_JUDGE] = ["not json"]
+
+ class _Result:
+ dataframe = out
+ failed_records: list = []
+
+ return _Result()
+
+ wf = RelationalConsistencyJudgeWorkflow(adapter=_StubAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+
+ assert result.dataframe[COL_RELATIONAL_CONSISTENCY_VALID].iloc[0] is None
+ assert result.dataframe[COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS].iloc[0] == []
+
+
+def test_evaluate_propagates_failing_relations(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ df = pd.DataFrame(
+ {
+ COL_REPLACED_TEXT: ["..."],
+ COL_REPLACEMENT_MAP: [
+ _map_payload(
+ [
+ {"original": "1990", "label": "date_of_birth", "synthetic": "2015"},
+ {"original": "35", "label": "age", "synthetic": "35"},
+ ]
+ )
+ ],
+ }
+ )
+
+ class _StubAdapter:
+ def run_workflow(self, frame, *, model_configs, columns, workflow_name, preview_num_records=None):
+ out = frame.copy()
+ out[COL_RELATIONAL_CONSISTENCY_JUDGE] = [
+ {
+ "all_consistent": False,
+ "relations": [
+ {
+ "description": "date_of_birth <-> age",
+ "entities": [
+ "1990 (date_of_birth) -> 2015",
+ "35 (age) -> 35",
+ ],
+ "passes": False,
+ "reasoning": "A 2015 birthdate does not yield age 35.",
+ }
+ ],
+ }
+ ]
+
+ class _Result:
+ dataframe = out
+ failed_records: list = []
+
+ return _Result()
+
+ wf = RelationalConsistencyJudgeWorkflow(adapter=_StubAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+ assert bool(result.dataframe[COL_RELATIONAL_CONSISTENCY_VALID].iloc[0]) is False
+ invalid = result.dataframe[COL_RELATIONAL_CONSISTENCY_INVALID_RELATIONS].iloc[0]
+ assert len(invalid) == 1
+ assert invalid[0]["description"] == "date_of_birth <-> age"
diff --git a/tests/engine/evaluation/replace/test_type_fidelity_judge.py b/tests/engine/evaluation/replace/test_type_fidelity_judge.py
new file mode 100644
index 00000000..45523ae0
--- /dev/null
+++ b/tests/engine/evaluation/replace/test_type_fidelity_judge.py
@@ -0,0 +1,296 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pandas as pd
+from data_designer.config.column_configs import LLMStructuredColumnConfig
+
+from anonymizer.config.models import EvaluateModelSelection
+from anonymizer.engine.constants import (
+ COL_REPLACEMENT_MAP,
+ COL_TYPE_FIDELITY_INVALID_REPLACEMENTS,
+ COL_TYPE_FIDELITY_JUDGE,
+ COL_TYPE_FIDELITY_VALID,
+)
+from anonymizer.engine.evaluation.replace.type_fidelity_judge import (
+ TypeFidelityJudgeWorkflow,
+ TypeFidelityJudgmentSchema,
+ _flatten_judgment,
+ _judge_prompt,
+ _label_examples_for_judge,
+ _replacements_for_judge,
+)
+
+# ---------------------------------------------------------------------------
+# Tests: _judge_prompt
+# ---------------------------------------------------------------------------
+
+
+def test_judge_prompt_uses_xml_sections() -> None:
+    prompt = _judge_prompt()
+    for tag in (
+        "scope",
+        "replacements",
+        "reference_label_examples",
+        "task",
+        "class_membership_rules",
+        "format_type_rules",
+        "edge_cases",
+        "output_rules",
+    ):
+        assert f"<{tag}>" in prompt  # every section must be opened ...
+        assert f"</{tag}>" in prompt  # ... and closed; a bare f"{tag}>" check is implied by the opening-tag assert
+
+
+def test_judge_prompt_iterates_replacement_triples() -> None:
+ prompt = _judge_prompt()
+ assert "for entry in _replacements_for_type_fidelity_judge" in prompt
+ assert "entry.original" in prompt
+ assert "entry.label" in prompt
+ assert "entry.synthetic" in prompt
+
+
+def test_judge_prompt_disambiguates_from_neighbouring_metrics() -> None:
+ """Prompt must call out that semantic attributes and cross-entity consistency are
+ OUT of scope, otherwise the judge will silently penalize valid replacements."""
+ prompt = _judge_prompt()
+ assert "DIFFERENT metric" in prompt
+ assert "gender of a name" in prompt
+ assert "city/state" in prompt
+
+
+# ---------------------------------------------------------------------------
+# Tests: helpers
+# ---------------------------------------------------------------------------
+
+
+def test_replacements_for_judge_flattens_dict_form() -> None:
+ raw = {
+ "replacements": [
+ {"original": "Alice", "label": "first_name", "synthetic": "Maya"},
+ {"original": "Acme", "label": "company_name", "synthetic": "NovaCorp"},
+ ]
+ }
+ assert _replacements_for_judge(raw) == [
+ {"original": "Alice", "label": "first_name", "synthetic": "Maya"},
+ {"original": "Acme", "label": "company_name", "synthetic": "NovaCorp"},
+ ]
+
+
+def test_replacements_for_judge_accepts_json_string() -> None:
+ payload = '{"replacements":[{"original":"Alice","label":"first_name","synthetic":"Maya"}]}'
+ assert _replacements_for_judge(payload) == [{"original": "Alice", "label": "first_name", "synthetic": "Maya"}]
+
+
+def test_replacements_for_judge_returns_empty_for_malformed() -> None:
+ assert _replacements_for_judge(None) == []
+ assert _replacements_for_judge("not json") == []
+ assert _replacements_for_judge(42) == []
+ assert _replacements_for_judge({"replacements": "oops"}) == []
+
+
+def test_label_examples_for_judge_only_includes_labels_in_replacements() -> None:
+ examples_json = _label_examples_for_judge([{"original": "Alice", "label": "first_name", "synthetic": "Maya"}])
+ assert "first_name" in examples_json
+ assert "ssn" not in examples_json
+
+
+def test_label_examples_for_judge_empty_when_no_replacements() -> None:
+ assert _label_examples_for_judge([]) == "{}"
+
+
+# ---------------------------------------------------------------------------
+# Tests: _flatten_judgment
+# ---------------------------------------------------------------------------
+
+
+def test_flatten_judgment_all_valid_path() -> None:
+ valid, invalid = _flatten_judgment({"all_valid": True, "invalid_replacements": []})
+ assert valid is True
+ assert invalid == []
+
+
+def test_flatten_judgment_returns_invalid_entries() -> None:
+ raw = {
+ "all_valid": False,
+ "invalid_replacements": [
+ {
+ "original": "Alice",
+ "label": "first_name",
+ "synthetic": "[REDACTED]",
+ "reasoning": "class membership: placeholder, not a person name",
+ },
+ ],
+ }
+ valid, invalid = _flatten_judgment(raw)
+ assert valid is False
+ assert invalid == [
+ {
+ "original": "Alice",
+ "label": "first_name",
+ "synthetic": "[REDACTED]",
+ "reasoning": "class membership: placeholder, not a person name",
+ }
+ ]
+
+
+def test_flatten_judgment_accepts_pydantic_model() -> None:
+ payload = TypeFidelityJudgmentSchema(all_valid=True, invalid_replacements=[])
+ valid, invalid = _flatten_judgment(payload)
+ assert valid is True
+ assert invalid == []
+
+
+def test_flatten_judgment_none_returns_unavailable_sentinel() -> None:
+ assert _flatten_judgment(None) == (None, [])
+
+
+def test_flatten_judgment_malformed_returns_unavailable_sentinel() -> None:
+ assert _flatten_judgment("not json") == (None, [])
+ assert _flatten_judgment(42) == (None, [])
+ assert _flatten_judgment({"missing": "all_valid"}) == (None, [])
+
+
+# ---------------------------------------------------------------------------
+# Tests: TypeFidelityJudgeWorkflow.evaluate
+# ---------------------------------------------------------------------------
+
+
+def _map_payload(items: list[dict]) -> dict:
+ return {"replacements": items}
+
+
+def test_evaluate_short_circuits_when_no_replacements(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ df = pd.DataFrame({COL_REPLACEMENT_MAP: [_map_payload([])]})
+
+ class _UnusedAdapter:
+ def run_workflow(self, *args, **kwargs): # pragma: no cover - should not be called
+ raise AssertionError("run_workflow should not be called when there are no replacements")
+
+ wf = TypeFidelityJudgeWorkflow(adapter=_UnusedAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+ assert result.failed_records == []
+ assert bool(result.dataframe[COL_TYPE_FIDELITY_VALID].iloc[0]) is True
+ assert result.dataframe[COL_TYPE_FIDELITY_INVALID_REPLACEMENTS].iloc[0] == []
+
+
+def test_evaluate_invokes_adapter_with_correct_alias_and_schema(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ df = pd.DataFrame(
+ {
+ COL_REPLACEMENT_MAP: [
+ _map_payload(
+ [
+ {"original": "Alice", "label": "first_name", "synthetic": "Maya"},
+ {"original": "alice@x.com", "label": "email", "synthetic": "not-an-email"},
+ ]
+ )
+ ]
+ }
+ )
+
+ captured: dict = {}
+
+ class _StubAdapter:
+ def run_workflow(self, frame, *, model_configs, columns, workflow_name, preview_num_records=None):
+ captured["columns"] = columns
+ captured["workflow_name"] = workflow_name
+ out = frame.copy()
+ out[COL_TYPE_FIDELITY_JUDGE] = [
+ {
+ "all_valid": False,
+ "invalid_replacements": [
+ {
+ "original": "alice@x.com",
+ "label": "email",
+ "synthetic": "not-an-email",
+ "reasoning": "format: missing '@' and domain",
+ }
+ ],
+ }
+ ]
+
+ class _Result:
+ dataframe = out
+ failed_records: list = []
+
+ return _Result()
+
+ wf = TypeFidelityJudgeWorkflow(adapter=_StubAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+
+ assert captured["workflow_name"] == "replace-type-fidelity-judge"
+ col = captured["columns"][0]
+ assert isinstance(col, LLMStructuredColumnConfig)
+ assert col.name == COL_TYPE_FIDELITY_JUDGE
+ assert col.model_alias == stub_evaluate_model_selection.replace_type_fidelity_judge
+ assert col.output_format == TypeFidelityJudgmentSchema.model_json_schema()
+
+ assert bool(result.dataframe[COL_TYPE_FIDELITY_VALID].iloc[0]) is False
+ invalid = result.dataframe[COL_TYPE_FIDELITY_INVALID_REPLACEMENTS].iloc[0]
+ assert invalid == [
+ {
+ "original": "alice@x.com",
+ "label": "email",
+ "synthetic": "not-an-email",
+ "reasoning": "format: missing '@' and domain",
+ }
+ ]
+
+
+def test_evaluate_preserves_row_order_when_mixing_empty_and_populated_maps(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ df = pd.DataFrame(
+ {
+ COL_REPLACEMENT_MAP: [
+ _map_payload([]),
+ _map_payload([{"original": "Alice", "label": "first_name", "synthetic": "Maya"}]),
+ ]
+ }
+ )
+
+ class _StubAdapter:
+ def run_workflow(self, frame, *, model_configs, columns, workflow_name, preview_num_records=None):
+ out = frame.copy()
+ out[COL_TYPE_FIDELITY_JUDGE] = [{"all_valid": True, "invalid_replacements": []}]
+
+ class _Result:
+ dataframe = out
+ failed_records: list = []
+
+ return _Result()
+
+ wf = TypeFidelityJudgeWorkflow(adapter=_StubAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+
+ assert [bool(v) for v in result.dataframe[COL_TYPE_FIDELITY_VALID]] == [True, True]
+
+
+def test_evaluate_marks_unavailable_for_malformed_payload(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ df = pd.DataFrame(
+ {COL_REPLACEMENT_MAP: [_map_payload([{"original": "Alice", "label": "first_name", "synthetic": "Maya"}])]}
+ )
+
+ class _StubAdapter:
+ def run_workflow(self, frame, *, model_configs, columns, workflow_name, preview_num_records=None):
+ out = frame.copy()
+ out[COL_TYPE_FIDELITY_JUDGE] = ["not json"]
+
+ class _Result:
+ dataframe = out
+ failed_records: list = []
+
+ return _Result()
+
+ wf = TypeFidelityJudgeWorkflow(adapter=_StubAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+
+ assert result.dataframe[COL_TYPE_FIDELITY_VALID].iloc[0] is None
+ assert result.dataframe[COL_TYPE_FIDELITY_INVALID_REPLACEMENTS].iloc[0] == []
diff --git a/tests/engine/evaluation/test_detection_judge.py b/tests/engine/evaluation/test_detection_judge.py
new file mode 100644
index 00000000..1d92eca2
--- /dev/null
+++ b/tests/engine/evaluation/test_detection_judge.py
@@ -0,0 +1,274 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pandas as pd
+from data_designer.config.column_configs import LLMStructuredColumnConfig
+
+from anonymizer.config.models import EvaluateModelSelection
+from anonymizer.engine.constants import (
+ COL_DETECTION_INVALID_ENTITIES,
+ COL_DETECTION_JUDGE,
+ COL_DETECTION_VALID,
+ COL_ENTITIES_BY_VALUE,
+ COL_TEXT,
+)
+from anonymizer.engine.evaluation.detection_judge import (
+ DetectionJudgeWorkflow,
+ DetectionJudgmentSchema,
+ _entities_for_judge,
+ _flatten_judgment,
+ _judge_prompt,
+ _label_examples_for_judge,
+)
+from anonymizer.engine.schemas import EntitiesByValueSchema
+
+# ---------------------------------------------------------------------------
+# Tests: _judge_prompt
+# ---------------------------------------------------------------------------
+
+
+def test_judge_prompt_uses_xml_sections() -> None:
+ prompt = _judge_prompt()
+ for tag in ("original_text", "detected_entities", "task", "invalid_criteria", "valid_criteria"):
+ assert f"<{tag}>" in prompt
+ assert f"</{tag}>" in prompt
+
+
+def test_judge_prompt_references_original_text_column() -> None:
+ prompt = _judge_prompt()
+ assert COL_TEXT in prompt
+
+
+def test_judge_prompt_iterates_detected_entities() -> None:
+ prompt = _judge_prompt()
+ assert "for entity in _entities_for_detection_judge" in prompt
+ assert "entity.value" in prompt
+ assert "entity.label" in prompt
+
+
+# ---------------------------------------------------------------------------
+# Tests: helpers
+# ---------------------------------------------------------------------------
+
+
+def test_entities_for_judge_flattens_labels() -> None:
+ parsed = EntitiesByValueSchema.from_raw(
+ {
+ "entities_by_value": [
+ {"value": "Alice", "labels": ["first_name"]},
+ {"value": "Acme", "labels": ["company_name", "organization_name"]},
+ ]
+ }
+ )
+ rows = _entities_for_judge(parsed)
+ assert rows == [
+ {"value": "Alice", "label": "first_name"},
+ {"value": "Acme", "label": "company_name"},
+ {"value": "Acme", "label": "organization_name"},
+ ]
+
+
+def test_label_examples_for_judge_returns_json_keyed_by_label() -> None:
+ parsed = EntitiesByValueSchema.from_raw({"entities_by_value": [{"value": "Alice", "labels": ["first_name"]}]})
+ examples_json = _label_examples_for_judge(parsed)
+ assert "first_name" in examples_json
+ assert examples_json.startswith("{")
+
+
+def test_label_examples_for_judge_empty_when_no_entities() -> None:
+ parsed = EntitiesByValueSchema()
+ assert _label_examples_for_judge(parsed) == "{}"
+
+
+# ---------------------------------------------------------------------------
+# Tests: _flatten_judgment
+# ---------------------------------------------------------------------------
+
+
+def test_flatten_judgment_all_valid_path() -> None:
+ valid, invalid = _flatten_judgment({"all_valid": True, "invalid_entities": []})
+ assert valid is True
+ assert invalid == []
+
+
+def test_flatten_judgment_returns_invalid_entries() -> None:
+ raw = {
+ "all_valid": False,
+ "invalid_entities": [
+ {"value": "morning", "label": "date_time", "reasoning": "common word"},
+ ],
+ }
+ valid, invalid = _flatten_judgment(raw)
+ assert valid is False
+ assert invalid == [{"value": "morning", "label": "date_time", "reasoning": "common word"}]
+
+
+def test_flatten_judgment_accepts_pydantic_model() -> None:
+ payload = DetectionJudgmentSchema(all_valid=True, invalid_entities=[])
+ valid, invalid = _flatten_judgment(payload)
+ assert valid is True
+ assert invalid == []
+
+
+def test_flatten_judgment_accepts_json_string() -> None:
+ valid, invalid = _flatten_judgment('{"all_valid": true, "invalid_entities": []}')
+ assert valid is True
+ assert invalid == []
+
+
+def test_flatten_judgment_none_returns_unavailable_sentinel() -> None:
+ assert _flatten_judgment(None) == (None, [])
+
+
+def test_flatten_judgment_malformed_returns_unavailable_sentinel() -> None:
+ assert _flatten_judgment("not json") == (None, [])
+ assert _flatten_judgment(42) == (None, [])
+ assert _flatten_judgment({"missing": "all_valid"}) == (None, [])
+
+
+# ---------------------------------------------------------------------------
+# Tests: DetectionJudgeWorkflow.evaluate
+# ---------------------------------------------------------------------------
+
+
+def _entities_payload(entities: list[dict]) -> dict:
+ return {"entities_by_value": entities}
+
+
+def test_evaluate_short_circuits_when_no_entities(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ """Rows with no detected entities skip the LLM call and pass trivially."""
+ df = pd.DataFrame(
+ {
+ COL_TEXT: ["plain text"],
+ COL_ENTITIES_BY_VALUE: [_entities_payload([])],
+ }
+ )
+
+ class _UnusedAdapter:
+ def run_workflow(self, *args, **kwargs): # pragma: no cover - should not be called
+ raise AssertionError("run_workflow should not be called for empty-entity rows")
+
+ wf = DetectionJudgeWorkflow(adapter=_UnusedAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+ assert result.failed_records == []
+ assert bool(result.dataframe[COL_DETECTION_VALID].iloc[0]) is True
+ assert result.dataframe[COL_DETECTION_INVALID_ENTITIES].iloc[0] == []
+
+
+def test_evaluate_invokes_adapter_for_rows_with_entities(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ """Rows with entities get a structured-column workflow keyed on detection_judge."""
+ df = pd.DataFrame(
+ {
+ COL_TEXT: ["Alice works at Acme"],
+ COL_ENTITIES_BY_VALUE: [
+ _entities_payload(
+ [
+ {"value": "Alice", "labels": ["first_name"]},
+ {"value": "Acme", "labels": ["company_name"]},
+ ]
+ )
+ ],
+ }
+ )
+
+ captured: dict = {}
+
+ class _StubAdapter:
+ def run_workflow(self, frame, *, model_configs, columns, workflow_name, preview_num_records=None):
+ captured["columns"] = columns
+ captured["workflow_name"] = workflow_name
+ out = frame.copy()
+ out[COL_DETECTION_JUDGE] = [
+ {
+ "all_valid": False,
+ "invalid_entities": [{"value": "Acme", "label": "company_name", "reasoning": "spurious"}],
+ }
+ ]
+
+ class _Result:
+ dataframe = out
+ failed_records: list = []
+
+ return _Result()
+
+ wf = DetectionJudgeWorkflow(adapter=_StubAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+
+ assert captured["workflow_name"] == "replace-detection-judge"
+ assert len(captured["columns"]) == 1
+ col = captured["columns"][0]
+ assert isinstance(col, LLMStructuredColumnConfig)
+ assert col.name == COL_DETECTION_JUDGE
+ assert col.model_alias == stub_evaluate_model_selection.detection_validity_judge
+ assert col.output_format == DetectionJudgmentSchema.model_json_schema()
+
+ assert bool(result.dataframe[COL_DETECTION_VALID].iloc[0]) is False
+ invalid = result.dataframe[COL_DETECTION_INVALID_ENTITIES].iloc[0]
+ assert invalid == [{"value": "Acme", "label": "company_name", "reasoning": "spurious"}]
+
+
+def test_evaluate_merges_entity_and_empty_rows_in_order(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ """Rows are returned in their original order, even when one bypasses the LLM."""
+ df = pd.DataFrame(
+ {
+ COL_TEXT: ["no entities here", "Alice was here"],
+ COL_ENTITIES_BY_VALUE: [
+ _entities_payload([]),
+ _entities_payload([{"value": "Alice", "labels": ["first_name"]}]),
+ ],
+ }
+ )
+
+ class _StubAdapter:
+ def run_workflow(self, frame, *, model_configs, columns, workflow_name, preview_num_records=None):
+ out = frame.copy()
+ out[COL_DETECTION_JUDGE] = [{"all_valid": True, "invalid_entities": []}]
+
+ class _Result:
+ dataframe = out
+ failed_records: list = []
+
+ return _Result()
+
+ wf = DetectionJudgeWorkflow(adapter=_StubAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+
+ assert list(result.dataframe[COL_TEXT]) == ["no entities here", "Alice was here"]
+ assert [bool(v) for v in result.dataframe[COL_DETECTION_VALID]] == [True, True]
+
+
+def test_evaluate_marks_judge_unavailable_for_malformed_payload(
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ """Malformed judge output leaves detection_valid=None rather than fabricating a verdict."""
+ df = pd.DataFrame(
+ {
+ COL_TEXT: ["Alice"],
+ COL_ENTITIES_BY_VALUE: [_entities_payload([{"value": "Alice", "labels": ["first_name"]}])],
+ }
+ )
+
+ class _StubAdapter:
+ def run_workflow(self, frame, *, model_configs, columns, workflow_name, preview_num_records=None):
+ out = frame.copy()
+ out[COL_DETECTION_JUDGE] = ["not json"]
+
+ class _Result:
+ dataframe = out
+ failed_records: list = []
+
+ return _Result()
+
+ wf = DetectionJudgeWorkflow(adapter=_StubAdapter())
+ result = wf.evaluate(df, model_configs=[], selected_models=stub_evaluate_model_selection)
+
+ assert result.dataframe[COL_DETECTION_VALID].iloc[0] is None
+ assert result.dataframe[COL_DETECTION_INVALID_ENTITIES].iloc[0] == []
diff --git a/tests/engine/test_model_loader.py b/tests/engine/test_model_loader.py
index 14e94710..3b754dd4 100644
--- a/tests/engine/test_model_loader.py
+++ b/tests/engine/test_model_loader.py
@@ -312,7 +312,15 @@ def test_validate_model_alias_references_raises_on_unknown_replace_alias_when_en
stub_slim_model_selection: ModelSelection,
) -> None:
selected_models = stub_slim_model_selection.model_copy(
- update={"replace": ReplaceModelSelection(replacement_generator="bad-replace-alias")}
+ update={
+ "replace": ReplaceModelSelection(
+ replacement_generator="bad-replace-alias",
+ detection_judge="known",
+ type_fidelity_judge="known",
+ relational_consistency_judge="known",
+ attribute_fidelity_judge="known",
+ )
+ }
)
with pytest.raises(ValueError, match="bad-replace-alias"):
@@ -328,7 +336,15 @@ def test_validate_model_alias_references_skips_replace_alias_when_not_enabled(
stub_slim_model_selection: ModelSelection,
) -> None:
selected_models = stub_slim_model_selection.model_copy(
- update={"replace": ReplaceModelSelection(replacement_generator="bad-replace-alias")}
+ update={
+ "replace": ReplaceModelSelection(
+ replacement_generator="bad-replace-alias",
+ detection_judge="known",
+ type_fidelity_judge="known",
+ relational_consistency_judge="known",
+ attribute_fidelity_judge="known",
+ )
+ }
)
validate_model_alias_references(
diff --git a/tests/engine/test_replace_runner.py b/tests/engine/test_replace_runner.py
index 17339068..adc4d970 100644
--- a/tests/engine/test_replace_runner.py
+++ b/tests/engine/test_replace_runner.py
@@ -10,10 +10,28 @@
import pytest
from data_designer.config.models import ModelConfig
-from anonymizer.config.models import ReplaceModelSelection
+from anonymizer.config.models import EvaluateModelSelection, ReplaceModelSelection
from anonymizer.config.replace_strategies import Hash, Redact, Substitute
-from anonymizer.engine.constants import COL_FINAL_ENTITIES, COL_REPLACED_TEXT, COL_REPLACEMENT_MAP, COL_TEXT
-from anonymizer.engine.ndd.adapter import FailedRecord
+from anonymizer.engine.constants import (
+ COL_ATTRIBUTE_FIDELITY_JUDGE,
+ COL_ATTRIBUTE_FIDELITY_VALID,
+ COL_DETECTION_JUDGE,
+ COL_DETECTION_VALID,
+ COL_ENTITIES_BY_VALUE,
+ COL_FINAL_ENTITIES,
+ COL_RELATIONAL_CONSISTENCY_JUDGE,
+ COL_RELATIONAL_CONSISTENCY_VALID,
+ COL_REPLACED_TEXT,
+ COL_REPLACEMENT_MAP,
+ COL_TEXT,
+ COL_TYPE_FIDELITY_JUDGE,
+ COL_TYPE_FIDELITY_VALID,
+)
+from anonymizer.engine.evaluation.detection_judge import DetectionJudgeWorkflow
+from anonymizer.engine.evaluation.replace.attribute_fidelity_judge import AttributeFidelityJudgeWorkflow
+from anonymizer.engine.evaluation.replace.relational_consistency_judge import RelationalConsistencyJudgeWorkflow
+from anonymizer.engine.evaluation.replace.type_fidelity_judge import TypeFidelityJudgeWorkflow
+from anonymizer.engine.ndd.adapter import RECORD_ID_COLUMN, FailedRecord, WorkflowRunResult
from anonymizer.engine.replace.llm_replace_workflow import LlmReplaceResult
from anonymizer.engine.replace.replace_runner import ReplacementWorkflow
from anonymizer.engine.replace.strategies import apply_replacement_map
@@ -116,6 +134,268 @@ def test_substitute_without_workflow_raises(
)
+def test_evaluate_uses_merged_dd_workflow_for_judges(
+ stub_model_configs: list[ModelConfig],
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ """``evaluate()`` runs all 4 judges as columns of a SINGLE DD workflow call
+ (DataDesigner parallelizes the columns internally — no Python threads)."""
+
+ # Trace-shaped input: simulates a dataframe returned by a prior ``run()``.
+ saved_trace = pd.DataFrame(
+ {
+ COL_TEXT: ["Alice works at Acme"],
+ COL_FINAL_ENTITIES: [{"entities": []}],
+ COL_REPLACED_TEXT: ["Maya works at NovaCorp"],
+ COL_REPLACEMENT_MAP: [
+ {
+ "replacements": [
+ {"original": "Alice", "label": "first_name", "synthetic": "Maya"},
+ {"original": "Acme", "label": "organization", "synthetic": "NovaCorp"},
+ ]
+ }
+ ],
+ COL_ENTITIES_BY_VALUE: [
+ {
+ "entities_by_value": [
+ {"value": "Alice", "labels": ["first_name"]},
+ {"value": "Acme", "labels": ["organization"]},
+ ]
+ }
+ ],
+ }
+ )
+
+ judge_defaults = {
+ COL_DETECTION_JUDGE: {"all_valid": True, "invalid_entities": []},
+ COL_TYPE_FIDELITY_JUDGE: {"all_valid": True, "invalid_replacements": []},
+ COL_RELATIONAL_CONSISTENCY_JUDGE: {"all_consistent": True, "relations": []},
+ COL_ATTRIBUTE_FIDELITY_JUDGE: {"all_valid": True, "entities": []},
+ }
+
+ def fake_run_workflow(df: pd.DataFrame, *, columns, **_: object) -> WorkflowRunResult:
+ out = df.copy()
+ for column in columns:
+ out[column.name] = [judge_defaults[column.name]] * len(out)
+ return WorkflowRunResult(dataframe=out, failed_records=[])
+
+ def fake_attach_ids(df: pd.DataFrame) -> pd.DataFrame:
+ if RECORD_ID_COLUMN in df.columns:
+ return df.copy()
+ out = df.copy()
+ out[RECORD_ID_COLUMN] = [f"id-{i}" for i in range(len(out))]
+ return out
+
+ adapter = Mock()
+ adapter.run_workflow.side_effect = fake_run_workflow
+ adapter._attach_record_ids.side_effect = fake_attach_ids
+
+ runner = ReplacementWorkflow(
+ detection_judge=DetectionJudgeWorkflow(adapter=adapter),
+ type_fidelity_judge=TypeFidelityJudgeWorkflow(adapter=adapter),
+ relational_consistency_judge=RelationalConsistencyJudgeWorkflow(adapter=adapter),
+ attribute_fidelity_judge=AttributeFidelityJudgeWorkflow(adapter=adapter),
+ adapter=adapter,
+ )
+
+ result = runner.evaluate(
+ saved_trace,
+ replace_method=Substitute(),
+ model_configs=stub_model_configs,
+ selected_models=stub_evaluate_model_selection,
+ )
+
+ # Exactly ONE adapter call for the judges step (proves merge, not 4 separate workflows).
+ assert adapter.run_workflow.call_count == 1
+ call_columns = adapter.run_workflow.call_args.kwargs["columns"]
+ assert {c.name for c in call_columns} == set(judge_defaults)
+
+ # And each judge's VALID column ended up on the result, with True (default payload above).
+ for col in (
+ COL_DETECTION_VALID,
+ COL_TYPE_FIDELITY_VALID,
+ COL_RELATIONAL_CONSISTENCY_VALID,
+ COL_ATTRIBUTE_FIDELITY_VALID,
+ ):
+ assert col in result.dataframe.columns, f"missing column: {col}"
+ assert bool(result.dataframe[col].iloc[0]) is True
+
+
+def test_evaluate_preserves_all_rows_when_llm_drops_some(
+ stub_model_configs: list[ModelConfig],
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ """Evaluation is non-critical: rows the LLM drops (parse error, timeout,
+ etc.) must still appear in the result with *_valid=None ("Unavailable"),
+ not vanish from a previously successful preview/run.
+ """
+ saved_trace = pd.DataFrame(
+ {
+ COL_TEXT: ["Alice works at Acme", "Bob works at Globex"],
+ COL_FINAL_ENTITIES: [{"entities": []}, {"entities": []}],
+ COL_REPLACED_TEXT: ["Maya works at NovaCorp", "Carl works at Initech"],
+ COL_REPLACEMENT_MAP: [
+ {
+ "replacements": [
+ {"original": "Alice", "label": "first_name", "synthetic": "Maya"},
+ {"original": "Acme", "label": "organization", "synthetic": "NovaCorp"},
+ ]
+ },
+ {
+ "replacements": [
+ {"original": "Bob", "label": "first_name", "synthetic": "Carl"},
+ {"original": "Globex", "label": "organization", "synthetic": "Initech"},
+ ]
+ },
+ ],
+ COL_ENTITIES_BY_VALUE: [
+ {"entities_by_value": [{"value": "Alice", "labels": ["first_name"]}]},
+ {"entities_by_value": [{"value": "Bob", "labels": ["first_name"]}]},
+ ],
+ }
+ )
+
+ judge_payload = {
+ COL_DETECTION_JUDGE: {"all_valid": True, "invalid_entities": []},
+ COL_TYPE_FIDELITY_JUDGE: {"all_valid": True, "invalid_replacements": []},
+ COL_RELATIONAL_CONSISTENCY_JUDGE: {"all_consistent": True, "relations": []},
+ COL_ATTRIBUTE_FIDELITY_JUDGE: {"all_valid": True, "entities": []},
+ }
+
+ def fake_attach_ids(df: pd.DataFrame) -> pd.DataFrame:
+ if RECORD_ID_COLUMN in df.columns:
+ return df.copy()
+ out = df.copy()
+ out[RECORD_ID_COLUMN] = [f"id-{i}" for i in range(len(out))]
+ return out
+
+ def fake_run_workflow(df: pd.DataFrame, *, columns, **_: object) -> WorkflowRunResult:
+ # Simulate the LLM successfully judging only the first row;
+ # the second row got dropped during the workflow.
+ kept = df.iloc[:1].copy()
+ for column in columns:
+ kept[column.name] = [judge_payload[column.name]] * len(kept)
+ dropped = FailedRecord(record_id="id-1", step="replace-judges", reason="parse error")
+ return WorkflowRunResult(dataframe=kept, failed_records=[dropped])
+
+ adapter = Mock()
+ adapter._attach_record_ids.side_effect = fake_attach_ids
+ adapter.run_workflow.side_effect = fake_run_workflow
+
+ runner = ReplacementWorkflow(
+ detection_judge=DetectionJudgeWorkflow(adapter=adapter),
+ type_fidelity_judge=TypeFidelityJudgeWorkflow(adapter=adapter),
+ relational_consistency_judge=RelationalConsistencyJudgeWorkflow(adapter=adapter),
+ attribute_fidelity_judge=AttributeFidelityJudgeWorkflow(adapter=adapter),
+ adapter=adapter,
+ )
+ result = runner.evaluate(
+ saved_trace,
+ replace_method=Substitute(),
+ model_configs=stub_model_configs,
+ selected_models=stub_evaluate_model_selection,
+ )
+
+ # Row count is preserved end-to-end.
+ assert len(result.dataframe) == 2
+ # First row got a real verdict.
+ assert bool(result.dataframe[COL_DETECTION_VALID].iloc[0]) is True
+ # Second row (LLM-dropped) is surfaced as Unavailable, not dropped.
+ assert result.dataframe[COL_DETECTION_VALID].iloc[1] is None
+ assert result.dataframe[COL_TYPE_FIDELITY_VALID].iloc[1] is None
+ assert result.dataframe[COL_RELATIONAL_CONSISTENCY_VALID].iloc[1] is None
+ assert result.dataframe[COL_ATTRIBUTE_FIDELITY_VALID].iloc[1] is None
+ # The drop is still visible via failed_records for downstream observability.
+ assert len(result.failed_records) == 1
+ assert result.failed_records[0].record_id == "id-1"
+
+
+def test_runner_does_not_invoke_judges(
+ stub_model_configs: list[ModelConfig],
+ stub_replace_model_selection: ReplaceModelSelection,
+ stub_entities: list[dict],
+) -> None:
+ """``ReplacementWorkflow.run()`` only does the replace step — never the judges.
+
+ The judges live behind a separate ``evaluate()`` call.
+ """
+ llm_workflow = Mock()
+ llm_workflow.generate_map_only.return_value = LlmReplaceResult(
+ dataframe=pd.DataFrame(
+ {
+ COL_TEXT: ["Alice works at Acme"],
+ COL_FINAL_ENTITIES: [{"entities": stub_entities}],
+ COL_REPLACEMENT_MAP: [
+ {
+ "replacements": [
+ {"original": "Alice", "label": "first_name", "synthetic": "Maya"},
+ {"original": "Acme", "label": "organization", "synthetic": "NovaCorp"},
+ ]
+ }
+ ],
+ }
+ ),
+ failed_records=[],
+ )
+ detection_judge = Mock()
+ type_fidelity_judge = Mock()
+ relational_judge = Mock()
+ attribute_judge = Mock()
+ adapter = Mock()
+ runner = ReplacementWorkflow(
+ llm_workflow=llm_workflow,
+ detection_judge=detection_judge,
+ type_fidelity_judge=type_fidelity_judge,
+ relational_consistency_judge=relational_judge,
+ attribute_fidelity_judge=attribute_judge,
+ adapter=adapter,
+ )
+
+ result = runner.run(
+ pd.DataFrame({COL_TEXT: ["Alice works at Acme"], COL_FINAL_ENTITIES: [{"entities": []}]}),
+ replace_method=Substitute(),
+ model_configs=stub_model_configs,
+ selected_models=stub_replace_model_selection,
+ )
+
+ detection_judge.evaluate.assert_not_called()
+ type_fidelity_judge.evaluate.assert_not_called()
+ relational_judge.evaluate.assert_not_called()
+ attribute_judge.evaluate.assert_not_called()
+ adapter.run_workflow.assert_not_called()
+ for col in (
+ COL_DETECTION_VALID,
+ COL_TYPE_FIDELITY_VALID,
+ COL_ATTRIBUTE_FIDELITY_VALID,
+ COL_RELATIONAL_CONSISTENCY_VALID,
+ ):
+ assert col not in result.dataframe.columns
+ assert result.dataframe[COL_REPLACED_TEXT].iloc[0] == "Maya works at NovaCorp"
+
+
+def test_evaluate_raises_on_missing_required_columns(
+ stub_model_configs: list[ModelConfig],
+ stub_evaluate_model_selection: EvaluateModelSelection,
+) -> None:
+ """``evaluate()`` rejects dataframes lacking the columns the judges need,
+ with a message that hints at the trace_dataframe workflow."""
+ runner = ReplacementWorkflow(
+ detection_judge=DetectionJudgeWorkflow(adapter=Mock()),
+ type_fidelity_judge=TypeFidelityJudgeWorkflow(adapter=Mock()),
+ relational_consistency_judge=RelationalConsistencyJudgeWorkflow(adapter=Mock()),
+ attribute_fidelity_judge=AttributeFidelityJudgeWorkflow(adapter=Mock()),
+ adapter=Mock(),
+ )
+ bare_df = pd.DataFrame({COL_TEXT: ["Alice"]}) # missing _entities_by_value and _replacement_map
+ with pytest.raises(ValueError, match="trace_dataframe"):
+ runner.evaluate(
+ bare_df,
+ replace_method=Substitute(),
+ model_configs=stub_model_configs,
+ selected_models=stub_evaluate_model_selection,
+ )
+
+
def test_apply_replacement_map_handles_string_map() -> None:
dataframe = pd.DataFrame(
{
diff --git a/tests/interface/test_anonymizer_interface.py b/tests/interface/test_anonymizer_interface.py
index 3428b667..f892285c 100644
--- a/tests/interface/test_anonymizer_interface.py
+++ b/tests/interface/test_anonymizer_interface.py
@@ -4,6 +4,7 @@
from __future__ import annotations
from pathlib import Path
+from types import SimpleNamespace
from unittest.mock import Mock
import pandas as pd
@@ -428,7 +429,15 @@ def test_validate_config_raises_on_unknown_replace_alias_for_substitute(
anonymizer._model_configs = stub_known_model_configs
anonymizer._selected_models = stub_slim_model_selection
anonymizer._selected_models = anonymizer._selected_models.model_copy(
- update={"replace": ReplaceModelSelection(replacement_generator="bad-replace-alias")}
+ update={
+ "replace": ReplaceModelSelection(
+ replacement_generator="bad-replace-alias",
+ detection_judge="known",
+ type_fidelity_judge="known",
+ relational_consistency_judge="known",
+ attribute_fidelity_judge="known",
+ )
+ }
)
with pytest.raises(InvalidConfigError, match="bad-replace-alias"):
@@ -444,7 +453,15 @@ def test_validate_config_skips_replace_alias_for_non_substitute(
anonymizer._model_configs = stub_known_model_configs
anonymizer._selected_models = stub_slim_model_selection
anonymizer._selected_models = anonymizer._selected_models.model_copy(
- update={"replace": ReplaceModelSelection(replacement_generator="bad-replace-alias")}
+ update={
+ "replace": ReplaceModelSelection(
+ replacement_generator="bad-replace-alias",
+ detection_judge="known",
+ type_fidelity_judge="known",
+ relational_consistency_judge="known",
+ attribute_fidelity_judge="known",
+ )
+ }
)
anonymizer.validate_config(stub_anonymizer_config)
@@ -519,7 +536,15 @@ def test_run_raises_invalid_config_before_workflows(
anonymizer._model_configs = stub_known_model_configs
anonymizer._selected_models = stub_slim_model_selection
anonymizer._selected_models = anonymizer._selected_models.model_copy(
- update={"replace": ReplaceModelSelection(replacement_generator="bad-replace-alias")}
+ update={
+ "replace": ReplaceModelSelection(
+ replacement_generator="bad-replace-alias",
+ detection_judge="known",
+ type_fidelity_judge="known",
+ relational_consistency_judge="known",
+ attribute_fidelity_judge="known",
+ )
+ }
)
with pytest.raises(InvalidConfigError, match="bad-replace-alias"):
@@ -653,8 +678,30 @@ def test_validate_config_raises_on_unknown_replace_alias_in_rewrite_mode(
anonymizer._model_configs = stub_known_model_configs
anonymizer._selected_models = stub_slim_model_selection
anonymizer._selected_models = anonymizer._selected_models.model_copy(
- update={"replace": ReplaceModelSelection(replacement_generator="bad-replace-alias")}
+ update={
+ "replace": ReplaceModelSelection(
+ replacement_generator="bad-replace-alias",
+ detection_judge="known",
+ type_fidelity_judge="known",
+ relational_consistency_judge="known",
+ attribute_fidelity_judge="known",
+ )
+ }
)
with pytest.raises(InvalidConfigError, match="bad-replace-alias"):
anonymizer.validate_config(AnonymizerConfig(rewrite=Rewrite()))
+
+
+def test_evaluate_raises_value_error_on_legacy_result_without_replace_method() -> None:
+ """A pickled result from before `replace_method` existed should surface the
+ actionable ValueError, not an AttributeError from the missing attribute."""
+ anonymizer, _, _, _ = _make_anonymizer()
+ legacy_result = SimpleNamespace(
+ dataframe=pd.DataFrame(),
+ trace_dataframe=pd.DataFrame(),
+ resolved_text_column="text",
+ )
+
+ with pytest.raises(ValueError, match="replace_method"):
+ anonymizer.evaluate(legacy_result) # type: ignore[arg-type]
diff --git a/tests/interface/test_display.py b/tests/interface/test_display.py
index 8578763c..fddc6028 100644
--- a/tests/interface/test_display.py
+++ b/tests/interface/test_display.py
@@ -23,6 +23,7 @@
_build_replaced_entities,
_normalize_replacement_map,
_render_highlighted_text,
+ _verdict_badge,
render_record_html,
)
from anonymizer.interface.results import PreviewResult
@@ -148,6 +149,33 @@ def test_normalize_replacement_map_non_dict_returns_empty() -> None:
assert _normalize_replacement_map([1, 2, 3]) == []
+def test_verdict_badge_satisfied_when_all_correct_and_valid_true() -> None:
+ badge, rate = _verdict_badge(valid=True, correct=10, total=10)
+ assert "Satisfied" in badge and "Not" not in badge
+ assert "10/10" in rate
+
+
+def test_verdict_badge_partial_for_mixed_count() -> None:
+ badge, _ = _verdict_badge(valid=False, correct=8, total=10)
+ assert "Partially Satisfied" in badge
+
+
+def test_verdict_badge_unavailable_when_valid_none() -> None:
+ badge, rate = _verdict_badge(valid=None, correct=0, total=0)
+ assert "Unavailable" in badge
+ assert rate == ""
+
+
+def test_verdict_badge_not_satisfied_when_valid_false_without_enumerated_failures() -> None:
+ """``valid is False`` with ``correct == total`` is an inconsistent LLM response
+ (the judge said it's invalid but didn't list specifics). The explicit boolean
+ must override the count so we don't render a misleading green badge."""
+ badge, rate = _verdict_badge(valid=False, correct=10, total=10)
+ assert "Not Satisfied" in badge
+ assert "Satisfied" not in badge.replace("Not Satisfied", "")
+ assert "10/10" in rate
+
+
@pytest.mark.parametrize(
"payload_kind",
["dict_wrapper", "numpy_wrapped_dict_wrapper", "entities_schema"],
@@ -227,6 +255,23 @@ def test_render_record_html_without_replacement_map() -> None:
assert "No replacement map available" in result
+def test_render_record_html_omits_detection_judge_section_when_judge_did_not_run() -> None:
+ """A preview/run without evaluation must not render an empty 'Detection Judge'
+ heading. The wrapper lives inside ``_render_detection_judge_section`` so the
+ whole block is omitted when ``COL_DETECTION_VALID`` is absent."""
+ row = pd.Series(
+ {
+ "text": "Alice works here",
+ "text_replaced": "Bob works here",
+ COL_DETECTED_ENTITIES: {"entities": []},
+ COL_REPLACEMENT_MAP: {},
+ }
+ )
+ result = render_record_html(row)
+ assert "Detection Judge" not in result
+ assert "Detection Validity" not in result
+
+
def _make_preview(rows: int = 2) -> PreviewResult:
df = pd.DataFrame(
{