Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion docs/source/redact/generator_metadata.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,10 @@ When you use ``generator_config`` to set an entity type to ``Synthesis``, Textua
Common parameters
-----------------

All metadata classes inherit from ``BaseMetadata`` and share the following parameter:
All metadata classes inherit from ``BaseMetadata`` and share the following parameters:

* ``swaps`` (dict of str to str, default ``{}``) -- A dictionary of explicit replacement mappings. When a detected value matches a key, the corresponding value is used as the synthesized replacement instead of a generated one.
* ``constant_value`` (str | None, default ``None``) -- A fixed string to use as the replacement for every detected value. Entries in ``swaps`` take precedence: the constant value is used only when the detected value does not match a key in ``swaps``.

.. code-block:: python

Expand All @@ -45,6 +46,12 @@ All metadata classes inherit from ``BaseMetadata`` and share the following param
# Always replace "Acme" with "Globex" instead of generating a random name
metadata = NameGeneratorMetadata(swaps={"Acme": "Globex"})

# Always replace names with "Alice"
metadata = NameGeneratorMetadata(constant_value="Alice")

# Replace all names with "Bob", except for "Alice", which is replaced with "Mary"
metadata = NameGeneratorMetadata(constant_value="Bob", swaps={"Alice": "Mary"})


Name synthesis
--------------
Expand Down Expand Up @@ -125,6 +132,7 @@ Date and time synthesis
* ``scramble_unrecognized_dates`` (bool, default ``True``) -- When ``True``, dates that Textual cannot parse into a standard format are scrambled.
* ``additional_date_formats`` (list of str, default ``[]``) -- Additional date format patterns that Textual should recognize. Uses Python ``strftime``/``strptime`` format codes.
* ``apply_constant_shift_to_document`` (bool, default ``False``) -- When ``True``, all dates within the same document are shifted by the same random offset. This preserves the relative time differences between dates.
* ``use_clear_date_and_passthrough_or_group_year_generator`` (bool, default ``False``) -- When ``True``, sets the month and day to January 1st. If the year is less than 90 years ago, the year is passed through unchanged; otherwise, the year is set to the current year minus 90.
* ``metadata`` (:class:`~tonic_textual.classes.generator_metadata.timestamp_shift_metadata.TimestampShiftMetadata`) -- Controls the date shift range. By default, dates shift by -7 to +7 days.

TimestampShiftMetadata
Expand Down Expand Up @@ -163,6 +171,7 @@ Person age synthesis
:class:`~tonic_textual.classes.generator_metadata.person_age_generator_metadata.PersonAgeGeneratorMetadata` controls how synthesized ages are generated. Use it with the ``PERSON_AGE`` entity type.

* ``scramble_unrecognized_dates`` (bool, default ``True``) -- When ``True``, dates that Textual cannot parse are scrambled.
* ``use_passthrough_or_group_age_generator`` (bool, default ``False``) -- When ``True``, ages 89 and under are passed through unchanged, and all other ages are replaced with ``"90+"``.
* ``metadata`` (:class:`~tonic_textual.classes.generator_metadata.age_shift_metadata.AgeShiftMetadata`) -- Controls the age shift amount. By default, ages shift by 7 years.

AgeShiftMetadata
Expand Down Expand Up @@ -198,6 +207,8 @@ Address synthesis (HIPAA)
* ``use_non_hipaa_address_generator`` (bool, default ``False``) -- When ``True``, uses a non-HIPAA-compliant address generator that might produce more realistic addresses, but does not guarantee HIPAA Safe Harbor compliance.
* ``replace_truncated_zeros_in_zip_code`` (bool, default ``True``) -- When ``True``, for ZIP codes that are truncated to three digits (per HIPAA Safe Harbor), the removed digits are replaced with zeros.
* ``realistic_synthetic_values`` (bool, default ``True``) -- When ``True``, generates realistic-looking synthetic address values.
* ``use_three_digit_zips`` (bool, default ``False``) -- When ``True``, ZIP codes are always truncated to three digits.
* ``replace_foreign_zip_codes_with_zeros`` (bool, default ``False``) -- When ``True``, foreign ZIP codes are replaced with all zeros.

.. code-block:: python

Expand Down
37 changes: 35 additions & 2 deletions tests/tests/metadata_tests/test_base_metadata_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ def test_json_dumps_works_directly(self):
metadata = BaseMetadata(
custom_generator=GeneratorType.Name,
generator_version=GeneratorVersion.V2,
swaps={"foo": "bar"}
swaps={"foo": "bar"},
constant_value="REDACTED"
)
json_str = json.dumps(metadata)

Expand All @@ -19,6 +20,7 @@ def test_json_dumps_works_directly(self):
assert parsed["customGenerator"] == "Name"
assert parsed["generatorVersion"] == "V2"
assert parsed["swaps"] == {"foo": "bar"}
assert parsed["constantValue"] == "REDACTED"

def test_json_includes_type_field(self):
"""Serialized JSON should include _type for deserialization."""
Expand All @@ -38,13 +40,15 @@ def test_json_roundtrip_with_defaults(self):
assert restored.custom_generator == original.custom_generator
assert restored.generator_version == original.generator_version
assert restored.swaps == original.swaps
assert restored.constant_value is None

def test_json_roundtrip_with_custom_values(self):
"""Round-trip serialization preserves custom values."""
original = BaseMetadata(
custom_generator=GeneratorType.DateTime,
generator_version=GeneratorVersion.V2,
swaps={"original": "replaced"}
swaps={"original": "replaced"},
constant_value="STATIC"
)
json_str = json.dumps(original)
parsed = json.loads(json_str)
Expand All @@ -53,6 +57,7 @@ def test_json_roundtrip_with_custom_values(self):
assert restored.custom_generator == original.custom_generator
assert restored.generator_version == original.generator_version
assert restored.swaps == original.swaps
assert restored.constant_value == "STATIC"

def test_attribute_access_works(self):
"""Property-based attribute access should work."""
Expand All @@ -72,13 +77,16 @@ def test_attribute_setter_works(self):
metadata.custom_generator = GeneratorType.Email
metadata.generator_version = GeneratorVersion.V2
metadata.swaps = {"x": "y"}
metadata.constant_value = "FIXED"

assert metadata.custom_generator == GeneratorType.Email
assert metadata["customGenerator"] == GeneratorType.Email
assert metadata.generator_version == GeneratorVersion.V2
assert metadata["generatorVersion"] == GeneratorVersion.V2
assert metadata.swaps == {"x": "y"}
assert metadata["swaps"] == {"x": "y"}
assert metadata.constant_value == "FIXED"
assert metadata["constantValue"] == "FIXED"

def test_dict_access_works(self):
"""Direct dict access should work."""
Expand Down Expand Up @@ -109,3 +117,28 @@ def test_none_custom_generator_serializes_correctly(self):
parsed = json.loads(json_str)

assert parsed["customGenerator"] is None

def test_constant_value_default_is_none(self):
    """A BaseMetadata built with no arguments has no constant value set."""
    default_metadata = BaseMetadata()

    # Check both the attribute view and the underlying camelCase dict key.
    assert default_metadata.constant_value is None
    assert default_metadata["constantValue"] is None

def test_constant_value_serializes_correctly(self):
    """A non-None constant_value survives dumps -> loads -> from_payload."""
    payload = json.loads(json.dumps(BaseMetadata(constant_value="[REDACTED]")))
    rebuilt = BaseMetadata.from_payload(payload)

    # The wire format uses the camelCase key; the restored object exposes
    # the same value through the snake_case attribute.
    assert payload["constantValue"] == "[REDACTED]"
    assert rebuilt.constant_value == "[REDACTED]"

def test_constant_value_none_serializes_as_null(self):
    """An explicitly passed None constant_value is emitted as JSON null."""
    encoded = json.dumps(BaseMetadata(constant_value=None))
    decoded = json.loads(encoded)

    # JSON null decodes back to Python None under the camelCase key.
    assert decoded["constantValue"] is None
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ def test_json_dumps_works_directly(self):
additional_date_formats=["yyyy-MM-dd"],
apply_constant_shift_to_document=True,
metadata=ts_metadata,
swaps={"date1": "date2"}
swaps={"date1": "date2"},
use_clear_date_and_passthrough_or_group_year_generator=True
)
json_str = json.dumps(metadata)

Expand All @@ -29,6 +30,7 @@ def test_json_dumps_works_directly(self):
assert parsed["applyConstantShiftToDocument"] is True
assert parsed["metadata"]["leftShiftInDays"] == -30
assert parsed["metadata"]["rightShiftInDays"] == 30
assert parsed["useClearDateAndPassthroughOrGroupYearGenerator"] is True

def test_json_includes_type_field(self):
"""Serialized JSON should include _type for deserialization."""
Expand All @@ -52,6 +54,7 @@ def test_json_roundtrip_with_defaults(self):
assert restored.apply_constant_shift_to_document == original.apply_constant_shift_to_document
assert restored.metadata.left_shift_in_days == original.metadata.left_shift_in_days
assert restored.metadata.right_shift_in_days == original.metadata.right_shift_in_days
assert restored.use_clear_date_and_passthrough_or_group_year_generator is False

def test_json_roundtrip_with_custom_values(self):
"""Round-trip serialization preserves custom values."""
Expand All @@ -66,7 +69,8 @@ def test_json_roundtrip_with_custom_values(self):
additional_date_formats=["format1", "format2"],
apply_constant_shift_to_document=True,
metadata=ts_metadata,
swaps={"outer": "swap"}
swaps={"outer": "swap"},
use_clear_date_and_passthrough_or_group_year_generator=True
)
json_str = json.dumps(original)
parsed = json.loads(json_str)
Expand All @@ -81,6 +85,7 @@ def test_json_roundtrip_with_custom_values(self):
assert restored.metadata.right_shift_in_days == 100
assert restored.metadata.swaps == {"ts_key": "ts_val"}
assert restored.swaps == {"outer": "swap"}
assert restored.use_clear_date_and_passthrough_or_group_year_generator is True

def test_attribute_access_works(self):
"""Property-based attribute access should work."""
Expand All @@ -98,18 +103,22 @@ def test_attribute_setter_works(self):
metadata = DateTimeGeneratorMetadata()
metadata.additional_date_formats = ["new-format"]
metadata.apply_constant_shift_to_document = True
metadata.use_clear_date_and_passthrough_or_group_year_generator = True

assert metadata.additional_date_formats == ["new-format"]
assert metadata["additionalDateFormats"] == ["new-format"]
assert metadata.apply_constant_shift_to_document is True
assert metadata["applyConstantShiftToDocument"] is True
assert metadata.use_clear_date_and_passthrough_or_group_year_generator is True
assert metadata["useClearDateAndPassthroughOrGroupYearGenerator"] is True

def test_dict_access_works(self):
"""Direct dict access should work."""
metadata = DateTimeGeneratorMetadata(additional_date_formats=["test"])

assert metadata["additionalDateFormats"] == ["test"]
assert metadata["_type"] == "DateTimeGeneratorMetadata"
assert "useClearDateAndPassthroughOrGroupYearGenerator" in metadata

def test_to_payload_returns_dict_copy(self):
"""to_payload() should return a dict copy of the metadata."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ def test_json_dumps_works_directly(self):
use_non_hipaa_address_generator=True,
replace_truncated_zeros_in_zip_code=False,
realistic_synthetic_values=False,
swaps={"Atlanta": "Boston"}
swaps={"Atlanta": "Boston"},
use_three_digit_zips=True,
replace_foreign_zip_codes_with_zeros=True
)
json_str = json.dumps(metadata)

Expand All @@ -25,6 +27,8 @@ def test_json_dumps_works_directly(self):
assert parsed["replaceTruncatedZerosInZipCode"] is False
assert parsed["realisticSyntheticValues"] is False
assert parsed["swaps"] == {"Atlanta": "Boston"}
assert parsed["useThreeDigitZips"] is True
assert parsed["replaceForeignZipCodesWithZeros"] is True

def test_json_includes_type_field(self):
"""Serialized JSON should include _type for deserialization."""
Expand All @@ -47,6 +51,8 @@ def test_json_roundtrip_with_defaults(self):
assert restored.replace_truncated_zeros_in_zip_code == original.replace_truncated_zeros_in_zip_code
assert restored.realistic_synthetic_values == original.realistic_synthetic_values
assert restored.swaps == original.swaps
assert restored.use_three_digit_zips is False
assert restored.replace_foreign_zip_codes_with_zeros is False

def test_json_roundtrip_with_custom_values(self):
"""Round-trip serialization preserves custom values."""
Expand All @@ -55,7 +61,9 @@ def test_json_roundtrip_with_custom_values(self):
use_non_hipaa_address_generator=True,
replace_truncated_zeros_in_zip_code=False,
realistic_synthetic_values=False,
swaps={"city1": "city2"}
swaps={"city1": "city2"},
use_three_digit_zips=True,
replace_foreign_zip_codes_with_zeros=True
)
json_str = json.dumps(original)
parsed = json.loads(json_str)
Expand All @@ -67,6 +75,8 @@ def test_json_roundtrip_with_custom_values(self):
assert restored.replace_truncated_zeros_in_zip_code is False
assert restored.realistic_synthetic_values is False
assert restored.swaps == {"city1": "city2"}
assert restored.use_three_digit_zips is True
assert restored.replace_foreign_zip_codes_with_zeros is True

def test_attribute_access_works(self):
"""Property-based attribute access should work."""
Expand All @@ -85,20 +95,28 @@ def test_attribute_setter_works(self):
metadata.use_non_hipaa_address_generator = True
metadata.replace_truncated_zeros_in_zip_code = False
metadata.realistic_synthetic_values = False
metadata.use_three_digit_zips = True
metadata.replace_foreign_zip_codes_with_zeros = True

assert metadata.use_non_hipaa_address_generator is True
assert metadata["useNonHipaaAddressGenerator"] is True
assert metadata.replace_truncated_zeros_in_zip_code is False
assert metadata["replaceTruncatedZerosInZipCode"] is False
assert metadata.realistic_synthetic_values is False
assert metadata["realisticSyntheticValues"] is False
assert metadata.use_three_digit_zips is True
assert metadata["useThreeDigitZips"] is True
assert metadata.replace_foreign_zip_codes_with_zeros is True
assert metadata["replaceForeignZipCodesWithZeros"] is True

def test_dict_access_works(self):
"""Direct dict access should work."""
metadata = HipaaAddressGeneratorMetadata(use_non_hipaa_address_generator=True)

assert metadata["useNonHipaaAddressGenerator"] is True
assert metadata["_type"] == "HipaaAddressGeneratorMetadata"
assert "useThreeDigitZips" in metadata
assert "replaceForeignZipCodesWithZeros" in metadata

def test_to_payload_returns_dict_copy(self):
"""to_payload() should return a dict copy of the metadata."""
Expand Down
Loading
Loading