Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
6466a97
wip
gsheni Jan 22, 2026
72276db
wip on test
gsheni Jan 23, 2026
2d56969
cleanup
gsheni Jan 23, 2026
e4f0369
wip
gsheni Jan 23, 2026
c87b595
wip
gsheni Jan 23, 2026
e6576b0
wip
gsheni Jan 23, 2026
684eace
cleanup
gsheni Jan 23, 2026
af14a5d
add additional test
gsheni Jan 26, 2026
2af75cc
Merge branch 'main' of github.com:sdv-dev/SDV into issue-2779-allow-a…
gsheni Jan 26, 2026
aa8f97e
cleanup
gsheni Jan 26, 2026
2354dbb
Add metadata func
gsheni Jan 26, 2026
8c1a74f
fix space"
gsheni Jan 26, 2026
4b851c7
cleanup
gsheni Jan 26, 2026
baeb75e
docstrings
gsheni Jan 26, 2026
18a2a31
fix test
gsheni Jan 26, 2026
e708398
Merge branch 'main' of github.com:sdv-dev/SDV into issue-2779-allow-a…
gsheni Jan 26, 2026
d12eb14
change approach
gsheni Jan 28, 2026
b8bf924
cleanup
gsheni Jan 28, 2026
955a2e3
add unit tests and cleanup
gsheni Jan 28, 2026
1895c09
cleanup
gsheni Jan 28, 2026
2890e78
fix flaky test
gsheni Jan 28, 2026
7edba32
lint
gsheni Jan 28, 2026
50fbee3
comment out flaky test
gsheni Jan 29, 2026
ed37ed3
lint
gsheni Jan 29, 2026
ca4445a
fix lint
gsheni Jan 29, 2026
8619429
fix lint
gsheni Jan 29, 2026
713de4f
address feedback
gsheni Jan 30, 2026
b043099
fix metadata
gsheni Jan 30, 2026
317746d
Update test_hma.py
gsheni Jan 31, 2026
51fcdc8
Update test_multi_table.py
gsheni Jan 31, 2026
f91f373
add int tests with multiple FKs
gsheni Feb 2, 2026
938029c
rename
gsheni Feb 2, 2026
848b708
lint
gsheni Feb 2, 2026
6f15e19
wip
gsheni Feb 2, 2026
ae72623
cleanup
gsheni Feb 3, 2026
68e9565
use existing logic
gsheni Feb 3, 2026
4827822
Update test_hma.py
gsheni Feb 3, 2026
0e85d69
wip
gsheni Jan 30, 2026
4c958c9
add unit test
gsheni Jan 30, 2026
216b521
cleanup
gsheni Jan 30, 2026
8c2b171
Add missing test
gsheni Feb 3, 2026
e9a49de
fix conflicts
gsheni Feb 4, 2026
3a1da09
fix failing tests
gsheni Feb 4, 2026
4ebdc43
add continue
gsheni Feb 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions sdv/metadata/multi_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,18 +541,27 @@ def _detect_foreign_keys_by_column_name(self, data):
Dictionary of table names to dataframes.
NOTE: this is only used in SDV-Enterprise.
"""
for parent_candidate in self.tables.keys():
primary_key = self.tables[parent_candidate].primary_key
for child_candidate in self.tables.keys() - {parent_candidate}:
sorted_tables = sorted(self.tables.keys())
for parent_candidate in sorted_tables:
parent_meta = self.tables[parent_candidate]
primary_key = parent_meta.primary_key
if primary_key is None:
continue
for child_candidate in sorted_tables:
if child_candidate == parent_candidate:
continue
child_meta = self.tables[child_candidate]
if primary_key in child_meta.columns.keys():
if primary_key in child_meta.columns:
try:
original_foreign_key_sdtype = child_meta.columns[primary_key]['sdtype']
original_sdinfo = child_meta.columns[primary_key]
original_foreign_key_sdtype = original_sdinfo['sdtype']

if original_foreign_key_sdtype != 'id':
self.update_column(
table_name=child_candidate, column_name=primary_key, sdtype='id'
table_name=child_candidate,
column_name=primary_key,
sdtype='id',
)

self.add_relationship(
parent_candidate, child_candidate, primary_key, primary_key
)
Expand Down
24 changes: 12 additions & 12 deletions tests/integration/metadata/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,36 +10,36 @@ def primary_key_to_primary_key():
'tables': {
'tableA': {
'columns': {
'table_A_primary_key': {'sdtype': 'id'},
'column_1': {'sdtype': 'categorical'},
'table_id': {'sdtype': 'id'},
'col1': {'sdtype': 'categorical'},
},
'primary_key': 'table_A_primary_key',
'primary_key': 'table_id',
},
'tableB': {
'columns': {
'table_B_primary_key': {'sdtype': 'id'},
'column_2': {'sdtype': 'categorical'},
'table_id': {'sdtype': 'id'},
'col2': {'sdtype': 'categorical'},
},
'primary_key': 'table_B_primary_key',
'primary_key': 'table_id',
},
},
'relationships': [
{
'parent_table_name': 'tableA',
'parent_primary_key': 'table_A_primary_key',
'parent_primary_key': 'table_id',
'child_table_name': 'tableB',
'child_foreign_key': 'table_B_primary_key',
'child_foreign_key': 'table_id',
}
],
})
data = {
'tableA': pd.DataFrame({
'table_A_primary_key': range(5),
'column_1': ['A', 'B', 'B', 'C', 'C'],
'table_id': range(5),
'col1': ['A', 'B', 'B', 'C', 'C'],
}),
'tableB': pd.DataFrame({
'table_B_primary_key': range(5),
'column_2': ['A', 'B', 'B', 'C', 'C'],
'table_id': range(5),
'col2': ['A', 'B', 'B', 'C', 'C'],
}),
}
return data, metadata
81 changes: 62 additions & 19 deletions tests/integration/metadata/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
from copy import deepcopy

import numpy as np
import pandas as pd
import pytest

Expand Down Expand Up @@ -879,6 +880,44 @@ def test_detect_from_dataframes_invalid_format():
Metadata.detect_from_dataframes(data)


def test_no_duplicated_foreign_key_relationships_are_generated():
# Setup
parent_a = pd.DataFrame({
'id': ['id-' + str(i) for i in range(100)],
'col1': [round(i, 2) for i in np.random.uniform(low=0, high=10, size=100)],
})
parent_b = pd.DataFrame({
'id': ['id-' + str(i) for i in range(100)],
'col2': [round(i, 2) for i in np.random.uniform(low=0, high=10, size=100)],
})

child_c = pd.DataFrame({
'id': ['id-' + str(i) for i in np.random.randint(0, 100, size=1000)],
'col3': [round(i, 2) for i in np.random.uniform(low=0, high=10, size=1000)],
})

data = {'parent_a': parent_a, 'parent_b': parent_b, 'child_c': child_c}

# Run
metadata = Metadata.detect_from_dataframes(data)

# Assert
assert metadata.relationships == [
{
'parent_table_name': 'parent_a',
'child_table_name': 'child_c',
'parent_primary_key': 'id',
'child_foreign_key': 'id',
},
{
'parent_table_name': 'parent_a',
'child_table_name': 'parent_b',
'parent_primary_key': 'id',
'child_foreign_key': 'id',
},
]


def test_validate_metadata_with_reused_foreign_keys():
# Setup
metadata_dict = {
Expand Down Expand Up @@ -1373,44 +1412,48 @@ def test_validate_empty_metadata():
synthesizer.fit(pd.DataFrame())


def test_validate_pk_to_pk(primary_key_to_primary_key):
"""Test validation to indicate a PK to PK relationship."""
def test_validate_primary_key_to_primary_key(primary_key_to_primary_key):
"""Test validate methods with primary key to primary key dataset."""
# Setup
data, metadata_instance = primary_key_to_primary_key
data, metadata = primary_key_to_primary_key

# Run and Assert
metadata.validate()
metadata.validate_data(data)


def test_primary_key_to_primary_key(primary_key_to_primary_key):
"""Test metadata can auto-detect a primary key which is also a foreign key."""
# Setup
data, _ = primary_key_to_primary_key

# Run
metadata_instance.validate()
metadata_instance.validate_data(data)
metadata = Metadata.detect_from_dataframes(data)

# Assert
expected_metadata = {
metadata.validate()
metadata.validate_data(data)
assert metadata.to_dict() == {
'tables': {
'tableA': {
'columns': {
'table_A_primary_key': {'sdtype': 'id'},
'column_1': {'sdtype': 'categorical'},
},
'primary_key': 'table_A_primary_key',
'columns': {'table_id': {'sdtype': 'id'}, 'col1': {'sdtype': 'categorical'}},
'primary_key': 'table_id',
},
'tableB': {
'columns': {
'table_B_primary_key': {'sdtype': 'id'},
'column_2': {'sdtype': 'categorical'},
},
'primary_key': 'table_B_primary_key',
'columns': {'table_id': {'sdtype': 'id'}, 'col2': {'sdtype': 'categorical'}},
'primary_key': 'table_id',
},
},
'relationships': [
{
'parent_table_name': 'tableA',
'parent_primary_key': 'table_A_primary_key',
'child_table_name': 'tableB',
'child_foreign_key': 'table_B_primary_key',
'parent_primary_key': 'table_id',
'child_foreign_key': 'table_id',
}
],
'METADATA_SPEC_VERSION': 'V1',
}
assert metadata_instance.to_dict() == expected_metadata


def test_validate_pk_to_pk_email():
Expand Down
69 changes: 69 additions & 0 deletions tests/unit/metadata/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,75 @@ def test_detect_from_dataframe_bad_input_infer_keys(self):
with pytest.raises(ValueError, match=expected_message):
Metadata.detect_from_dataframe(data, infer_keys=infer_keys)

def test_detect_from_dataframe_primary_key_to_primary_key(self):
"""Test primary to primary key relationship is detected if column name match."""
# Setup
data = {
'table1': pd.DataFrame({
'id': [1, 2, 3],
}),
'table2': pd.DataFrame({
'id': [1, 2, 3],
}),
}
instance = Metadata()
instance.detect_table_from_dataframe('table1', data['table1'])
instance.detect_table_from_dataframe('table2', data['table2'])

# Run
instance._detect_foreign_keys_by_column_name(data)

# Assert
assert instance.to_dict()['relationships'] == [
{
'parent_table_name': 'table1',
'child_table_name': 'table2',
'parent_primary_key': 'id',
'child_foreign_key': 'id',
}
]

def test_detect_foreign_keys_sorting(self):
# Setup
mock_metadata_1 = Mock()
mock_metadata_1.primary_key = 'id'
mock_metadata_1.columns = {
'id': {'sdtype': 'id'},
'col1': {'sdtype': 'categorical'},
}
mock_metadata_2 = Mock()
mock_metadata_2.primary_key = 'id'
mock_metadata_2.columns = {
'id': {'sdtype': 'id'},
'col2': {'sdtype': 'numerical'},
}
data = {
'table1': pd.DataFrame({
'id': [1, 2, 3],
'col1': ['a', 'b', 'c'],
}),
'table2': pd.DataFrame({
'id': [1, 2, 3],
'col2': [1.1, 2.2, 3.3],
}),
}
instance = Metadata()
instance.tables = {
'table1': mock_metadata_1,
'table2': mock_metadata_2,
}
instance.add_relationship = Mock()

# Run
instance._detect_foreign_keys_by_column_name(data=data)

# Assert
expected_calls = [
call('table1', 'table2', 'id', 'id'),
call('table2', 'table1', 'id', 'id'),
]
instance.add_relationship.assert_has_calls(expected_calls, any_order=False)

def test__handle_table_name(self):
"""Test the ``_handle_table_name`` method."""
# Setup
Expand Down
Loading