diff --git a/changelog.txt b/changelog.txt index d419bfa..7ae0d25 100644 --- a/changelog.txt +++ b/changelog.txt @@ -1,3 +1,7 @@ +# 0.0.3 +* Fixed crash when a boolean column contains only one unique value (e.g., all True) and the table is rendered with the native backend. +* Fixed object columns containing decimal.Decimal values (e.g., from Amazon Redshift) being incorrectly treated as categorical instead of numerical. + # 0.0.2 * Introduced narwhals as dependency, adds support for pandas 3, polars, pyarrow diff --git a/pysummaries/__init__.py b/pysummaries/__init__.py index 6cf64b0..617889a 100644 --- a/pysummaries/__init__.py +++ b/pysummaries/__init__.py @@ -30,4 +30,4 @@ 'pandas_to_report_html', 'get_styles', 'Pandas2HTMLSummaryTable', 'PySummariesException'] -__version__ = '0.0.2' +__version__ = '0.0.3' diff --git a/pysummaries/reportable/intermediate_representation.py b/pysummaries/reportable/intermediate_representation.py index 87452e7..339adc6 100644 --- a/pysummaries/reportable/intermediate_representation.py +++ b/pysummaries/reportable/intermediate_representation.py @@ -180,6 +180,8 @@ def extract_multibodyblocks(indexes, df, value_styles, styles, tabid): #row labels *rowgroup_labels, currow_label = rowlabel if (rowgroup_labels != lastblock_index and lastblock_index is not None) or cnt==len(df): + if lastblock_index is None: + lastblock_index = rowgroup_labels rowgroupstyle = styles.get("rowgrouplabel") titles = [RowGroupLabel(x, tabid, level, y, rowgroup_cnts, style=rowgroupstyle) for level,(x,y) in enumerate(zip(lastblock_index, labels_paddings))] # prune diff --git a/pysummaries/table_summary/table_summary.py b/pysummaries/table_summary/table_summary.py index e2452f6..3bfa9d2 100644 --- a/pysummaries/table_summary/table_summary.py +++ b/pysummaries/table_summary/table_summary.py @@ -179,6 +179,11 @@ def calculate_table_summary(df, strata=None, show_overall=True, columns_labels=N coltypes = detect_df_col_types(df) if not isinstance(df, pd.DataFrame): df = nw.from_native(df).to_pandas() + # Convert object columns classified as numerical (e.g. decimal.Decimal) to + # a numeric dtype so that pandas can compute stats on them. + for col_name, col_type in coltypes.items(): + if col_type == "numerical" and df[col_name].dtype == object: + df[col_name] = pd.to_numeric(df[col_name], errors="coerce") colnames = df.columns.to_list() strat_cats = list() if strata is not None: diff --git a/pysummaries/table_summary/utils.py b/pysummaries/table_summary/utils.py index 238a5e4..2bb8e31 100644 --- a/pysummaries/table_summary/utils.py +++ b/pysummaries/table_summary/utils.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ############################################################################# +from decimal import Decimal + import narwhals as nw import pandas as pd import numpy as np @@ -41,7 +43,7 @@ def _classify_object_col(df, col_name): return "categorical" if curtype == str: return "categorical" - if np.issubdtype(type(col.iloc[0]), np.number) or isinstance(col.iloc[0], (int, float)): + if np.issubdtype(type(col.iloc[0]), np.number) or isinstance(col.iloc[0], (int, float, Decimal)): return "numerical" return "categorical" diff --git a/setup.py b/setup.py index c5241b6..ac759d7 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ setup( name="pysummaries", - version='0.0.2', + version='0.0.3', author="Otto Fajardo", author_email="pleasecontactviagithub@notvalid.com", description="Produce table summaries from pandas, polars or PyArrow dataframes", diff --git a/tests/test_basic.py b/tests/test_basic.py index 5866afe..b23f281 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -437,6 +437,77 @@ def test_simple_table_summary_gt(self): # --------------------------------------------------------------------------- # HTML content tests using BeautifulSoup # --------------------------------------------------------------------------- +class TestEdgeCases(unittest.TestCase): + """Tests for edge cases and bug fixes.""" + + def test_bool_column_single_value(self): + """Boolean column with only one unique value should not crash. + + Regression test: when the summary DataFrame has a single row, + cnt==len(df) on the first iteration while lastblock_index is still + None, causing a TypeError in extract_multibodyblocks. + """ + df = pd.DataFrame({ + "flag": [True, True, True, True], + "group": ["A", "A", "B", "B"], + }) + result = pysummaries.get_table_summary( + df, strata="group", columns_include=["flag", "group"], + show_overall=False + ) + self.assertIsNotNone(result) + + def test_decimal_column_treated_as_numeric(self): + """Columns containing decimal.Decimal values should be treated as numeric.""" + from decimal import Decimal + df = pd.DataFrame({ + "value": [Decimal("12.5"), Decimal("24.3"), Decimal("6.1"), Decimal("18.7")], + "group": ["A", "A", "B", "B"], + }) + col_types = detect_df_col_types(df) + self.assertEqual(col_types["value"], "numerical") + + def test_decimal_column_summary(self): + """Decimal columns should produce numeric summaries (Mean, Median, etc.).""" + from decimal import Decimal + df = pd.DataFrame({ + "value": [Decimal("12.5"), Decimal("24.3"), Decimal("6.1"), Decimal("18.7")], + "group": ["A", "A", "B", "B"], + }) + result, _ = pysummaries.calculate_table_summary( + df, strata="group", columns_include=["value", "group"], + show_overall=False + ) + # Numeric summaries have "Mean (SD)" as a row label + self.assertIn("Mean (SD)", result.index.get_level_values(-1)) + + def test_decimal_column_polars(self): + """Polars Decimal columns should produce numeric summaries.""" + from decimal import Decimal + df = pl.DataFrame({ + "value": [Decimal("12.5"), Decimal("24.3"), Decimal("6.1"), Decimal("18.7")], + "group": ["A", "A", "B", "B"], + }) + result, _ = pysummaries.calculate_table_summary( + df, strata="group", columns_include=["value", "group"], + show_overall=False + ) + self.assertIn("Mean (SD)", result.index.get_level_values(-1)) + + def test_decimal_column_pyarrow(self): + """PyArrow Decimal columns should produce numeric summaries.""" + from decimal import Decimal + table = pa.table({ + "value": pa.array([Decimal("12.5"), Decimal("24.3"), Decimal("6.1"), Decimal("18.7")]), + "group": ["A", "A", "B", "B"], + }) + result, _ = pysummaries.calculate_table_summary( + table, strata="group", columns_include=["value", "group"], + show_overall=False + ) + self.assertIn("Mean (SD)", result.index.get_level_values(-1)) + + class TestHTMLContent(unittest.TestCase): def setUp(self):