From 0e73573dd3bb8369c78598fcfcb6d93fdbf9e2a3 Mon Sep 17 00:00:00 2001
From: Matt Whiteway <themattinthehatt@gmail.com>
Date: Thu, 9 Apr 2026 16:29:54 -0400
Subject: [PATCH 1/4] integration test fixup

---
 tests/conftest.py                             | 99 ++++++++++++++++++-
 tests/scripts/README.md                       | 83 ++++++++++++++++
 .../scripts/test_ibl_paw_multicam_example.py  | 17 +++-
 tests/scripts/test_ibl_pupil_example.py       | 12 ++-
 .../scripts/test_mirrored_multicam_example.py | 14 ++-
 tests/scripts/test_multicam_example.py        | 26 +++--
 tests/scripts/test_singlecam_example.py       | 14 ++-
 tests/test_multicam_smoother.py               |  3 -
 8 files changed, 238 insertions(+), 30 deletions(-)
 create mode 100644 tests/scripts/README.md

diff --git a/tests/conftest.py b/tests/conftest.py
index a9a9691..ad7c2dc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,19 +1,65 @@
+import io
+import shutil
 import subprocess
+import urllib.request
+import zipfile
+from pathlib import Path
 from typing import Callable
 
+import pandas as pd
 import pytest
 
+# URL of the zipped golden files. Update this after uploading a new release to GitHub.
+GOLDEN_URL = 'https://github.com/paninski-lab/eks-test-fixtures/releases/download/v1/eks_golden.zip'
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        '--generate-golden',
+        action='store_true',
+        default=False,
+        help='Generate golden output files instead of comparing against them.',
+    )
+    parser.addoption(
+        '--golden-dir',
+        action='store',
+        default=None,
+        help='Directory to write golden files to (used with --generate-golden).',
+    )
+
+
+@pytest.fixture(scope='session')
+def golden_dir(tmp_path_factory, pytestconfig):
+    """Return path to golden files directory, downloading and extracting if necessary."""
+    if pytestconfig.getoption('--generate-golden'):
+        golden_dir_opt = pytestconfig.getoption('--golden-dir')
+        if golden_dir_opt is None:
+            raise ValueError('--golden-dir must be specified when using --generate-golden')
+        path = Path(golden_dir_opt)
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+    if GOLDEN_URL is None:
+        return None
+
+    cache_dir = tmp_path_factory.mktemp('golden')
+    with urllib.request.urlopen(GOLDEN_URL) as response:
+        data = response.read()
+    with zipfile.ZipFile(io.BytesIO(data)) as zf:
+        zf.extractall(cache_dir)
+    return cache_dir
+
 
 @pytest.fixture
 def run_script() -> Callable:
 
-    def _run_script(script_file, input_dir, output_dir, **kwargs):
+    def _run_script(script_file, input_dir, output_dir, **kwargs) -> Path:
 
         command_str = [
             'python',
             script_file,
             '--input-dir', input_dir,
-            '--save-dir', output_dir,
+            '--save-dir', str(output_dir),
             '--verbose', 'True',
         ]
         for key, arg in kwargs.items():
@@ -25,5 +71,54 @@ def _run_script(script_file, input_dir, output_dir, **kwargs):
 
         process = subprocess.run(command_str)
         assert process.returncode == 0
+        return Path(str(output_dir))
 
     return _run_script
+
+
+@pytest.fixture
+def compare_to_golden(golden_dir, pytestconfig):
+    """Fixture that either saves CSV outputs as golden files, or compares against them.
+
+    In generate mode (--generate-golden), copies all CSVs from output_dir into
+    golden_dir/<test_name>/. In compare mode, checks each CSV against the golden
+    copy provided by the golden_dir fixture, to within a numerical tolerance.
+    """
+
+    def _compare(test_name: str, output_dir: Path):
+        csv_files = sorted(output_dir.glob('*.csv'))
+        assert len(csv_files) > 0, f'No CSV files found in {output_dir}'
+
+        if pytestconfig.getoption('--generate-golden'):
+            dest = golden_dir / test_name
+            dest.mkdir(parents=True, exist_ok=True)
+            for csv_file in csv_files:
+                shutil.copy(csv_file, dest / csv_file.name)
+            return
+
+        if golden_dir is None:
+            pytest.skip(
+                'No golden URL provided (set --golden-url or EKS_GOLDEN_URL); skipping comparison.'
+            )
+
+        golden_test_dir = golden_dir / test_name
+        assert golden_test_dir.exists(), (
+            f'Golden directory not found for test "{test_name}": {golden_test_dir}'
+        )
+
+        for csv_file in csv_files:
+            golden_csv = golden_test_dir / csv_file.name
+            assert golden_csv.exists(), (
+                f'Golden file not found: {golden_csv}. '
+                f'Run with --generate-golden to regenerate.'
+            )
+            actual = pd.read_csv(csv_file, index_col=0)
+            expected = pd.read_csv(golden_csv, index_col=0)
+            pd.testing.assert_frame_equal(
+                actual, expected,
+                check_exact=False,
+                atol=1e-5,
+                obj=f'{test_name}/{csv_file.name}',
+            )
+
+    return _compare
diff --git a/tests/scripts/README.md b/tests/scripts/README.md
new file mode 100644
index 0000000..f6a3f22
--- /dev/null
+++ b/tests/scripts/README.md
@@ -0,0 +1,83 @@
+<!-- The golden-file URL is the GOLDEN_URL constant defined in tests/conftest.py. -->
+
+# Integration Script Tests
+
+These tests run the example scripts end-to-end and optionally compare their CSV outputs
+against a set of **golden files** — a reference snapshot of known-good outputs.
+
+## How tests work
+
+- **Without golden files**: tests only verify that the scripts exit without error (original behavior).
+- **With golden files**: after each script runs, all CSV outputs are compared against the
+  corresponding golden CSVs using `pandas.testing.assert_frame_equal` with `atol=1e-5`.
+
+---
+
+## Generating new golden files
+
+Run this whenever you want to establish a new baseline (e.g. after an intentional change
+to the algorithm, or when setting up golden files for the first time).
+
+```bash
+pytest tests/scripts/ \
+    --generate-golden \
+    --golden-dir /tmp/eks_golden
+```
+
+This runs every integration test and copies the CSV outputs into
+`/tmp/eks_golden/<test_name>/`. The directory structure will look like:
+
+```
+/tmp/eks_golden/
+  test_singlecam_example_defaults/
+    eks_singlecam.csv
+  test_singlecam_example_fixed_smooth_param/
+    eks_singlecam.csv
+  test_multicam_example_defaults/
+    multicam_top_results.csv
+    multicam_bot_results.csv
+  test_multicam_example_defaults_nonlinear/
+    multicam_Cam-A_results.csv
+    multicam_Cam-B_results.csv
+    multicam_Cam-C_results.csv
+    multicam_3d_results.csv
+  ...
+```
+
+### Zip and upload
+
+```bash
+cd /tmp/eks_golden
+zip -r eks_golden.zip .
+```
+
+Upload `eks_golden.zip` to your hosting location. The zip must have the test-name
+folders at its root (no extra top-level wrapper directory) — the `cd` + `.` zip
+command above ensures this.
+
+### Update the URL in conftest.py
+
+Once uploaded, copy the direct download URL of the zip asset from the GitHub release
+and set it as `GOLDEN_URL` near the top of `tests/conftest.py`:
+
+```python
+GOLDEN_URL = 'https://github.com/paninski-lab/eks-test-fixtures/releases/download/vX/eks_golden.zip'
+```
+
+Commit this change so CI and other contributors pick it up automatically.
+
+---
+
+## Running tests with golden comparison
+
+```bash
+pytest tests/scripts/
+```
+
+The golden zip is downloaded once per test session and cached in a temporary directory.
+Golden comparison is skipped automatically when `GOLDEN_URL = None` in `conftest.py`.
+
+### Without golden comparison
+
+Set `GOLDEN_URL = None` in `tests/conftest.py`. Each test still runs its script and fails if
+it exits with an error, but is then reported as skipped (the comparison calls `pytest.skip`).
diff --git a/tests/scripts/test_ibl_paw_multicam_example.py b/tests/scripts/test_ibl_paw_multicam_example.py
index ee42163..e3322a6 100644
--- a/tests/scripts/test_ibl_paw_multicam_example.py
+++ b/tests/scripts/test_ibl_paw_multicam_example.py
@@ -1,18 +1,25 @@
 
 
-def test_ibl_paw_multicam_example_defaults(run_script, tmpdir, pytestconfig):
+def test_ibl_paw_multicam_example_defaults(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'ibl_paw_multiview_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'ibl-paw'),
         output_dir=tmpdir,
     )
+    compare_to_golden(request.node.name, output_dir)
 
 
-def test_ibl_paw_multicam_example_fixed_smooth_param(run_script, tmpdir, pytestconfig):
-    run_script(
+def test_ibl_paw_multicam_example_fixed_smooth_param(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
+
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'ibl_paw_multiview_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'ibl-paw'),
         output_dir=tmpdir,
-        s=10
+        s=10,
     )
+    compare_to_golden(request.node.name, output_dir)
diff --git a/tests/scripts/test_ibl_pupil_example.py b/tests/scripts/test_ibl_pupil_example.py
index 458606d..f891af0 100644
--- a/tests/scripts/test_ibl_pupil_example.py
+++ b/tests/scripts/test_ibl_pupil_example.py
@@ -1,20 +1,24 @@
 
 
-def test_ibl_pupil_example_defaults(run_script, tmpdir, pytestconfig):
+def test_ibl_pupil_example_defaults(run_script, compare_to_golden, tmpdir, pytestconfig, request):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'ibl_pupil_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'ibl-pupil'),
         output_dir=tmpdir,
     )
+    compare_to_golden(request.node.name, output_dir)
 
 
-def test_ibl_pupil_example_fixed_smooth_param(run_script, tmpdir, pytestconfig):
+def test_ibl_pupil_example_fixed_smooth_param(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'ibl_pupil_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'ibl-pupil'),
         output_dir=tmpdir,
         diameter_s=0.99,
         com_s=0.99,
     )
+    compare_to_golden(request.node.name, output_dir)
diff --git a/tests/scripts/test_mirrored_multicam_example.py b/tests/scripts/test_mirrored_multicam_example.py
index 24fb938..e4ae7e1 100644
--- a/tests/scripts/test_mirrored_multicam_example.py
+++ b/tests/scripts/test_mirrored_multicam_example.py
@@ -1,19 +1,24 @@
 
 
-def test_mirrored_multicam_example_defaults(run_script, tmpdir, pytestconfig):
+def test_mirrored_multicam_example_defaults(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'mirrored_multicam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'mirror-mouse'),
         output_dir=tmpdir,
         bodypart_list=['paw1LH', 'paw2LF'],  # , 'paw3RF', 'paw4RH'],  # unneeded computation
         camera_names=['top', 'bot'],
     )
+    compare_to_golden(request.node.name, output_dir)
 
 
-def test_mirrored_multicam_example_fixed_smooth_param(run_script, tmpdir, pytestconfig):
+def test_mirrored_multicam_example_fixed_smooth_param(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'mirrored_multicam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'mirror-mouse'),
         output_dir=tmpdir,
@@ -21,3 +26,4 @@ def test_mirrored_multicam_example_fixed_smooth_param(run_script, tmpdir, pytest
         camera_names=['top', 'bot'],
         s=10
     )
+    compare_to_golden(request.node.name, output_dir)
diff --git a/tests/scripts/test_multicam_example.py b/tests/scripts/test_multicam_example.py
index 2d41007..7908fc5 100644
--- a/tests/scripts/test_multicam_example.py
+++ b/tests/scripts/test_multicam_example.py
@@ -1,19 +1,22 @@
 
 
-def test_multicam_example_defaults(run_script, tmpdir, pytestconfig):
+def test_multicam_example_defaults(run_script, compare_to_golden, tmpdir, pytestconfig, request):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'multicam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'mirror-mouse-separate'),
         output_dir=tmpdir,
         bodypart_list=['paw1LH', 'paw2LF'],  # , 'paw3RF', 'paw4RH'],  # unneeded computation
         camera_names=['top', 'bot'],
     )
+    compare_to_golden(request.node.name, output_dir)
 
 
-def test_multicam_example_fixed_smooth_param(run_script, tmpdir, pytestconfig):
+def test_multicam_example_fixed_smooth_param(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'multicam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'mirror-mouse-separate'),
         output_dir=tmpdir,
@@ -21,11 +24,14 @@ def test_multicam_example_fixed_smooth_param(run_script, tmpdir, pytestconfig):
         camera_names=['top', 'bot'],
         s=10,
     )
+    compare_to_golden(request.node.name, output_dir)
 
 
-def test_multicam_example_defaults_nonlinear(run_script, tmpdir, pytestconfig):
+def test_multicam_example_defaults_nonlinear(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'multicam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'fly'),
         output_dir=tmpdir,
@@ -33,11 +39,14 @@ def test_multicam_example_defaults_nonlinear(run_script, tmpdir, pytestconfig):
         camera_names=['Cam-A', 'Cam-B', 'Cam-C'],
         calibration=str(pytestconfig.rootpath / 'data' / 'fly' / 'calibration.toml'),
     )
+    compare_to_golden(request.node.name, output_dir)
 
 
-def test_multicam_example_fixed_smooth_param_nonlinear(run_script, tmpdir, pytestconfig):
+def test_multicam_example_fixed_smooth_param_nonlinear(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'multicam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'fly'),
         output_dir=tmpdir,
@@ -46,3 +55,4 @@ def test_multicam_example_fixed_smooth_param_nonlinear(run_script, tmpdir, pytes
         calibration=str(pytestconfig.rootpath / 'data' / 'fly' / 'calibration.toml'),
         s=10,
     )
+    compare_to_golden(request.node.name, output_dir)
diff --git a/tests/scripts/test_singlecam_example.py b/tests/scripts/test_singlecam_example.py
index bb681c1..66c2e52 100644
--- a/tests/scripts/test_singlecam_example.py
+++ b/tests/scripts/test_singlecam_example.py
@@ -1,19 +1,25 @@
 
 
-def test_singlecam_example_defaults(run_script, tmpdir, pytestconfig):
+def test_singlecam_example_defaults(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'singlecam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data/ibl-pupil'),
         output_dir=tmpdir,
     )
+    compare_to_golden(request.node.name, output_dir)
 
 
-def test_singlecam_example_fixed_smooth_param(run_script, tmpdir, pytestconfig):
+def test_singlecam_example_fixed_smooth_param(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'singlecam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data/ibl-pupil'),
         output_dir=tmpdir,
         s=10,
     )
+    compare_to_golden(request.node.name, output_dir)
diff --git a/tests/test_multicam_smoother.py b/tests/test_multicam_smoother.py
index bcaf518..5c3f03c 100644
--- a/tests/test_multicam_smoother.py
+++ b/tests/test_multicam_smoother.py
@@ -1,5 +1,3 @@
-import os
-
 import cv2
 import jax
 import jax.numpy as jnp
@@ -354,7 +352,6 @@ def test_center_predictions_min_frames():
 - Covariance projection via Jacobian vs finite differences
 """
 
-os.environ.setdefault("JAX_ENABLE_X64", "true")
 jax.config.update("jax_enable_x64", True)
 
 

From 52deb748b86fc30f310f7938fb3bc9ad961c01be Mon Sep 17 00:00:00 2001
From: Matt Whiteway <themattinthehatt@gmail.com>
Date: Thu, 9 Apr 2026 16:30:58 -0400
Subject: [PATCH 2/4] lint

---
 tests/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index ad7c2dc..aa811e5 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,7 +10,7 @@
 import pytest
 
 # URL of the zipped golden files. Update this after uploading a new release to GitHub.
-GOLDEN_URL = 'https://github.com/paninski-lab/eks-test-fixtures/releases/download/v1/eks_golden.zip'
+GOLDEN_URL = 'https://github.com/paninski-lab/eks-test-fixtures/releases/download/v1/eks_golden.zip'  # noqa: E501
 
 
 def pytest_addoption(parser):

From 47d88b478f1a15f83899fe48ed4df519f8cd0acb Mon Sep 17 00:00:00 2001
From: Matt Whiteway <themattinthehatt@gmail.com>
Date: Thu, 9 Apr 2026 16:46:30 -0400
Subject: [PATCH 3/4] loosen test tolerance

---
 tests/conftest.py       | 6 ++----
 tests/scripts/README.md | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index aa811e5..be7f7dd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -97,9 +97,7 @@ def _compare(test_name: str, output_dir: Path):
             return
 
         if golden_dir is None:
-            pytest.skip(
-                'No golden URL provided (set --golden-url or EKS_GOLDEN_URL); skipping comparison.'
-            )
+            pytest.skip('GOLDEN_URL is None in conftest.py; skipping golden comparison.')
 
         golden_test_dir = golden_dir / test_name
         assert golden_test_dir.exists(), (
@@ -117,7 +115,7 @@ def _compare(test_name: str, output_dir: Path):
             pd.testing.assert_frame_equal(
                 actual, expected,
                 check_exact=False,
-                atol=1e-5,
+                atol=1e-4,
                 obj=f'{test_name}/{csv_file.name}',
             )
 
diff --git a/tests/scripts/README.md b/tests/scripts/README.md
index f6a3f22..77554e2 100644
--- a/tests/scripts/README.md
+++ b/tests/scripts/README.md
@@ -9,7 +9,7 @@ against a set of **golden files** — a reference snapshot of known-good outputs
 
 - **Without golden files**: tests only verify that the scripts exit without error (original behavior).
 - **With golden files**: after each script runs, all CSV outputs are compared against the
-  corresponding golden CSVs using `pandas.testing.assert_frame_equal` with `atol=1e-5`.
+  corresponding golden CSVs using `pandas.testing.assert_frame_equal` with `atol=1e-4`.
 
 ---
 

From 4b69e389480af0fc4a32c5ec337645230b73b5f8 Mon Sep 17 00:00:00 2001
From: Matt Whiteway <themattinthehatt@gmail.com>
Date: Thu, 9 Apr 2026 16:57:49 -0400
Subject: [PATCH 4/4] pandas->np testing

---
 tests/conftest.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index be7f7dd..4cdb803 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,6 +6,7 @@
 from pathlib import Path
 from typing import Callable
 
+import numpy as np
 import pandas as pd
 import pytest
 
@@ -112,11 +113,19 @@ def _compare(test_name: str, output_dir: Path):
             )
             actual = pd.read_csv(csv_file, index_col=0)
             expected = pd.read_csv(golden_csv, index_col=0)
-            pd.testing.assert_frame_equal(
-                actual, expected,
-                check_exact=False,
+            assert actual.shape == expected.shape, (
+                f'{test_name}/{csv_file.name}: shape mismatch '
+                f'{actual.shape} != {expected.shape}'
+            )
+            assert list(actual.columns) == list(expected.columns), (
+                f'{test_name}/{csv_file.name}: column mismatch'
+            )
+            np.testing.assert_allclose(
+                actual.select_dtypes('number').values,
+                expected.select_dtypes('number').values,
+                rtol=0,
                 atol=1e-4,
-                obj=f'{test_name}/{csv_file.name}',
+                err_msg=f'{test_name}/{csv_file.name}',
             )
 
     return _compare