paninski-lab · themattinthehatt · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,19 +1,66 @@
+import io
+import shutil
 import subprocess
+import urllib.request
+import zipfile
+from pathlib import Path
 from typing import Callable
 
+import numpy as np
+import pandas as pd
 import pytest
 
+# URL of the zipped golden files. Update this after uploading a new release to GitHub.
+GOLDEN_URL = 'https://github.com/paninski-lab/eks-test-fixtures/releases/download/v1/eks_golden.zip'  # noqa: E501
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        '--generate-golden',
+        action='store_true',
+        default=False,
+        help='Generate golden output files instead of comparing against them.',
+    )
+    parser.addoption(
+        '--golden-dir',
+        action='store',
+        default=None,
+        help='Directory to write golden files to (used with --generate-golden).',
+    )
+
+
+@pytest.fixture(scope='session')
+def golden_dir(tmp_path_factory, pytestconfig):
+    """Return path to golden files directory, downloading and extracting if necessary."""
+    if pytestconfig.getoption('--generate-golden'):
+        golden_dir_opt = pytestconfig.getoption('--golden-dir')
+        if golden_dir_opt is None:
+            raise ValueError('--golden-dir must be specified when using --generate-golden')
+        path = Path(golden_dir_opt)
+        path.mkdir(parents=True, exist_ok=True)
+        return path
+
+    if GOLDEN_URL is None:
+        return None
+
+    cache_dir = tmp_path_factory.mktemp('golden')
+    with urllib.request.urlopen(GOLDEN_URL) as response:
+        data = response.read()
+    with zipfile.ZipFile(io.BytesIO(data)) as zf:
+        zf.extractall(cache_dir)
+    return cache_dir
+
 
 @pytest.fixture
 def run_script() -> Callable:
 
-    def _run_script(script_file, input_dir, output_dir, **kwargs):
+    def _run_script(script_file, input_dir, output_dir, **kwargs) -> Path:
 
         command_str = [
             'python',
             script_file,
             '--input-dir', input_dir,
-            '--save-dir', output_dir,
+            '--save-dir', str(output_dir),
             '--verbose', 'True',
         ]
         for key, arg in kwargs.items():
@@ -25,5 +72,60 @@ def _run_script(script_file, input_dir, output_dir, **kwargs):
 
         process = subprocess.run(command_str)
         assert process.returncode == 0
+        return Path(str(output_dir))
 
     return _run_script
+
+
+@pytest.fixture
+def compare_to_golden(golden_dir, pytestconfig):
+    """Fixture that either saves CSV outputs as golden files, or compares against them.
+
+    In generate mode (--generate-golden), copies all CSVs from output_dir into
+    golden_dir/<test_name>/. In compare mode, downloads golden files from the URL
+    and asserts numerical equality against them.
+    """
+
+    def _compare(test_name: str, output_dir: Path):
+        csv_files = sorted(output_dir.glob('*.csv'))
+        assert len(csv_files) > 0, f'No CSV files found in {output_dir}'
+
+        if pytestconfig.getoption('--generate-golden'):
+            dest = golden_dir / test_name
+            dest.mkdir(parents=True, exist_ok=True)
+            for csv_file in csv_files:
+                shutil.copy(csv_file, dest / csv_file.name)
+            return
+
+        if golden_dir is None:
+            pytest.skip('GOLDEN_URL is None in conftest.py; skipping golden comparison.')
+
+        golden_test_dir = golden_dir / test_name
+        assert golden_test_dir.exists(), (
+            f'Golden directory not found for test "{test_name}": {golden_test_dir}'
+        )
+
+        for csv_file in csv_files:
+            golden_csv = golden_test_dir / csv_file.name
+            assert golden_csv.exists(), (
+                f'Golden file not found: {golden_csv}. '
+                f'Run with --generate-golden to regenerate.'
+            )
+            actual = pd.read_csv(csv_file, index_col=0)
+            expected = pd.read_csv(golden_csv, index_col=0)
+            assert actual.shape == expected.shape, (
+                f'{test_name}/{csv_file.name}: shape mismatch '
+                f'{actual.shape} != {expected.shape}'
+            )
+            assert list(actual.columns) == list(expected.columns), (
+                f'{test_name}/{csv_file.name}: column mismatch'
+            )
+            np.testing.assert_allclose(
+                actual.select_dtypes('number').values,
+                expected.select_dtypes('number').values,
+                rtol=0,
+                atol=1e-4,
+                err_msg=f'{test_name}/{csv_file.name}',
+            )
+
+    return _compare
diff --git a/tests/scripts/README.md b/tests/scripts/README.md
@@ -0,0 +1,83 @@
+
+
+# Integration Script Tests
+
+These tests run the example scripts end-to-end and optionally compare their CSV outputs
+against a set of **golden files** — a reference snapshot of known-good outputs.
+
+## How tests work
+
+- **Without golden files** (`GOLDEN_URL = None`): each test still requires its script to exit without error and to write at least one CSV, then skips the comparison.
+- **With golden files**: after each script runs, every CSV output is checked against the corresponding golden CSV:
+  shape and column names must match, and numeric columns are compared using `numpy.testing.assert_allclose` with `rtol=0, atol=1e-4`.
+
+---
+
+## Generating new golden files
+
+Run this whenever you want to establish a new baseline (e.g. after an intentional change
+to the algorithm, or when setting up golden files for the first time).
+
+```bash
+pytest tests/scripts/ \
+    --generate-golden \
+    --golden-dir /tmp/eks_golden
+```
+
+This runs every integration test and copies the CSV outputs into
+`/tmp/eks_golden/<test_name>/`. The directory structure will look like:
+
+```
+/tmp/eks_golden/
+  test_singlecam_example_defaults/
+    eks_singlecam.csv
+  test_singlecam_example_fixed_smooth_param/
+    eks_singlecam.csv
+  test_multicam_example_defaults/
+    multicam_top_results.csv
+    multicam_bot_results.csv
+  test_multicam_example_defaults_nonlinear/
+    multicam_Cam-A_results.csv
+    multicam_Cam-B_results.csv
+    multicam_Cam-C_results.csv
+    multicam_3d_results.csv
+  ...
+```
+
+### Zip and upload
+
+```bash
+cd /tmp/eks_golden
+zip -r eks_golden.zip .
+```
+
+Upload `eks_golden.zip` as an asset of a GitHub release (see the next section). The zip must
+have the test-name folders at its root (no extra top-level wrapper directory) — the
+`cd` + `.` zip command above ensures this.
+
+### Update the URL in conftest.py
+
+Once uploaded, copy the direct download URL of the zip asset from the GitHub release
+and set it as `GOLDEN_URL` near the top of `tests/conftest.py`:
+
+```python
+GOLDEN_URL = 'https://github.com/paninski-lab/eks-test-fixtures/releases/download/vX/eks_golden.zip'
+```
+
+Commit this change so CI and other contributors pick it up automatically.
+
+---
+
+## Running tests with golden comparison
+
+```bash
+pytest tests/scripts/
+```
+
+The golden zip is downloaded once per test session (network access is required) and extracted into a
+temporary directory; it is downloaded again on the next run. Golden comparison is skipped automatically when `GOLDEN_URL = None` in `conftest.py`.
+
+### Without golden comparison
+
+Set `GOLDEN_URL = None` in `tests/conftest.py`. Each test then still fails if its script exits with
+an error or writes no CSV files, but is otherwise reported as *skipped* rather than passed.
diff --git a/tests/scripts/test_ibl_paw_multicam_example.py b/tests/scripts/test_ibl_paw_multicam_example.py
@@ -1,18 +1,25 @@
 
 
-def test_ibl_paw_multicam_example_defaults(run_script, tmpdir, pytestconfig):
+def test_ibl_paw_multicam_example_defaults(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'ibl_paw_multiview_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'ibl-paw'),
         output_dir=tmpdir,
     )
+    compare_to_golden(request.node.name, output_dir)
 
 
-def test_ibl_paw_multicam_example_fixed_smooth_param(run_script, tmpdir, pytestconfig):
-    run_script(
+def test_ibl_paw_multicam_example_fixed_smooth_param(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
+
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'ibl_paw_multiview_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'ibl-paw'),
         output_dir=tmpdir,
-        s=10
+        s=10,
     )
+    compare_to_golden(request.node.name, output_dir)
diff --git a/tests/scripts/test_ibl_pupil_example.py b/tests/scripts/test_ibl_pupil_example.py
@@ -1,20 +1,24 @@
 
 
-def test_ibl_pupil_example_defaults(run_script, tmpdir, pytestconfig):
+def test_ibl_pupil_example_defaults(run_script, compare_to_golden, tmpdir, pytestconfig, request):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'ibl_pupil_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'ibl-pupil'),
         output_dir=tmpdir,
     )
+    compare_to_golden(request.node.name, output_dir)
 
 
-def test_ibl_pupil_example_fixed_smooth_param(run_script, tmpdir, pytestconfig):
+def test_ibl_pupil_example_fixed_smooth_param(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'ibl_pupil_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'ibl-pupil'),
         output_dir=tmpdir,
         diameter_s=0.99,
         com_s=0.99,
     )
+    compare_to_golden(request.node.name, output_dir)
diff --git a/tests/scripts/test_mirrored_multicam_example.py b/tests/scripts/test_mirrored_multicam_example.py
@@ -1,23 +1,29 @@
 
 
-def test_mirrored_multicam_example_defaults(run_script, tmpdir, pytestconfig):
+def test_mirrored_multicam_example_defaults(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'mirrored_multicam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'mirror-mouse'),
         output_dir=tmpdir,
         bodypart_list=['paw1LH', 'paw2LF'],  # , 'paw3RF', 'paw4RH'],  # unneeded computation
         camera_names=['top', 'bot'],
     )
+    compare_to_golden(request.node.name, output_dir)
 
 
-def test_mirrored_multicam_example_fixed_smooth_param(run_script, tmpdir, pytestconfig):
+def test_mirrored_multicam_example_fixed_smooth_param(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'mirrored_multicam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'mirror-mouse'),
         output_dir=tmpdir,
         bodypart_list=['paw1LH', 'paw2LF'],  # , 'paw3RF', 'paw4RH'],  # unneeded computation
         camera_names=['top', 'bot'],
         s=10
     )
+    compare_to_golden(request.node.name, output_dir)
diff --git a/tests/scripts/test_multicam_example.py b/tests/scripts/test_multicam_example.py
@@ -1,43 +1,52 @@
 
 
-def test_multicam_example_defaults(run_script, tmpdir, pytestconfig):
+def test_multicam_example_defaults(run_script, compare_to_golden, tmpdir, pytestconfig, request):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'multicam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'mirror-mouse-separate'),
         output_dir=tmpdir,
         bodypart_list=['paw1LH', 'paw2LF'],  # , 'paw3RF', 'paw4RH'],  # unneeded computation
         camera_names=['top', 'bot'],
     )
+    compare_to_golden(request.node.name, output_dir)
 
 
-def test_multicam_example_fixed_smooth_param(run_script, tmpdir, pytestconfig):
+def test_multicam_example_fixed_smooth_param(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'multicam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'mirror-mouse-separate'),
         output_dir=tmpdir,
         bodypart_list=['paw1LH', 'paw2LF'],  # , 'paw3RF', 'paw4RH'],  # unneeded computation
         camera_names=['top', 'bot'],
         s=10,
     )
+    compare_to_golden(request.node.name, output_dir)
 
 
-def test_multicam_example_defaults_nonlinear(run_script, tmpdir, pytestconfig):
+def test_multicam_example_defaults_nonlinear(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'multicam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'fly'),
         output_dir=tmpdir,
         bodypart_list=['L1A', 'L1B'],
         camera_names=['Cam-A', 'Cam-B', 'Cam-C'],
         calibration=str(pytestconfig.rootpath / 'data' / 'fly' / 'calibration.toml'),
     )
+    compare_to_golden(request.node.name, output_dir)
 
 
-def test_multicam_example_fixed_smooth_param_nonlinear(run_script, tmpdir, pytestconfig):
+def test_multicam_example_fixed_smooth_param_nonlinear(
+    run_script, compare_to_golden, tmpdir, pytestconfig, request,
+):
 
-    run_script(
+    output_dir = run_script(
         script_file=str(pytestconfig.rootpath / 'scripts' / 'multicam_example.py'),
         input_dir=str(pytestconfig.rootpath / 'data' / 'fly'),
         output_dir=tmpdir,
@@ -46,3 +55,4 @@ def test_multicam_example_fixed_smooth_param_nonlinear(run_script, tmpdir, pytes
         calibration=str(pytestconfig.rootpath / 'data' / 'fly' / 'calibration.toml'),
         s=10,
     )
+    compare_to_golden(request.node.name, output_dir)