From b8571f1f170709ad4b5197f235fbb6b2f8596a29 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 4 Dec 2025 11:54:59 -0600 Subject: [PATCH 1/4] FEA First version of cuda-health plugin --- ci/run_cuml_dask_pytests.sh | 2 +- ci/run_cuml_integration_pytests.sh | 2 +- ci/run_cuml_singlegpu_accel_pytests.sh | 2 +- ci/run_cuml_singlegpu_pytests.sh | 2 +- .../testing/plugins/cuda_health_plugin.py | 309 ++++++++++++++++++ python/cuml/tests/conftest.py | 5 +- 6 files changed, 317 insertions(+), 5 deletions(-) create mode 100644 python/cuml/cuml/testing/plugins/cuda_health_plugin.py diff --git a/ci/run_cuml_dask_pytests.sh b/ci/run_cuml_dask_pytests.sh index 8152481542..cdb5db5710 100755 --- a/ci/run_cuml_dask_pytests.sh +++ b/ci/run_cuml_dask_pytests.sh @@ -5,4 +5,4 @@ # Support invoking run_cuml_dask_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuml/tests/dask || exit 1 -python -m pytest --cache-clear "$@" . +python -m pytest --cache-clear --cuda-health-check "$@" . diff --git a/ci/run_cuml_integration_pytests.sh b/ci/run_cuml_integration_pytests.sh index 547b0698d5..13b1f8b587 100755 --- a/ci/run_cuml_integration_pytests.sh +++ b/ci/run_cuml_integration_pytests.sh @@ -5,4 +5,4 @@ # Support invoking run_cuml_singlegpu_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuml/tests || exit 1 -python -m pytest -p cudf.pandas --cache-clear --ignore=dask "$@" --quick_run . +python -m pytest -p cudf.pandas --cache-clear --cuda-health-check --ignore=dask "$@" --quick_run . diff --git a/ci/run_cuml_singlegpu_accel_pytests.sh b/ci/run_cuml_singlegpu_accel_pytests.sh index e30abe4d74..5443c4b25e 100755 --- a/ci/run_cuml_singlegpu_accel_pytests.sh +++ b/ci/run_cuml_singlegpu_accel_pytests.sh @@ -5,4 +5,4 @@ # Support invoking run_cuml_singlegpu_accel_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuml/cuml_accel_tests || exit 1 -python -m pytest --cache-clear "$@" . +python -m pytest --cache-clear --cuda-health-check "$@" . diff --git a/ci/run_cuml_singlegpu_pytests.sh b/ci/run_cuml_singlegpu_pytests.sh index 9e5224536f..378c715da5 100755 --- a/ci/run_cuml_singlegpu_pytests.sh +++ b/ci/run_cuml_singlegpu_pytests.sh @@ -5,4 +5,4 @@ # Support invoking run_cuml_singlegpu_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuml/tests || exit 1 -python -m pytest --cache-clear --ignore=dask "$@" . +python -m pytest --cache-clear --cuda-health-check --ignore=dask "$@" . diff --git a/python/cuml/cuml/testing/plugins/cuda_health_plugin.py b/python/cuml/cuml/testing/plugins/cuda_health_plugin.py new file mode 100644 index 0000000000..e3d1f2852e --- /dev/null +++ b/python/cuml/cuml/testing/plugins/cuda_health_plugin.py @@ -0,0 +1,309 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# + +""" +Pytest plugin to monitor CUDA health during test execution. + +This plugin helps identify tests that cause CUDA memory corruption by: +1. Checking GPU health before and after each test +2. Logging CUDA memory state +3. Detecting illegal memory access errors early +4. Providing detailed reports on which test caused issues + +Usage: + pytest --cuda-health-check + pytest --cuda-health-check --cuda-health-verbose + pytest --cuda-health-check --cuda-health-sync + pytest --cuda-health-check --cuda-health-gc + +With xdist, logs go to cuda_health_.log files. 
+""" + +import os +import sys +import time + +import pytest + + +def pytest_addoption(parser): + """Add command line options for CUDA health monitoring.""" + group = parser.getgroup("CUDA Health Monitoring") + + group.addoption( + "--cuda-health-check", + action="store_true", + default=False, + help="Enable CUDA health checking between tests", + ) + + group.addoption( + "--cuda-health-verbose", + action="store_true", + default=False, + help="Verbose CUDA health logging", + ) + + group.addoption( + "--cuda-health-sync", + action="store_true", + default=False, + help="Force CUDA stream sync after each test", + ) + + group.addoption( + "--cuda-health-gc", + action="store_true", + default=False, + help="Force garbage collection after each test", + ) + + +class CUDAHealthPlugin: + """Plugin to monitor CUDA health during test execution.""" + + def __init__(self, config): + self.config = config + self.verbose = config.getoption("--cuda-health-verbose") + self.sync_after_test = config.getoption("--cuda-health-sync") + self.gc_after_test = config.getoption("--cuda-health-gc") + + self.last_healthy_test = None + self.current_test = None + self.test_count = 0 + self.health_failures = [] + self.cuda_available = False + self.log_file = None + self.worker_id = "main" + + # Detect xdist worker + worker_id = os.environ.get("PYTEST_XDIST_WORKER") + if worker_id: + self.worker_id = worker_id + # Each worker writes to its own log file + self.log_file = open(f"cuda_health_{worker_id}.log", "w") + + # Try to import CUDA libraries + try: + import cupy as cp + + self.cp = cp + self.cuda_available = True + except ImportError: + self.cp = None + + self._log( + f"Plugin initialized: verbose={self.verbose}, " + f"sync={self.sync_after_test}, cuda={self.cuda_available}, " + f"worker={self.worker_id}", + force=True, + ) + + def _log(self, msg, force=False): + """Log a message if verbose mode is enabled.""" + if self.verbose or force: + full_msg = f"[CUDA-HEALTH:{self.worker_id}] {msg}" + # Always write to log file if we have one (xdist worker) + if self.log_file: + self.log_file.write(full_msg + "\n") + self.log_file.flush() + # Write to stderr (more likely to show with xdist) + sys.stderr.write(full_msg + "\n") + sys.stderr.flush() + + def _get_memory_info(self): + """Get current CUDA memory info.""" + if not self.cuda_available: + return None + + try: + mempool = self.cp.get_default_memory_pool() + + info = { + "device_used": mempool.used_bytes(), + "device_total": mempool.total_bytes(), + } + + # Pinned memory pool has different API + try: + pinned_mempool = self.cp.get_default_pinned_memory_pool() + info["pinned_used"] = pinned_mempool.n_free_blocks() + except Exception: + pass + + return info + except Exception as e: + return {"error": str(e)} + + def _check_cuda_health(self): + """Check if CUDA is in a healthy state.""" + if not self.cuda_available: + return True, "CUDA not available" + + try: + # Try a simple CUDA operation + a = self.cp.array([1, 2, 3], dtype=self.cp.float32) + b = a + 1 + self.cp.cuda.Stream.null.synchronize() + del a, b + return True, "OK" + except Exception as e: + error_msg = str(e) + # Check for specific CUDA errors + if "cudaErrorIllegalAddress" in error_msg: + return False, f"ILLEGAL MEMORY ACCESS: {error_msg}" + elif "cudaError" in error_msg: + return False, f"CUDA ERROR: {error_msg}" + else: + return False, f"UNKNOWN ERROR: {error_msg}" + + def _sync_cuda(self): + """Synchronize CUDA stream.""" + if not self.cuda_available: + return + + try: + self.cp.cuda.Stream.null.synchronize() + 
except Exception as e: + self._log(f"Sync failed: {e}", force=True) + + def _force_gc(self): + """Force garbage collection and CUDA memory cleanup.""" + import gc + + gc.collect() + + if self.cuda_available: + try: + self.cp.get_default_memory_pool().free_all_blocks() + self.cp.get_default_pinned_memory_pool().free_all_blocks() + except Exception: + pass + + @pytest.hookimpl(tryfirst=True) + def pytest_runtest_setup(self, item): + """Called before each test runs.""" + self.current_test = item.nodeid + self.test_count += 1 + + if self.verbose: + mem_info = self._get_memory_info() + self._log( + f"[{self.test_count}] SETUP: {item.nodeid} | " + f"Memory: {mem_info}" + ) + + @pytest.hookimpl(trylast=True) + def pytest_runtest_teardown(self, item, nextitem): + """Called after each test runs.""" + # Force sync if requested + if self.sync_after_test: + self._sync_cuda() + + # Force GC if requested + if self.gc_after_test: + self._force_gc() + + # Check CUDA health + healthy, msg = self._check_cuda_health() + + if not healthy: + failure_info = { + "test": item.nodeid, + "test_number": self.test_count, + "error": msg, + "last_healthy_test": self.last_healthy_test, + "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), + "worker": self.worker_id, + } + self.health_failures.append(failure_info) + + # Build failure message + failure_msg = ( + "\n" + + "=" * 80 + + "\n" + f"[CUDA-HEALTH:{self.worker_id}] !!! CUDA HEALTH CHECK FAILED !!!\n" + f" Current test: {item.nodeid}\n" + f" Test number: {self.test_count}\n" + f" Error: {msg}\n" + f" Last healthy test: {self.last_healthy_test}\n" + + "=" * 80 + + "\n" + ) + + # Write to log file if xdist worker + if self.log_file: + self.log_file.write(failure_msg) + self.log_file.flush() + + # Write to stderr (works better with xdist) + sys.stderr.write(failure_msg) + sys.stderr.flush() + else: + self.last_healthy_test = item.nodeid + + if self.verbose: + mem_info = self._get_memory_info() + self._log( + f"[{self.test_count}] TEARDOWN: {item.nodeid} | " + f"Health: {msg} | Memory: {mem_info}" + ) + + def pytest_sessionfinish(self, session, exitstatus): + """Called at the end of the test session.""" + summary_lines = [] + + if self.health_failures: + summary_lines.append("\n" + "=" * 80) + summary_lines.append( + f"[CUDA-HEALTH:{self.worker_id}] SESSION SUMMARY - " + "HEALTH CHECK FAILURES" + ) + summary_lines.append("=" * 80) + + for failure in self.health_failures: + summary_lines.append( + f"\nTest #{failure['test_number']}: {failure['test']}" + ) + summary_lines.append(f" Error: {failure['error']}") + summary_lines.append( + f" Last healthy test: {failure['last_healthy_test']}" + ) + summary_lines.append(f" Time: {failure['timestamp']}") + + summary_lines.append("\n" + "=" * 80) + summary_lines.append( + f"Total CUDA health failures: {len(self.health_failures)}" + ) + summary_lines.append("=" * 80 + "\n") + + summary = "\n".join(summary_lines) + sys.stderr.write(summary + "\n") + sys.stderr.flush() + if self.log_file: + self.log_file.write(summary) + + elif self.test_count > 0: + self._log( + f"Session complete. 
All {self.test_count} tests " + "passed CUDA health checks.", + force=True, + ) + + # Close log file if we have one + if self.log_file: + self.log_file.close() + + +def pytest_configure(config): + """Register the plugin if --cuda-health-check is specified.""" + if config.getoption("--cuda-health-check", default=False): + plugin_name = "cuda_health_plugin_instance" + if not config.pluginmanager.has_plugin(plugin_name): + config.pluginmanager.register( + CUDAHealthPlugin(config), plugin_name + ) + diff --git a/python/cuml/tests/conftest.py b/python/cuml/tests/conftest.py index ce5532afeb..a7e212c8f7 100644 --- a/python/cuml/tests/conftest.py +++ b/python/cuml/tests/conftest.py @@ -28,7 +28,10 @@ # ============================================================================= # Add the import here for any plugins that should be loaded EVERY TIME -pytest_plugins = "cuml.testing.plugins.quick_run_plugin" +pytest_plugins = [ + "cuml.testing.plugins.quick_run_plugin", + "cuml.testing.plugins.cuda_health_plugin", +] # ============================================================================= From 11b73a4bbc42e311e8630fd31110835ce895b4c7 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 4 Dec 2025 12:06:07 -0600 Subject: [PATCH 2/4] FIX style fixes --- python/cuml/cuml/testing/plugins/cuda_health_plugin.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/cuml/cuml/testing/plugins/cuda_health_plugin.py b/python/cuml/cuml/testing/plugins/cuda_health_plugin.py index e3d1f2852e..dc8bba8c2b 100644 --- a/python/cuml/cuml/testing/plugins/cuda_health_plugin.py +++ b/python/cuml/cuml/testing/plugins/cuda_health_plugin.py @@ -222,9 +222,7 @@ def pytest_runtest_teardown(self, item, nextitem): # Build failure message failure_msg = ( - "\n" - + "=" * 80 - + "\n" + "\n" + "=" * 80 + "\n" f"[CUDA-HEALTH:{self.worker_id}] !!! CUDA HEALTH CHECK FAILED !!!\n" f" Current test: {item.nodeid}\n" f" Test number: {self.test_count}\n" @@ -306,4 +304,3 @@ def pytest_configure(config): config.pluginmanager.register( CUDAHealthPlugin(config), plugin_name ) - From d203fcb982f40b8de5403aab2edbcb1bf3431a11 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 4 Dec 2025 15:07:27 -0600 Subject: [PATCH 3/4] FIX remove flag from accel pytests --- ci/run_cuml_integration_pytests.sh | 2 +- ci/run_cuml_singlegpu_accel_pytests.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/run_cuml_integration_pytests.sh b/ci/run_cuml_integration_pytests.sh index 13b1f8b587..547b0698d5 100755 --- a/ci/run_cuml_integration_pytests.sh +++ b/ci/run_cuml_integration_pytests.sh @@ -5,4 +5,4 @@ # Support invoking run_cuml_singlegpu_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuml/tests || exit 1 -python -m pytest -p cudf.pandas --cache-clear --cuda-health-check --ignore=dask "$@" --quick_run . +python -m pytest -p cudf.pandas --cache-clear --ignore=dask "$@" --quick_run . diff --git a/ci/run_cuml_singlegpu_accel_pytests.sh b/ci/run_cuml_singlegpu_accel_pytests.sh index 5443c4b25e..e30abe4d74 100755 --- a/ci/run_cuml_singlegpu_accel_pytests.sh +++ b/ci/run_cuml_singlegpu_accel_pytests.sh @@ -5,4 +5,4 @@ # Support invoking run_cuml_singlegpu_accel_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuml/cuml_accel_tests || exit 1 -python -m pytest --cache-clear --cuda-health-check "$@" . +python -m pytest --cache-clear "$@" . 
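
Note: PATCH 3/4 drops --cuda-health-check from the integration and accelerator
scripts. If the health check is still wanted for one of those suites on an
ad-hoc run, the plugin module can be loaded explicitly with pytest's -p option;
a minimal sketch (assuming the cuml testing package is importable and a GPU is
available):

    cd python/cuml/cuml_accel_tests
    python -m pytest --cache-clear \
        -p cuml.testing.plugins.cuda_health_plugin \
        --cuda-health-check --cuda-health-verbose .

Loading the module with -p registers the --cuda-health-* options even when no
conftest pulls the plugin in.
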
From 7d9f1e2b8b2091a4ad9bc2c3edad92297982d4ff Mon Sep 17 00:00:00 2001 From: Simon Adorf Date: Fri, 5 Dec 2025 14:19:20 -0600 Subject: [PATCH 4/4] Test only UMAP in serial. --- ci/run_cuml_singlegpu_pytests.sh | 2 +- ci/test_python_singlegpu.sh | 8 ++++---- ci/test_wheel.sh | 8 ++------ 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/ci/run_cuml_singlegpu_pytests.sh b/ci/run_cuml_singlegpu_pytests.sh index 378c715da5..5895d64a79 100755 --- a/ci/run_cuml_singlegpu_pytests.sh +++ b/ci/run_cuml_singlegpu_pytests.sh @@ -5,4 +5,4 @@ # Support invoking run_cuml_singlegpu_pytests.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cuml/tests || exit 1 -python -m pytest --cache-clear --cuda-health-check --ignore=dask "$@" . +python -m pytest --cache-clear --cuda-health-check --ignore=dask "$@" test_umap.py diff --git a/ci/test_python_singlegpu.sh b/ci/test_python_singlegpu.sh index 2d355ffd9e..5dc791268b 100755 --- a/ci/test_python_singlegpu.sh +++ b/ci/test_python_singlegpu.sh @@ -22,8 +22,8 @@ set +e rapids-logger "pytest cuml single GPU" ./ci/run_cuml_singlegpu_pytests.sh \ - --numprocesses=8 \ - --dist=worksteal \ + --verbose \ + --exitfirst \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cuml.xml" \ --cov-config=../.coveragerc \ --cov=cuml \ @@ -31,8 +31,8 @@ rapids-logger "pytest cuml single GPU" rapids-logger "pytest cuml accelerator" ./ci/run_cuml_singlegpu_accel_pytests.sh \ - --numprocesses=8 \ - --dist=worksteal \ + --verbose \ + --exitfirst \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cuml-accel.xml" \ --cov-config=../.coveragerc \ --cov=cuml \ diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index b53cb02379..19ba3c20ff 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -49,15 +49,11 @@ python -m pytest --cache-clear python/libcuml/tests/test_libcuml_linkage.py -v rapids-logger "pytest cuml single GPU" ./ci/run_cuml_singlegpu_pytests.sh \ - --numprocesses=8 \ + --verbose \ + --exitfirst \ --dist=worksteal \ -k 'not test_sparse_pca_inputs' \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cuml.xml" -# Run test_sparse_pca_inputs separately -./ci/run_cuml_singlegpu_pytests.sh \ - -k 'test_sparse_pca_inputs' \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-cuml-sparse-pca.xml" - rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE}
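
To reproduce the serial single-GPU run from PATCH 4/4 locally, one possible
invocation is sketched below (assuming a GPU-enabled environment with cuml and
pytest installed; paths are relative to the repository root):

    cd python/cuml/tests
    python -m pytest --cache-clear --cuda-health-check --cuda-health-verbose \
        --cuda-health-sync --cuda-health-gc \
        --verbose --exitfirst --ignore=dask test_umap.py

All of the --cuda-health-* flags are registered by cuda_health_plugin.py, which
python/cuml/tests/conftest.py now loads unconditionally. Per-worker log files
(cuda_health_<worker_id>.log) are only written when the suite runs under
pytest-xdist; health messages always go to stderr as well.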