diff --git a/.github/workflows/api_doc.yml b/.github/workflows/api_doc.yml.disabled
similarity index 100%
rename from .github/workflows/api_doc.yml
rename to .github/workflows/api_doc.yml.disabled
diff --git a/.github/workflows/check_deleted_comments.yml b/.github/workflows/check_deleted_comments.yml.disabled
similarity index 100%
rename from .github/workflows/check_deleted_comments.yml
rename to .github/workflows/check_deleted_comments.yml.disabled
diff --git a/.github/workflows/check_feature_factorization.yml b/.github/workflows/check_feature_factorization.yml.disabled
similarity index 100%
rename from .github/workflows/check_feature_factorization.yml
rename to .github/workflows/check_feature_factorization.yml.disabled
diff --git a/.github/workflows/check_markup_links.yml b/.github/workflows/check_markup_links.yml.disabled
similarity index 100%
rename from .github/workflows/check_markup_links.yml
rename to .github/workflows/check_markup_links.yml.disabled
diff --git a/.github/workflows/check_test_coverage.yml b/.github/workflows/check_test_coverage.yml.disabled
similarity index 100%
rename from .github/workflows/check_test_coverage.yml
rename to .github/workflows/check_test_coverage.yml.disabled
diff --git a/.github/workflows/check_wrapping.yml b/.github/workflows/check_wrapping.yml.disabled
similarity index 100%
rename from .github/workflows/check_wrapping.yml
rename to .github/workflows/check_wrapping.yml.disabled
diff --git a/.github/workflows/clang_tidy.yml b/.github/workflows/clang_tidy.yml.disabled
similarity index 100%
rename from .github/workflows/clang_tidy.yml
rename to .github/workflows/clang_tidy.yml.disabled
diff --git a/.github/workflows/debug_cuda_max_reducer.yml b/.github/workflows/debug_cuda_max_reducer.yml
new file mode 100644
index 0000000000..d157389f37
--- /dev/null
+++ b/.github/workflows/debug_cuda_max_reducer.yml
@@ -0,0 +1,330 @@
+name: Debug CUDA T4 max-reducer hang
+
+# DO NOT MERGE. Single-job workflow that builds Quadrants in editable mode on a `gpu-t4-4-core`
+# runner, opens a detached tmate session for live attach, then runs the proven reproducer +
+# compute-sanitizer trace + dmesg Xid capture in the background while you iterate.
+#
+# Findings carried over from the previous debug session:
+#   * Reproducer = `pytest -n 8 --count 50 tests/python/test_ad_ndarray.py -k cuda` on T4. ~50% of
+#     workers see their FIRST or SECOND launched test crash with `CUDA_ERROR_ILLEGAL_ADDRESS while
+#     calling cuStreamSynchronize / cuMemsetD8 / cuMemFreeAsync`.
+#   * `dmesg` shows `Xid 31 ... MMU Fault: ENGINE GRAPHICS GPC0 GPCCLIENT_T1_0 faulted @ 0x???_??000000.
+#     Fault is of type FAULT_PDE ACCESS_TYPE_VIRT_WRITE` followed by `Xid 109 CTX SWITCH TIMEOUT`.
+#     A CUDA kernel is writing to a stale / out-of-bounds device pointer; subsequent ops fail until
+#     the GPU is wedged.
+#   * Host stack at the surfaced symptom = `Ndarray::write_float -> Ndarray::write -> allocate_memory
+#     (small staging buffer) -> cuMemset` failing because the context is already corrupted by the
+#     earlier OOB write.
+#
+# This workflow:
+#   1. Builds Quadrants editable, CUDA-only (no Vulkan / AMDGPU / cpp tests, so iteration via
+#      `cd _skbuild/*/cmake-build && cmake --build .` inside tmate is fast).
+#   2. Pre-installs `nvidia-cuda-toolkit` so `compute-sanitizer` is available.
+#   3. Captures system / driver / HMM info + the pre-test `dmesg` snapshot.
+#   4. Opens a detached tmate session - SSH command is printed in the workflow log; the workflow
+#      continues straight into the reproducer / sanitizer / artifact-upload steps.
+#   5. Runs the reproducer with `CUDA_LAUNCH_BLOCKING=1`, streaming pytest output through an
+#      abort wrapper that SIGTERMs the controller on the first FAILED line.
+#   6. Snapshots `dmesg` again so the new Xid lines are visible.
+#   7. Re-runs the previously-failed tests under `compute-sanitizer --tool memcheck`. Its first
+#      `Invalid write` block is the offending kernel.
+#   8. Uploads `/tmp/repro/` as an artifact. The detached tmate session stays up until the job ends.
+
+on:
+  pull_request:  # inputs are empty for this event; the job-level env supplies the same defaults
+    types: [opened, reopened, synchronize]
+  workflow_dispatch:
+    inputs:
+      quadrants_ref:
+        description: "Quadrants ref to build + test (branch / tag / SHA). Default = tip of the source branch holding the in-flight debug instrumentation."
+        required: false
+        default: "duburcqa/turing_stream_pin_experiment"
+      tmate_timeout_minutes:
+        description: "tmate session timeout (minutes). `sudo touch /continue` (or `touch continue` in the workspace root) from inside the shell to skip the rest of the wait."
+        required: false
+        default: "60"
+      reproduce_count:
+        description: "pytest --count value for the reproducer (multiplier per test)."
+        required: false
+        default: "50"
+
+concurrency:
+  # Never cancel: each run gets its own group (keyed on the run id), so a second manual
+  # `workflow_dispatch` or a push synced while a tmate session is live runs alongside the
+  # existing run rather than replacing it. Debug runs are interactive via tmate, and losing a
+  # live session to a fresh push is exactly the outcome this block is here to prevent.
+  cancel-in-progress: false
+  group: ${{ github.workflow }}-${{ github.run_id }}
+
+jobs:
+  debug_cuda_t4:
+    name: Build + reproduce + sanitizer + tmate on CUDA T4
+    runs-on: gpu-t4-4-core  # runner label; presumably a GPU runner with a T4 - confirm
+    env:  # job-wide environment, visible to every step below
+      QUADRANTS_REF: ${{ github.event.inputs.quadrants_ref || 'duburcqa/turing_stream_pin_experiment' }}  # `||` fallback covers runs without inputs
+      TMATE_TIMEOUT: ${{ github.event.inputs.tmate_timeout_minutes || '60' }}  # read by the tmate step through fromJSON()
+      REPRODUCE_COUNT: ${{ github.event.inputs.reproduce_count || '50' }}  # passed to run_reproducer.sh as pytest --count
+      QD_OFFLINE_CACHE: "0"  # presumably turns the Quadrants offline cache off - confirm
+      QD_OFFLINE_CACHE_CLEANING_POLICY: "never"
+      QD_DEBUG_ADSTACK: "1"
+      PYTHONUNBUFFERED: "1"  # unbuffered Python stdout/stderr
+      PYTHONFAULTHANDLER: "1"  # Python dumps tracebacks on fatal signals
+      CUDA_LAUNCH_BLOCKING: "1"  # CUDA kernel launches become synchronous
+      QUADRANTS_CMAKE_ARGS: "-DQD_WITH_VULKAN:BOOL=OFF -DQD_WITH_CUDA:BOOL=ON -DQD_WITH_AMDGPU:BOOL=OFF -DQD_BUILD_TESTS:BOOL=OFF"  # CUDA on; Vulkan, AMDGPU and tests off
+    steps:
+      - name: Checkout Quadrants at the target ref
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ env.QUADRANTS_REF }}  # job-level env value, not necessarily the commit that triggered the run
+          fetch-depth: 1  # shallow clone
+          submodules: recursive
+
+      - name: Set up Python
+        uses: actions/setup-python@v5  # v5 runs on Node 20; v4 targets the deprecated Node 16 runtime
+        with:
+          python-version: "3.10"  # kept quoted: a bare 3.10 would be read as the float 3.1
+
+      - name: Capture system / driver / HMM info + initial dmesg  # best effort: `set +e` keeps going past failing probes
+        run: |
+          set +e
+          mkdir -p /tmp/repro
+          {
+            echo "=== uname / lsb_release ==="
+            uname -a
+            cat /etc/os-release
+            echo "=== nvidia-smi ==="
+            nvidia-smi
+            echo "=== driver version ==="
+            cat /proc/driver/nvidia/version
+            echo "=== HMM device attrs ==="
+            python3 - <<'PY'
+          import ctypes
+          libcuda = ctypes.CDLL('libcuda.so.1'); libcuda.cuInit(0)
+          dev = ctypes.c_int(); libcuda.cuDeviceGet(ctypes.byref(dev), 0)
+          name = ctypes.create_string_buffer(256); libcuda.cuDeviceGetName(name, 256, dev)
+          attr = ctypes.c_int()
+          for code, label in [(83, 'MANAGED_MEMORY'), (88, 'PAGEABLE_MEMORY_ACCESS'),
+                              (100, 'PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES')]:  # 99 is HOST_REGISTER_SUPPORTED
+              libcuda.cuDeviceGetAttribute(ctypes.byref(attr), code, dev)
+              print(f"  {label}: {attr.value}")
+          print(f"  device: {name.value.decode()}")
+          PY
+            echo "=== runner image ==="
+            cat /imagegeneration/imagedata.json 2>/dev/null
+          } | tee /tmp/repro/system_identity.txt
+          # Snapshot dmesg BEFORE the reproducer so we can diff against a post-run snapshot.
+          sudo dmesg --ctime > /tmp/repro/dmesg_before.log 2>/dev/null
+
+      - name: Install OS build deps + CUDA libs + compute-sanitizer  # apt packages; ends by checking compute-sanitizer resolves and runs
+        run: |  # `set -ex`: trace every command and stop at the first failure
+          set -ex
+          sudo apt-get update -y
+          sudo apt-get install -y \
+            build-essential cmake ninja-build pkg-config git \
+            liblz4-dev libncurses-dev libssl-dev libzstd-dev \
+            libcusolver-dev-12-8 libcusolver-12-8 libcusparse-dev-12-8 libcusparse-12-8 \
+            libnvjitlink-12-8 libcublas-12-8 \
+            nvidia-cuda-toolkit
+          echo "/usr/local/cuda/targets/x86_64-linux/lib" | sudo tee /etc/ld.so.conf.d/cuda-targets.conf
+          sudo ldconfig
+          which compute-sanitizer
+          compute-sanitizer --version
+
+      - name: Install Python build deps + LLVM toolchain  # exports LLVM_DIR and puts its bin/ on PATH for later steps
+        run: |
+          set -exo pipefail  # pipefail: a failing download_llvm.py must not be hidden behind `tail`
+          python -m pip install --upgrade pip setuptools wheel
+          pip install --group dev
+          pip install --group test || true
+          pip install -r requirements_test_xdist.txt || true
+          pip install py-spy pytest-repeat
+          LLVM_DIR=$(python download_llvm.py | tail -n 1)
+          echo "LLVM_DIR=${LLVM_DIR}" >> "$GITHUB_ENV"
+          echo "${LLVM_DIR}/bin" >> "$GITHUB_PATH"
+          chmod +x "${LLVM_DIR}"/bin/* || true
+          "${LLVM_DIR}/bin/clang" --version
+
+      - name: Build in develop mode (./build.py --write-env + python setup.py develop)
+        run: |
+          set -exo pipefail  # pipefail: the `| tee` / `| head` pipelines below must not mask a failure
+          # Develop install, no wheel, no isolation. Mirrors the project's local-iteration flow
+          # (`./build.py --shell` + `python setup.py develop`) but non-interactive: `./build.py
+          # --write-env=<file>` dumps the cmake / LLVM / sccache env vars (the same ones the
+          # interactive shell mode exports), we source it, then `setup.py develop` runs skbuild's
+          # cmake build + DevelopWithStubs install. After this the source tree IS the install: edit
+          # a .cpp, then `cd _skbuild/*/cmake-build && cmake --build . -j` and the egg-linked
+          # package picks up the new .so. NEVER `pip install -e` (this project does not use
+          # scikit-build-core as a build backend; editable install ships a python-only stub).
+          ./build.py --write-env=/tmp/build_env.sh
+          set +x; source /tmp/build_env.sh; set -x
+          python setup.py develop
+          python -c "import quadrants as qd; qd.init(arch=qd.cuda); print('cuda init OK')" 2>&1 | tee /tmp/repro/cuda_init.log
+          BUILD_DIR=$(ls -d _skbuild/*/cmake-build 2>/dev/null | head -1)  # aborts here if no build dir exists
+          echo "BUILD_DIR=${BUILD_DIR}" | tee -a "$GITHUB_ENV"
+          ls -la "${BUILD_DIR}/"
+
+      - name: Stage repro helpers under /tmp/repro/  # writes run_reproducer.sh, run_sanitizer.sh, repro_cuda.py and README.md
+        run: |  # every heredoc delimiter below is quoted, so the bodies are written verbatim (no shell expansion)
+          set -ex
+          # Inline-write so the workflow does NOT depend on any file in the source tree.
+          cat <<'SH' > /tmp/repro/run_reproducer.sh
+          #!/bin/bash
+          set -x
+          cd "$(git rev-parse --show-toplevel 2>/dev/null || pwd)"
+          export QD_OFFLINE_CACHE=0 QD_DEBUG_ADSTACK=1 PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 CUDA_LAUNCH_BLOCKING=1
+          # Stream pytest output through this shell and abort the whole run the moment the first
+          # FAILED line appears. pytest-xdist's `-x` does not work, so we drive abort externally:
+          # read the pytest controller's combined stdout/stderr line-by-line; on the first match,
+          # SIGTERM the controller (its xdist workers are direct children and follow). Without this,
+          # `--count 50` would burn the full 50x sweep after the bug has already reproduced.
+          # The calling workflow step pipes our stdout through `tee /tmp/repro/reproducer.log`.
+          exec 3< <(python -m pytest -n 8 --count "${1:-50}" -ra -v --tb=long \
+                      tests/python/test_ad_ndarray.py -k cuda 2>&1)
+          PYTEST_PID=$!
+          ABORTED=0
+          while IFS= read -r line <&3; do
+            printf '%s\n' "$line"
+            if [ "$ABORTED" = "0" ] && [ "${line#*FAILED}" != "$line" ]; then
+              ABORTED=1
+              echo "[abort] FAILED detected; SIGTERM pytest controller pid=$PYTEST_PID + xdist workers"
+              pkill -TERM -P "$PYTEST_PID" 2>/dev/null || true
+              kill -TERM "$PYTEST_PID" 2>/dev/null || true
+            fi
+          done
+          exec 3<&-
+          wait "$PYTEST_PID" 2>/dev/null
+          exit $?
+          SH
+          chmod +x /tmp/repro/run_reproducer.sh
+
+          cat <<'SH' > /tmp/repro/run_sanitizer.sh
+          #!/bin/bash
+          set -x
+          cd "$(git rev-parse --show-toplevel 2>/dev/null || pwd)"
+          export QD_OFFLINE_CACHE=0 QD_DEBUG_ADSTACK=1 PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 CUDA_LAUNCH_BLOCKING=1
+          # `--lf` runs only the previously-failed tests; serial (-n 0) so sanitizer output stays sane. The
+          # file + `-k cuda` selection is repeated because `--lf` runs ALL collected tests when the cache has
+          # no recorded failures; without it that fallback would be the whole suite under memcheck. `-s` turns
+          # off capture so the host-side `[trace dispatch_max_reducers]` fprintfs reach `sanitizer.log`.
+          exec compute-sanitizer --tool memcheck --print-limit 50 --launch-timeout 0 --error-exitcode 99 \
+            python -m pytest -n 0 --lf -x --tb=long -v -s tests/python/test_ad_ndarray.py -k cuda
+          SH
+          chmod +x /tmp/repro/run_sanitizer.sh
+
+          cat <<'PY' > /tmp/repro/repro_cuda.py
+          # Standalone repro of test_ad_fibonacci; faulthandler re-dumps tracebacks to stderr every 15 s. Useful for `py-spy --native --locals`.
+          import argparse, faulthandler, os, sys, time
+          os.environ.setdefault("QD_OFFLINE_CACHE", "0")
+          faulthandler.enable(); faulthandler.dump_traceback_later(15, repeat=True, file=sys.stderr)
+          ap = argparse.ArgumentParser(); ap.add_argument("--arch", default="cuda"); args = ap.parse_args()
+          import quadrants as qd
+          qd.init(arch=getattr(qd, args.arch))
+          N = 15
+          a = qd.ndarray(qd.f32, shape=N, needs_grad=True); b = qd.ndarray(qd.f32, shape=N, needs_grad=True)
+          c = qd.ndarray(qd.i32, shape=N); f = qd.ndarray(qd.f32, shape=N, needs_grad=True)
+          @qd.kernel
+          def fib(a: qd.types.ndarray(), b: qd.types.ndarray(), c: qd.types.ndarray(), f: qd.types.ndarray()):
+              for i in range(N):
+                  p = a[i]; q = b[i]
+                  for j in range(c[i]): p, q = q, p + q
+                  f[i] = q
+          b.fill(1)
+          for i in range(N): c[i] = i
+          fib(a, b, c, f); qd.sync()
+          for i in range(N): f.grad[i] = 1
+          fib.grad(a, b, c, f); qd.sync()
+          print("OK")
+          PY
+
+          cat <<'EOF' > /tmp/repro/README.md  # quoted delimiter: the body is literal, so it must carry no backslash escapes
+          # Quick reference inside the tmate session
+
+          The Quadrants source tree is at $GITHUB_WORKSPACE; the cmake build dir is at
+          _skbuild/<linux-arch>/cmake-build (path captured in env var BUILD_DIR).
+
+          Reproducer (-n 8 --count 50; full test_ad_ndarray.py -k cuda):
+              bash /tmp/repro/run_reproducer.sh
+
+          compute-sanitizer (memcheck) over previously-failed tests via pytest --lf:
+              bash /tmp/repro/run_sanitizer.sh
+
+          Standalone repro of test_ad_fibonacci with faulthandler self-dump:
+              python /tmp/repro/repro_cuda.py
+
+          Iterating: edit a .cpp file, rebuild only the changed compile unit, then copy the new .so
+          back into the installed package (NEVER `pip install -e` for iterative rebuilds; the
+          project does not use scikit-build-core, the editable install ships only python).
+              cd $GITHUB_WORKSPACE
+              cd $BUILD_DIR && cmake --build . -j && cd -
+              cp $BUILD_DIR/quadrants_python.cpython-310-x86_64-linux-gnu.so \
+                 python/quadrants/_lib/core/
+              bash /tmp/repro/run_reproducer.sh
+
+          Native stack of a hung repro (from a second tmate window):
+              sudo py-spy dump --pid $(pgrep -f 'python -m pytest') --native --locals
+
+          Recovering after a GPU wedge: kill the live processes, then
+              sudo touch /continue      # frees the workflow (or: touch continue in $GITHUB_WORKSPACE)
+
+          dmesg Xid history (filled in pre-test, post-reproducer, post-sanitizer):
+              cat /tmp/repro/dmesg_*.log | grep -E 'Xid|MMU Fault'
+          EOF
+          ls -la /tmp/repro/
+
+      - name: Tmate detached (BEFORE reproducer, limit-access-to-actor)
+        # Detached so it returns immediately and the workflow continues into the reproducer / sanitizer
+        # / artifact upload steps. The SSH command is printed in the workflow log; attach any time to
+        # iterate inside the same source tree (`cd _skbuild/*/cmake-build && cmake --build .`). The session
+        # outlives this step. `timeout-minutes` is a step-level key; as a `with:` input it is not recognised.
+        if: always()
+        uses: mxschmitt/action-tmate@v3
+        timeout-minutes: ${{ fromJSON(env.TMATE_TIMEOUT) }}
+        with:
+          limit-access-to-actor: true
+          detached: true
+
+      - name: Reproducer (-n 8 --count ${{ env.REPRODUCE_COUNT }}, CUDA_LAUNCH_BLOCKING=1)
+        id: reproduce  # later steps read steps.reproduce.outputs.exit_code
+        continue-on-error: true
+        run: |
+          set -x
+          bash /tmp/repro/run_reproducer.sh "${REPRODUCE_COUNT}" 2>&1 | tee /tmp/repro/reproducer.log
+          echo "exit_code=${PIPESTATUS[0]}" >> "$GITHUB_OUTPUT"
+          # Snapshot dmesg right after the reproducer so the new Xid lines are visible.
+          sudo dmesg --ctime > /tmp/repro/dmesg_after_reproducer.log 2>/dev/null
+          echo "=== Xid lines added by the reproducer ==="
+          diff /tmp/repro/dmesg_before.log /tmp/repro/dmesg_after_reproducer.log \
+            | grep -E "Xid|MMU Fault|nvidia_uvm" | tee /tmp/repro/xid_added.log
+
+      - name: compute-sanitizer over previously-failed tests
+        if: always() && steps.reproduce.outcome != 'skipped' && steps.reproduce.outputs.exit_code != '0'  # a skipped reproducer leaves nothing to re-run
+        continue-on-error: true
+        run: |
+          set -x
+          bash /tmp/repro/run_sanitizer.sh 2>&1 | tee /tmp/repro/sanitizer.log
+
+      - name: Final dmesg snapshot
+        if: always()
+        run: |
+          sudo dmesg --ctime 2>/dev/null > /tmp/repro/dmesg_final.log
+          printf '%s\n' '=== Xid count by code (final) ==='
+          2>/dev/null grep -oE 'Xid \(PCI:[^)]*\): [0-9]+' /tmp/repro/dmesg_final.log |
+            awk '{ print $NF }' | sort | uniq -c | sort -rn | tee /tmp/repro/xid_summary.txt
+
+      - name: Upload all repro artifacts
+        if: always()  # upload even when an earlier step failed
+        uses: actions/upload-artifact@v4
+        with:
+          name: debug_cuda_repro_artifacts
+          path: |
+            /tmp/repro/
+          if-no-files-found: ignore  # an empty or missing /tmp/repro/ does not fail the step
+
+      - name: Re-emit reproducer exit code
+        if: always()
+        run: |
+          # Surface the reproducer's status as this step's own result; a blank value (the
+          # reproducer step never wrote its output) is reported as exit status 1.
+          rc="${{ steps.reproduce.outputs.exit_code }}"
+          printf 'reproducer exit code: %s\n' "${rc}"
+          case "${rc}" in 0) ;; *) exit "${rc:-1}" ;; esac
diff --git a/.github/workflows/debug_metal_grad.yml.disabled b/.github/workflows/debug_metal_grad.yml.disabled
new file mode 100644
index 0000000000..669fcea565
--- /dev/null
+++ b/.github/workflows/debug_metal_grad.yml.disabled
@@ -0,0 +1,236 @@
+name: Debug Metal AD pipeline
+
+# DO NOT MERGE. This workflow exists only on `duburcqa/debug_metal_grad_repro`
+# to reproduce the macos-15 (Apple M1 Virtual) reverse-grad pipeline-creation
+# failure seen in Genesis CI run 25099014645 (Genesis PR #2743). All other
+# workflows on this branch are renamed to `*.yml.disabled` so this is the
+# only job that runs on PR sync.
+
+on:
+  pull_request:  # no inputs on this event; the job-level GENESIS_REF fallback applies
+    types: [opened, reopened, synchronize]
+  workflow_dispatch:
+    inputs:
+      genesis_ref:
+        description: "Genesis ref to test against (branch, tag, or SHA)"
+        required: false
+        default: "f8ed535c83dd275b321009f33952a7e0b4e67a3d"  # same SHA as the job-level GENESIS_REF fallback
+
+concurrency:
+  cancel-in-progress: true
+  group: ${{ github.workflow }}-${{ github.head_ref || github.ref }}
+
+jobs:
+  repro:
+    name: Build Quadrants + run failing Genesis grad tests (${{ matrix.os }})
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [macos-26]  # NOTE(review): the file header cites a macos-15 failure - confirm macos-26 is intended
+    runs-on: ${{ matrix.os }}
+
+    env:
+      GENESIS_REF: ${{ github.event.inputs.genesis_ref || 'f8ed535c83dd275b321009f33952a7e0b4e67a3d' }}  # `||` fallback covers runs without inputs
+      HF_HUB_DOWNLOAD_TIMEOUT: "60"
+      FORCE_COLOR: "1"
+      PY_COLORS: "1"
+      GS_CACHE_FILE_PATH: ".cache/genesis"
+      GS_ENABLE_NDARRAY: "1"
+      GS_TORCH_FORCE_CPU_DEVICE: "1"
+      # Cache off so the rebuild always observes the patched metal_device.mm.
+      QD_OFFLINE_CACHE: "0"
+      QD_OFFLINE_CACHE_CLEANING_POLICY: "never"
+      QD_OFFLINE_CACHE_FILE_PATH: ".cache/quadrants"
+      QD_ENABLE_CUDA: "0"
+      QD_ENABLE_AMDGPU: "0"
+      QD_ENABLE_METAL: "1"  # presumably restricts Quadrants to the Metal backend together with the two "0" flags above - confirm
+      # Instrumentation (verbose logging, IR / MSL dumps) is commented out for this run so the workflow
+      # exercises the production path with no extra side-effects on memory pressure or pipeline-create timing.
+      # QD_DEBUG: "1"
+      # QD_LOG_LEVEL: "trace"
+      # QD_DUMP_MSL: "1"
+      # QD_DUMP_IR: "1"
+      OMNI_KIT_ACCEPT_EULA: "yes"
+
+    steps:
+      - name: Checkout Quadrants
+        uses: actions/checkout@v4
+        with:
+          path: quadrants  # later steps use this as their working-directory
+          submodules: recursive
+          fetch-depth: 1  # shallow clone
+
+      - name: Checkout Genesis
+        uses: actions/checkout@v4
+        with:
+          repository: Genesis-Embodied-AI/Genesis
+          ref: ${{ env.GENESIS_REF }}  # dispatch input, or the pinned fallback SHA from the job env
+          path: genesis  # checked out next to the quadrants/ tree
+          fetch-depth: 1  # shallow clone
+
+      # - name: Print system information
+      #   run: |
+      #     sw_vers
+      #     uname -a
+      #     system_profiler SPDisplaysDataType | head -40 || true
+      #     clang --version
+      #     sysctl hw.memsize hw.physicalcpu hw.ncpu hw.cputype hw.cpusubtype hw.cpubrand_string 2>&1 | head -10
+      #     vm_stat 2>&1 | head -20
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"  # quoted so the version stays a string
+
+      - name: Quadrants - prerequisites
+        working-directory: quadrants  # the script path below is relative to the Quadrants checkout
+        run: bash .github/workflows/scripts_new/macosx/1_prerequisites.sh
+
+      - name: Quadrants - build wheel  # presumably leaves the wheel in quadrants/dist/, which the install steps read - confirm
+        working-directory: quadrants
+        run: bash .github/workflows/scripts_new/macosx/2_build.sh
+
+      - name: Install built Quadrants wheel  # installs dist/*.whl, then imports the package as a smoke test
+        working-directory: quadrants
+        run: |
+          python -m pip install --upgrade pip setuptools wheel
+          pip install dist/*.whl
+          python -c "import quadrants as qd; print('Quadrants version:', qd.__version__)"
+
+      - name: Install torch (CPU build, matches Genesis CI)
+        run: |  # --index-url points pip at the PyTorch CPU wheel index
+          pip3 install torch --upgrade --index-url https://download.pytorch.org/whl/cpu
+
+      - name: Install Genesis (and pin our Quadrants wheel afterwards)
+        working-directory: genesis  # hence the ../quadrants/dist path to the locally built wheel
+        run: |
+          pip install ".[dev,usd]"
+          # Genesis pulls quadrants from PyPI as a dep; override with our locally built wheel.
+          pip install --force-reinstall --no-deps ../quadrants/dist/*.whl
+          python -c "import quadrants as qd; print('Quadrants version after pin:', qd.__version__)"
+          python -c "import genesis as gs; print('Genesis version:', gs.__version__)"
+
+      - name: Run failing reverse-grad tests
+        id: run_tests  # the final step reads steps.run_tests.outputs.exit_code
+        working-directory: genesis  # pytest.log is therefore written to genesis/pytest.log
+        continue-on-error: true  # later steps still run; the final step re-emits the exit code
+        run: |
+          set -x
+          # Run the entire test_grad module on the gpu backend so this exercises every reverse-mode path the
+          # sparse-adstack-heap PR touches, not just the original M1 push-test repro.
+          pytest -v -ra --logical --dev --backend gpu -n 0 -s \
+            ./tests/test_grad.py 2>&1 | tee pytest.log
+          echo "exit_code=${PIPESTATUS[0]}" >> $GITHUB_OUTPUT
+
+      # - name: Capture jetsam / MTLCompilerService kill events from system log
+      #   if: always()
+      #   run: |
+      #     mkdir -p /tmp/syslog
+      #     log show --last 10m --predicate 'eventMessage CONTAINS "jetsam" OR eventMessage CONTAINS "MTLCompilerService" OR eventMessage CONTAINS "MTLCompiler" OR eventMessage CONTAINS "AGXCompiler"' 2>&1 | head -200 > /tmp/syslog/jetsam_and_metal.txt || true
+      #     wc -l /tmp/syslog/jetsam_and_metal.txt
+      #     head -50 /tmp/syslog/jetsam_and_metal.txt
+      #     log show --last 10m --predicate 'eventMessage CONTAINS "lowSwap"' 2>&1 | head -100 > /tmp/syslog/lowSwap.txt || true
+      #     memory_pressure 2>&1 | tee /tmp/syslog/memory_pressure_now.txt | head -10 || true
+
+      - name: Upload pytest log
+        if: always()  # upload even when an earlier step failed
+        uses: actions/upload-artifact@v4
+        with:
+          name: pytest-log-${{ matrix.os }}
+          path: genesis/pytest.log  # written by the test step's `tee pytest.log`
+          if-no-files-found: ignore  # a missing log does not fail the step
+
+      # - name: Upload IR / MSL dumps
+      #   if: always()
+      #   uses: actions/upload-artifact@v4
+      #   with:
+      #     name: ir-dump-${{ matrix.os }}
+      #     path: /tmp/ir/
+      #     if-no-files-found: ignore
+
+      # - name: Capture Apple's actual Metal-compiler output per kernel (xcrun metal -c)
+      #   if: always()
+      #   working-directory: genesis
+      #   run: |
+      #     set -x
+      #     xcrun metal --version 2>&1 | head -5 | tee /tmp/metal_toolchain_version.txt
+      #     xcrun --show-sdk-version | tee -a /tmp/metal_toolchain_version.txt
+      #     mkdir -p /tmp/metal_compile
+      #     python3 - <<'PY'
+      #     import re, os, subprocess
+      #     os.makedirs('/tmp/metal_compile', exist_ok=True)
+      #     log = open('pytest.log', 'r', errors='replace').read()
+      #     pattern = re.compile(r"=== MSL for kernel '([^']+)' ===\n(.*?)\n=== END MSL ===", re.DOTALL)
+      #     summary = []
+      #     for m in pattern.finditer(log):
+      #         name, body = m.group(1), m.group(2)
+      #         metal_path = f'/tmp/metal_compile/{name}.metal'
+      #         air_path = f'/tmp/metal_compile/{name}.air'
+      #         diag_path = f'/tmp/metal_compile/{name}.diag.txt'
+      #         with open(metal_path, 'w') as f:
+      #             f.write(body)
+      #         proc = subprocess.run(
+      #             ['xcrun', '-sdk', 'macosx', 'metal', '-std=macos-metal2.3', '-v', '-c', metal_path,
+      #              '-o', air_path],
+      #             capture_output=True, text=True, timeout=300)
+      #         with open(diag_path, 'w') as f:
+      #             f.write(f'# rc={proc.returncode}\n# stdout:\n{proc.stdout}\n# stderr:\n{proc.stderr}\n')
+      #         air_size = os.path.getsize(air_path) if os.path.exists(air_path) else 0
+      #         msl_size = os.path.getsize(metal_path)
+      #         summary.append((proc.returncode, msl_size, air_size, name))
+      #         print(f'[{proc.returncode}] {name}: msl={msl_size} bytes, air={air_size} bytes')
+      #     summary.sort(key=lambda r: (r[0], -r[1]))
+      #     with open('/tmp/metal_compile/_summary.tsv', 'w') as f:
+      #         f.write('returncode\tmsl_bytes\tair_bytes\tkernel\n')
+      #         for rc, ms, ab, n in summary:
+      #             f.write(f'{rc}\t{ms}\t{ab}\t{n}\n')
+      #     fails = [r for r in summary if r[0] != 0]
+      #     if fails:
+      #         print(f'\n=== {len(fails)} kernel(s) failed xcrun metal -c on ${{ matrix.os }} ===')
+      #         for rc, ms, ab, n in fails[:5]:
+      #             print(f'  rc={rc} msl_bytes={ms} kernel={n}')
+      #         first = fails[0][3]
+      #         print(f'\n=== Diagnostic for {first} ===')
+      #         with open(f'/tmp/metal_compile/{first}.diag.txt') as f:
+      #             print(f.read())
+      #     else:
+      #         print(f'\nAll {len(summary)} kernel(s) compiled cleanly via xcrun metal -c on ${{ matrix.os }}.')
+      #     PY
+
+      # - name: Upload xcrun metal compile diagnostics
+      #   if: always()
+      #   uses: actions/upload-artifact@v4
+      #   with:
+      #     name: metal-compile-${{ matrix.os }}
+      #     path: |
+      #       /tmp/metal_compile/
+      #       /tmp/metal_toolchain_version.txt
+      #     if-no-files-found: ignore
+
+      # - name: Upload memory / jetsam diagnostics
+      #   if: always()
+      #   uses: actions/upload-artifact@v4
+      #   with:
+      #     name: memory-diagnostics-${{ matrix.os }}
+      #     path: |
+      #       /tmp/mem_before.txt
+      #       /tmp/mem_after.txt
+      #       /tmp/syslog/
+      #     if-no-files-found: ignore
+
+      # - name: Setup tmate session for interactive debugging
+      #   if: always()
+      #   uses: mxschmitt/action-tmate@v3
+      #   with:
+      #     limit-access-to-actor: true
+      #     detached: false
+      #     timeout-minutes: 60
+
+      - name: Re-emit pytest exit code
+        if: always()
+        run: |
+          # Surface pytest's status as this step's own result; a blank value (the test step
+          # never wrote its output) is reported as exit status 1.
+          rc="${{ steps.run_tests.outputs.exit_code }}"
+          printf 'pytest exit code: %s\n' "${rc}"
+          case "${rc}" in 0) ;; *) exit "${rc:-1}" ;; esac
diff --git a/.github/workflows/linters.yml b/.github/workflows/linters.yml.disabled
similarity index 100%
rename from .github/workflows/linters.yml
rename to .github/workflows/linters.yml.disabled
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml.disabled
similarity index 100%
rename from .github/workflows/linux.yml
rename to .github/workflows/linux.yml.disabled
diff --git a/.github/workflows/macosx.yml b/.github/workflows/macosx.yml.disabled
similarity index 100%
rename from .github/workflows/macosx.yml
rename to .github/workflows/macosx.yml.disabled
diff --git a/.github/workflows/manylinux_wheel.yml b/.github/workflows/manylinux_wheel.yml.disabled
similarity index 100%
rename from .github/workflows/manylinux_wheel.yml
rename to .github/workflows/manylinux_wheel.yml.disabled
diff --git a/.github/workflows/pr_change_report.yml b/.github/workflows/pr_change_report.yml.disabled
similarity index 100%
rename from .github/workflows/pr_change_report.yml
rename to .github/workflows/pr_change_report.yml.disabled
diff --git a/.github/workflows/publish_pypi.yml b/.github/workflows/publish_pypi.yml.disabled
similarity index 100%
rename from .github/workflows/publish_pypi.yml
rename to .github/workflows/publish_pypi.yml.disabled
diff --git a/.github/workflows/pyright_linter.yml b/.github/workflows/pyright_linter.yml.disabled
similarity index 100%
rename from .github/workflows/pyright_linter.yml
rename to .github/workflows/pyright_linter.yml.disabled
diff --git a/.github/workflows/test_gpu.yml b/.github/workflows/test_gpu.yml.disabled
similarity index 100%
rename from .github/workflows/test_gpu.yml
rename to .github/workflows/test_gpu.yml.disabled
diff --git a/.github/workflows/win.yml b/.github/workflows/win.yml.disabled
similarity index 100%
rename from .github/workflows/win.yml
rename to .github/workflows/win.yml.disabled