diff --git a/.github/workflows/api_doc.yml b/.github/workflows/api_doc.yml.disabled similarity index 100% rename from .github/workflows/api_doc.yml rename to .github/workflows/api_doc.yml.disabled diff --git a/.github/workflows/check_deleted_comments.yml b/.github/workflows/check_deleted_comments.yml.disabled similarity index 100% rename from .github/workflows/check_deleted_comments.yml rename to .github/workflows/check_deleted_comments.yml.disabled diff --git a/.github/workflows/check_feature_factorization.yml b/.github/workflows/check_feature_factorization.yml.disabled similarity index 100% rename from .github/workflows/check_feature_factorization.yml rename to .github/workflows/check_feature_factorization.yml.disabled diff --git a/.github/workflows/check_markup_links.yml b/.github/workflows/check_markup_links.yml.disabled similarity index 100% rename from .github/workflows/check_markup_links.yml rename to .github/workflows/check_markup_links.yml.disabled diff --git a/.github/workflows/check_test_coverage.yml b/.github/workflows/check_test_coverage.yml.disabled similarity index 100% rename from .github/workflows/check_test_coverage.yml rename to .github/workflows/check_test_coverage.yml.disabled diff --git a/.github/workflows/check_wrapping.yml b/.github/workflows/check_wrapping.yml.disabled similarity index 100% rename from .github/workflows/check_wrapping.yml rename to .github/workflows/check_wrapping.yml.disabled diff --git a/.github/workflows/clang_tidy.yml b/.github/workflows/clang_tidy.yml.disabled similarity index 100% rename from .github/workflows/clang_tidy.yml rename to .github/workflows/clang_tidy.yml.disabled diff --git a/.github/workflows/debug_cuda_max_reducer.yml b/.github/workflows/debug_cuda_max_reducer.yml new file mode 100644 index 0000000000..d157389f37 --- /dev/null +++ b/.github/workflows/debug_cuda_max_reducer.yml @@ -0,0 +1,330 @@ +name: Debug CUDA T4 max-reducer hang + +# DO NOT MERGE. 
Single-job workflow that builds Quadrants in editable mode on a `gpu-t4-4-core` +# runner, opens a detached tmate session for live attach, then runs the proven reproducer + +# compute-sanitizer trace + dmesg Xid capture in the background while you iterate. +# +# Findings carried over from the previous debug session: +# * Reproducer = `pytest -n 8 --count 50 tests/python/test_ad_ndarray.py -k cuda` on T4. ~50% of +# workers see their FIRST or SECOND launched test crash with `CUDA_ERROR_ILLEGAL_ADDRESS while +# calling cuStreamSynchronize / cuMemsetD8 / cuMemFreeAsync`. +# * `dmesg` shows `Xid 31 ... MMU Fault: ENGINE GRAPHICS GPC0 GPCCLIENT_T1_0 faulted @ 0x???_??000000. +# Fault is of type FAULT_PDE ACCESS_TYPE_VIRT_WRITE` followed by `Xid 109 CTX SWITCH TIMEOUT`. +# A CUDA kernel is writing to a stale / out-of-bounds device pointer; subsequent ops fail until +# the GPU is wedged. +# * Host stack at the surfaced symptom = `Ndarray::write_float -> Ndarray::write -> allocate_memory +# (small staging buffer) -> cuMemset` failing because the context is already corrupted by the +# earlier OOB write. +# +# This workflow: +# 1. Builds Quadrants editable, CUDA-only (no Vulkan / AMDGPU / cpp tests, so iteration via +# `cd _skbuild/*/cmake-build && cmake --build .` inside tmate is fast). +# 2. Pre-installs `nvidia-cuda-toolkit` so `compute-sanitizer` is available. +# 3. Captures system / driver / HMM info + the pre-test `dmesg` snapshot. +# 4. Opens a detached tmate session - SSH command is printed in the workflow log; the workflow +# continues straight into the reproducer / sanitizer / artifact-upload steps. +# 5. Runs the reproducer with `CUDA_LAUNCH_BLOCKING=1`, streaming pytest output through an +# abort wrapper that SIGTERMs the controller on the first FAILED line. +# 6. Snapshots `dmesg` again so the new Xid lines are visible. +# 7. Re-runs the previously-failed tests under `compute-sanitizer --tool memcheck`. Its first +# `Invalid write` block is the offending kernel. 
+# 8. Uploads `/tmp/repro/` as an artifact. Tmate keeps running until `timeout-minutes` or job end. + +on: + pull_request: + types: [opened, reopened, synchronize] + workflow_dispatch: + inputs: + quadrants_ref: + description: "Quadrants ref to build + test (branch / tag / SHA). Default = tip of the source branch holding the in-flight debug instrumentation." + required: false + default: "duburcqa/turing_stream_pin_experiment" + tmate_timeout_minutes: + description: "tmate session timeout (minutes). `touch /tmp/continue` from inside the shell to skip the rest of the wait." + required: false + default: "60" + reproduce_count: + description: "pytest --count value for the reproducer (multiplier per test)." + required: false + default: "50" + +concurrency: + # Group keyed on the run id so two manual `workflow_dispatch` invocations (or a push synced on top + # of an active tmate session) coexist instead of cancelling each other. Debug runs are interactive + # via tmate; cancelling a live tmate session because a new push landed is exactly what we do not + # want here. 
+ group: ${{ github.workflow }}-${{ github.run_id }} + cancel-in-progress: false + +jobs: + debug_cuda_t4: + name: Build + reproduce + sanitizer + tmate on CUDA T4 + runs-on: gpu-t4-4-core + env: + QUADRANTS_REF: ${{ github.event.inputs.quadrants_ref || 'duburcqa/turing_stream_pin_experiment' }} + TMATE_TIMEOUT: ${{ github.event.inputs.tmate_timeout_minutes || '60' }} + REPRODUCE_COUNT: ${{ github.event.inputs.reproduce_count || '50' }} + QD_OFFLINE_CACHE: "0" + QD_OFFLINE_CACHE_CLEANING_POLICY: "never" + QD_DEBUG_ADSTACK: "1" + PYTHONUNBUFFERED: "1" + PYTHONFAULTHANDLER: "1" + CUDA_LAUNCH_BLOCKING: "1" + QUADRANTS_CMAKE_ARGS: "-DQD_WITH_VULKAN:BOOL=OFF -DQD_WITH_CUDA:BOOL=ON -DQD_WITH_AMDGPU:BOOL=OFF -DQD_BUILD_TESTS:BOOL=OFF" + steps: + - name: Checkout Quadrants at the target ref + uses: actions/checkout@v4 + with: + ref: ${{ env.QUADRANTS_REF }} + fetch-depth: 1 + submodules: recursive + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Capture system / driver / HMM info + initial dmesg + run: | + set +e + mkdir -p /tmp/repro + { + echo "=== uname / lsb_release ===" + uname -a + cat /etc/os-release + echo "=== nvidia-smi ===" + nvidia-smi + echo "=== driver version ===" + cat /proc/driver/nvidia/version + echo "=== HMM device attrs ===" + python3 - <<'PY' + import ctypes + libcuda = ctypes.CDLL('libcuda.so.1'); libcuda.cuInit(0) + dev = ctypes.c_int(); libcuda.cuDeviceGet(ctypes.byref(dev), 0) + name = ctypes.create_string_buffer(256); libcuda.cuDeviceGetName(name, 256, dev) + attr = ctypes.c_int() + for code, label in [(83, 'MANAGED_MEMORY'), (88, 'PAGEABLE_MEMORY_ACCESS'), + (100, 'PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES')]: + libcuda.cuDeviceGetAttribute(ctypes.byref(attr), code, dev) + print(f" {label}: {attr.value}") + print(f" device: {name.value.decode()}") + PY + echo "=== runner image ===" + cat /imagegeneration/imagedata.json 2>/dev/null + } | tee /tmp/repro/system_identity.txt + # Snapshot dmesg 
BEFORE the reproducer so we can diff against a post-run snapshot. + sudo dmesg --ctime > /tmp/repro/dmesg_before.log 2>/dev/null + + - name: Install OS build deps + CUDA libs + compute-sanitizer + run: | + set -ex + sudo apt-get update -y + sudo apt-get install -y \ + build-essential cmake ninja-build pkg-config git \ + liblz4-dev libncurses-dev libssl-dev libzstd-dev \ + libcusolver-dev-12-8 libcusolver-12-8 libcusparse-dev-12-8 libcusparse-12-8 \ + libnvjitlink-12-8 libcublas-12-8 \ + nvidia-cuda-toolkit + echo "/usr/local/cuda/targets/x86_64-linux/lib" | sudo tee /etc/ld.so.conf.d/cuda-targets.conf + sudo ldconfig + which compute-sanitizer + compute-sanitizer --version + + - name: Install Python build deps + LLVM toolchain + run: | + set -ex + python -m pip install --upgrade pip setuptools wheel + pip install --group dev + pip install --group test || true + pip install -r requirements_test_xdist.txt || true + pip install py-spy pytest-repeat + LLVM_DIR=$(python download_llvm.py | tail -n 1) + echo "LLVM_DIR=${LLVM_DIR}" >> $GITHUB_ENV + echo "${LLVM_DIR}/bin" >> $GITHUB_PATH + chmod +x ${LLVM_DIR}/bin/* || true + ${LLVM_DIR}/bin/clang --version + + - name: Build in develop mode (./build.py --write-env + python setup.py develop) + run: | + set -ex + # Develop install, no wheel, no isolation. Mirrors the project's local-iteration flow + # (`./build.py --shell` + `python setup.py develop`) but non-interactive: `./build.py + # --write-env=` dumps the cmake / LLVM / sccache env vars (the same ones the + # interactive shell mode exports), we source it, then `setup.py develop` runs skbuild's + # cmake build + DevelopWithStubs install. After this the source tree IS the install: edit + # a .cpp, then `cd _skbuild/*/cmake-build && cmake --build . -j` and the egg-linked + # package picks up the new .so. NEVER `pip install -e` (this project does not use + # scikit-build-core as a build backend; editable install ships a python-only stub). 
+ ./build.py --write-env=/tmp/build_env.sh + set +x; source /tmp/build_env.sh; set -x + python setup.py develop + python -c "import quadrants as qd; qd.init(arch=qd.cuda); print('cuda init OK')" 2>&1 | tee /tmp/repro/cuda_init.log + BUILD_DIR=$(ls -d _skbuild/*/cmake-build 2>/dev/null | head -1) + echo "BUILD_DIR=${BUILD_DIR}" | tee -a $GITHUB_ENV + ls -la "${BUILD_DIR}/" + + - name: Stage repro helpers under /tmp/repro/ + run: | + set -ex + # Inline-write so the workflow does NOT depend on any file in the source tree. + cat <<'SH' > /tmp/repro/run_reproducer.sh + #!/bin/bash + set -x + cd "$(git rev-parse --show-toplevel 2>/dev/null || pwd)" + export QD_OFFLINE_CACHE=0 QD_DEBUG_ADSTACK=1 PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 CUDA_LAUNCH_BLOCKING=1 + # Stream pytest output through this shell and abort the whole run the moment the first + # FAILED line appears. pytest-xdist's `-x` does not work, so we drive abort externally: + # read the pytest controller's combined stdout/stderr line-by-line; on the first match, + # SIGTERM the controller (its xdist workers are direct children and follow). Without this, + # `--count 50` would burn the full 50x sweep after the bug has already reproduced. + # The calling workflow step pipes our stdout through `tee /tmp/repro/reproducer.log`. + exec 3< <(python -m pytest -n 8 --count "${1:-50}" -ra -v --tb=long \ + tests/python/test_ad_ndarray.py -k cuda 2>&1) + PYTEST_PID=$! + ABORTED=0 + while IFS= read -r line <&3; do + printf '%s\n' "$line" + if [ "$ABORTED" = "0" ] && [ "${line#*FAILED}" != "$line" ]; then + ABORTED=1 + echo "[abort] FAILED detected; SIGTERM pytest controller pid=$PYTEST_PID + xdist workers" + pkill -TERM -P "$PYTEST_PID" 2>/dev/null || true + kill -TERM "$PYTEST_PID" 2>/dev/null || true + fi + done + exec 3<&- + wait "$PYTEST_PID" 2>/dev/null + exit $? 
+ SH + chmod +x /tmp/repro/run_reproducer.sh + + cat <<'SH' > /tmp/repro/run_sanitizer.sh + #!/bin/bash + set -x + cd "$(git rev-parse --show-toplevel 2>/dev/null || pwd)" + export QD_OFFLINE_CACHE=0 QD_DEBUG_ADSTACK=1 PYTHONUNBUFFERED=1 PYTHONFAULTHANDLER=1 CUDA_LAUNCH_BLOCKING=1 + # `--lf` runs only the previously-failed tests; serial (-n 0) so sanitizer output stays sane. + # `-s` disables pytest stdout/stderr capture so the host-side `[trace dispatch_max_reducers]` + # fprintfs land in the captured log immediately preceding the sanitizer fault block (without + # `-s` pytest swallows stderr per-test and the trace lines never reach `sanitizer.log`). + exec compute-sanitizer --tool memcheck --print-limit 50 --launch-timeout 0 --error-exitcode 99 \ + python -m pytest -n 0 --lf -x --tb=long -v -s + SH + chmod +x /tmp/repro/run_sanitizer.sh + + cat <<'PY' > /tmp/repro/repro_cuda.py + # Standalone repro of test_ad_fibonacci with faulthandler. Useful for `py-spy --native --locals`. + import argparse, faulthandler, os, sys, time + os.environ.setdefault("QD_OFFLINE_CACHE", "0") + faulthandler.enable(); faulthandler.dump_traceback_later(15, repeat=True, file=sys.stderr) + ap = argparse.ArgumentParser(); ap.add_argument("--arch", default="cuda"); args = ap.parse_args() + import quadrants as qd + qd.init(arch=getattr(qd, args.arch)) + N = 15 + a = qd.ndarray(qd.f32, shape=N, needs_grad=True); b = qd.ndarray(qd.f32, shape=N, needs_grad=True) + c = qd.ndarray(qd.i32, shape=N); f = qd.ndarray(qd.f32, shape=N, needs_grad=True) + @qd.kernel + def fib(a: qd.types.ndarray(), b: qd.types.ndarray(), c: qd.types.ndarray(), f: qd.types.ndarray()): + for i in range(N): + p = a[i]; q = b[i] + for j in range(c[i]): p, q = q, p + q + f[i] = q + b.fill(1) + for i in range(N): c[i] = i + fib(a, b, c, f); qd.sync() + for i in range(N): f.grad[i] = 1 + fib.grad(a, b, c, f); qd.sync() + print("OK") + PY + + cat <<'EOF' > /tmp/repro/README.md + # Quick reference inside the tmate session + + The 
Quadrants source tree is at $GITHUB_WORKSPACE; the cmake build dir is at + _skbuild/*/cmake-build (path captured in env var BUILD_DIR). + + Reproducer (-n 8 --count 50; full test_ad_ndarray.py -k cuda): + bash /tmp/repro/run_reproducer.sh + + compute-sanitizer (memcheck) over previously-failed tests via pytest --lf: + bash /tmp/repro/run_sanitizer.sh + + Standalone repro of test_ad_fibonacci with faulthandler self-dump: + python /tmp/repro/repro_cuda.py + + Iterating: edit a .cpp file, rebuild only the changed compile unit, then copy the new .so + back into the installed package (NEVER `pip install -e` for iterative rebuilds; the + project does not use scikit-build-core, the editable install ships only python). + cd $GITHUB_WORKSPACE + cd $BUILD_DIR && cmake --build . -j && cd - + cp $BUILD_DIR/quadrants_python.cpython-310-x86_64-linux-gnu.so \ + python/quadrants/_lib/core/ + bash /tmp/repro/run_reproducer.sh + + Native stack of a hung repro (from a second tmate window): + sudo py-spy dump --pid $(pgrep -f 'python -m pytest') --native --locals + + Recovering after a GPU wedge: kill the live processes, then + touch /tmp/continue # frees the workflow + + dmesg Xid history (filled in pre-test, post-reproducer, post-sanitizer): + cat /tmp/repro/dmesg_*.log | grep -E 'Xid|MMU Fault' + EOF + ls -la /tmp/repro/ + + - name: Tmate detached (BEFORE reproducer, limit-access-to-actor) + # Detached so it returns immediately and the workflow continues into the reproducer / sanitizer + # / artifact upload steps. The SSH command is printed in the workflow log; attach any time to + # iterate inside the same source tree (`cd _skbuild/*/cmake-build && cmake --build .`). The + # detached session lives until the job ends or `timeout-minutes` elapses. 
+ if: always() + uses: mxschmitt/action-tmate@v3 + with: + limit-access-to-actor: true + detached: true + timeout-minutes: ${{ fromJSON(env.TMATE_TIMEOUT) }} + + - name: Reproducer (-n 8 --count ${{ env.REPRODUCE_COUNT }}, CUDA_LAUNCH_BLOCKING=1) + id: reproduce + continue-on-error: true + run: | + set -x + bash /tmp/repro/run_reproducer.sh "${REPRODUCE_COUNT}" 2>&1 | tee /tmp/repro/reproducer.log + echo "exit_code=${PIPESTATUS[0]}" >> $GITHUB_OUTPUT + # Snapshot dmesg right after the reproducer so the new Xid lines are visible. + sudo dmesg --ctime > /tmp/repro/dmesg_after_reproducer.log 2>/dev/null + echo "=== Xid lines added by the reproducer ===" + diff /tmp/repro/dmesg_before.log /tmp/repro/dmesg_after_reproducer.log \ + | grep -E "Xid|MMU Fault|nvidia_uvm" | tee /tmp/repro/xid_added.log + + - name: compute-sanitizer over previously-failed tests + if: always() && steps.reproduce.outputs.exit_code != '0' + continue-on-error: true + run: | + set -x + bash /tmp/repro/run_sanitizer.sh 2>&1 | tee /tmp/repro/sanitizer.log + + - name: Final dmesg snapshot + if: always() + run: | + sudo dmesg --ctime > /tmp/repro/dmesg_final.log 2>/dev/null + echo "=== Xid count by code (final) ===" + grep -oE "Xid \(PCI:[^)]*\): [0-9]+" /tmp/repro/dmesg_final.log 2>/dev/null \ + | awk '{print $NF}' | sort | uniq -c | sort -rn | tee /tmp/repro/xid_summary.txt + + - name: Upload all repro artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: debug_cuda_repro_artifacts + path: | + /tmp/repro/ + if-no-files-found: ignore + + - name: Re-emit reproducer exit code + if: always() + run: | + ec="${{ steps.reproduce.outputs.exit_code }}" + echo "reproducer exit code: ${ec}" + if [ "${ec}" != "0" ]; then + exit "${ec:-1}" + fi diff --git a/.github/workflows/debug_metal_grad.yml.disabled b/.github/workflows/debug_metal_grad.yml.disabled new file mode 100644 index 0000000000..669fcea565 --- /dev/null +++ b/.github/workflows/debug_metal_grad.yml.disabled @@ -0,0 +1,236 @@ +name: Debug 
Metal AD pipeline + +# DO NOT MERGE. This workflow exists only on `duburcqa/debug_metal_grad_repro` +# to reproduce the macos-15 (Apple M1 Virtual) reverse-grad pipeline-creation +# failure seen in Genesis CI run 25099014645 (Genesis PR #2743). All other +# workflows on this branch are renamed to `*.yml.disabled` so this is the +# only job that runs on PR sync. + +on: + pull_request: + types: [opened, reopened, synchronize] + workflow_dispatch: + inputs: + genesis_ref: + description: "Genesis ref to test against (branch, tag, or SHA)" + required: false + default: "f8ed535c83dd275b321009f33952a7e0b4e67a3d" + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} + cancel-in-progress: true + +jobs: + repro: + name: Build Quadrants + run failing Genesis grad tests (${{ matrix.os }}) + strategy: + fail-fast: false + matrix: + os: [macos-26] + runs-on: ${{ matrix.os }} + + env: + GENESIS_REF: ${{ github.event.inputs.genesis_ref || 'f8ed535c83dd275b321009f33952a7e0b4e67a3d' }} + HF_HUB_DOWNLOAD_TIMEOUT: "60" + FORCE_COLOR: "1" + PY_COLORS: "1" + GS_CACHE_FILE_PATH: ".cache/genesis" + GS_ENABLE_NDARRAY: "1" + GS_TORCH_FORCE_CPU_DEVICE: "1" + # Cache off so the rebuild always observes the patched metal_device.mm. + QD_OFFLINE_CACHE: "0" + QD_OFFLINE_CACHE_CLEANING_POLICY: "never" + QD_OFFLINE_CACHE_FILE_PATH: ".cache/quadrants" + QD_ENABLE_CUDA: "0" + QD_ENABLE_AMDGPU: "0" + QD_ENABLE_METAL: "1" + # Instrumentation (verbose logging, IR / MSL dumps) is commented out for this run so the workflow + # exercises the production path with no extra side-effects on memory pressure or pipeline-create timing. 
+ # QD_DEBUG: "1" + # QD_LOG_LEVEL: "trace" + # QD_DUMP_MSL: "1" + # QD_DUMP_IR: "1" + OMNI_KIT_ACCEPT_EULA: "yes" + + steps: + - name: Checkout Quadrants + uses: actions/checkout@v4 + with: + path: quadrants + submodules: recursive + fetch-depth: 1 + + - name: Checkout Genesis + uses: actions/checkout@v4 + with: + repository: Genesis-Embodied-AI/Genesis + ref: ${{ env.GENESIS_REF }} + path: genesis + fetch-depth: 1 + + # - name: Print system information + # run: | + # sw_vers + # uname -a + # system_profiler SPDisplaysDataType | head -40 || true + # clang --version + # sysctl hw.memsize hw.physicalcpu hw.ncpu hw.cputype hw.cpusubtype hw.cpubrand_string 2>&1 | head -10 + # vm_stat 2>&1 | head -20 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Quadrants - prerequisites + working-directory: quadrants + run: bash .github/workflows/scripts_new/macosx/1_prerequisites.sh + + - name: Quadrants - build wheel + working-directory: quadrants + run: bash .github/workflows/scripts_new/macosx/2_build.sh + + - name: Install built Quadrants wheel + working-directory: quadrants + run: | + python -m pip install --upgrade pip setuptools wheel + pip install dist/*.whl + python -c "import quadrants as qd; print('Quadrants version:', qd.__version__)" + + - name: Install torch (CPU build, matches Genesis CI) + run: | + pip3 install torch --upgrade --index-url https://download.pytorch.org/whl/cpu + + - name: Install Genesis (and pin our Quadrants wheel afterwards) + working-directory: genesis + run: | + pip install ".[dev,usd]" + # Genesis pulls quadrants from PyPI as a dep; override with our locally built wheel. 
+ pip install --force-reinstall --no-deps ../quadrants/dist/*.whl + python -c "import quadrants as qd; print('Quadrants version after pin:', qd.__version__)" + python -c "import genesis as gs; print('Genesis version:', gs.__version__)" + + - name: Run failing reverse-grad tests + id: run_tests + working-directory: genesis + continue-on-error: true + run: | + set -x + # Run the entire test_grad module on the gpu backend so this exercises every reverse-mode path the + # sparse-adstack-heap PR touches, not just the original M1 push-test repro. + pytest -v -ra --logical --dev --backend gpu -n 0 -s \ + ./tests/test_grad.py 2>&1 | tee pytest.log + echo "exit_code=${PIPESTATUS[0]}" >> $GITHUB_OUTPUT + + # - name: Capture jetsam / MTLCompilerService kill events from system log + # if: always() + # run: | + # mkdir -p /tmp/syslog + # log show --last 10m --predicate 'eventMessage CONTAINS "jetsam" OR eventMessage CONTAINS "MTLCompilerService" OR eventMessage CONTAINS "MTLCompiler" OR eventMessage CONTAINS "AGXCompiler"' 2>&1 | head -200 > /tmp/syslog/jetsam_and_metal.txt || true + # wc -l /tmp/syslog/jetsam_and_metal.txt + # head -50 /tmp/syslog/jetsam_and_metal.txt + # log show --last 10m --predicate 'eventMessage CONTAINS "lowSwap"' 2>&1 | head -100 > /tmp/syslog/lowSwap.txt || true + # memory_pressure 2>&1 | tee /tmp/syslog/memory_pressure_now.txt | head -10 || true + + - name: Upload pytest log + if: always() + uses: actions/upload-artifact@v4 + with: + name: pytest-log-${{ matrix.os }} + path: genesis/pytest.log + if-no-files-found: ignore + + # - name: Upload IR / MSL dumps + # if: always() + # uses: actions/upload-artifact@v4 + # with: + # name: ir-dump-${{ matrix.os }} + # path: /tmp/ir/ + # if-no-files-found: ignore + + # - name: Capture Apple's actual Metal-compiler output per kernel (xcrun metal -c) + # if: always() + # working-directory: genesis + # run: | + # set -x + # xcrun metal --version 2>&1 | head -5 | tee /tmp/metal_toolchain_version.txt + # xcrun 
--show-sdk-version | tee -a /tmp/metal_toolchain_version.txt + # mkdir -p /tmp/metal_compile + # python3 - <<'PY' + # import re, os, subprocess + # os.makedirs('/tmp/metal_compile', exist_ok=True) + # log = open('pytest.log', 'r', errors='replace').read() + # pattern = re.compile(r"=== MSL for kernel '([^']+)' ===\n(.*?)\n=== END MSL ===", re.DOTALL) + # summary = [] + # for m in pattern.finditer(log): + # name, body = m.group(1), m.group(2) + # metal_path = f'/tmp/metal_compile/{name}.metal' + # air_path = f'/tmp/metal_compile/{name}.air' + # diag_path = f'/tmp/metal_compile/{name}.diag.txt' + # with open(metal_path, 'w') as f: + # f.write(body) + # proc = subprocess.run( + # ['xcrun', '-sdk', 'macosx', 'metal', '-std=macos-metal2.3', '-v', '-c', metal_path, + # '-o', air_path], + # capture_output=True, text=True, timeout=300) + # with open(diag_path, 'w') as f: + # f.write(f'# rc={proc.returncode}\n# stdout:\n{proc.stdout}\n# stderr:\n{proc.stderr}\n') + # air_size = os.path.getsize(air_path) if os.path.exists(air_path) else 0 + # msl_size = os.path.getsize(metal_path) + # summary.append((proc.returncode, msl_size, air_size, name)) + # print(f'[{proc.returncode}] {name}: msl={msl_size} bytes, air={air_size} bytes') + # summary.sort(key=lambda r: (r[0], -r[1])) + # with open('/tmp/metal_compile/_summary.tsv', 'w') as f: + # f.write('returncode\tmsl_bytes\tair_bytes\tkernel\n') + # for rc, ms, ab, n in summary: + # f.write(f'{rc}\t{ms}\t{ab}\t{n}\n') + # fails = [r for r in summary if r[0] != 0] + # if fails: + # print(f'\n=== {len(fails)} kernel(s) failed xcrun metal -c on ${{ matrix.os }} ===') + # for rc, ms, ab, n in fails[:5]: + # print(f' rc={rc} msl_bytes={ms} kernel={n}') + # first = fails[0][3] + # print(f'\n=== Diagnostic for {first} ===') + # with open(f'/tmp/metal_compile/{first}.diag.txt') as f: + # print(f.read()) + # else: + # print(f'\nAll {len(summary)} kernel(s) compiled cleanly via xcrun metal -c on ${{ matrix.os }}.') + # PY + + # - name: Upload 
xcrun metal compile diagnostics + # if: always() + # uses: actions/upload-artifact@v4 + # with: + # name: metal-compile-${{ matrix.os }} + # path: | + # /tmp/metal_compile/ + # /tmp/metal_toolchain_version.txt + # if-no-files-found: ignore + + # - name: Upload memory / jetsam diagnostics + # if: always() + # uses: actions/upload-artifact@v4 + # with: + # name: memory-diagnostics-${{ matrix.os }} + # path: | + # /tmp/mem_before.txt + # /tmp/mem_after.txt + # /tmp/syslog/ + # if-no-files-found: ignore + + # - name: Setup tmate session for interactive debugging + # if: always() + # uses: mxschmitt/action-tmate@v3 + # with: + # limit-access-to-actor: true + # detached: false + # timeout-minutes: 60 + + - name: Re-emit pytest exit code + if: always() + run: | + ec="${{ steps.run_tests.outputs.exit_code }}" + echo "pytest exit code: ${ec}" + if [ "${ec}" != "0" ]; then + exit "${ec:-1}" + fi diff --git a/.github/workflows/linters.yml b/.github/workflows/linters.yml.disabled similarity index 100% rename from .github/workflows/linters.yml rename to .github/workflows/linters.yml.disabled diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml.disabled similarity index 100% rename from .github/workflows/linux.yml rename to .github/workflows/linux.yml.disabled diff --git a/.github/workflows/macosx.yml b/.github/workflows/macosx.yml.disabled similarity index 100% rename from .github/workflows/macosx.yml rename to .github/workflows/macosx.yml.disabled diff --git a/.github/workflows/manylinux_wheel.yml b/.github/workflows/manylinux_wheel.yml.disabled similarity index 100% rename from .github/workflows/manylinux_wheel.yml rename to .github/workflows/manylinux_wheel.yml.disabled diff --git a/.github/workflows/pr_change_report.yml b/.github/workflows/pr_change_report.yml.disabled similarity index 100% rename from .github/workflows/pr_change_report.yml rename to .github/workflows/pr_change_report.yml.disabled diff --git a/.github/workflows/publish_pypi.yml 
b/.github/workflows/publish_pypi.yml.disabled similarity index 100% rename from .github/workflows/publish_pypi.yml rename to .github/workflows/publish_pypi.yml.disabled diff --git a/.github/workflows/pyright_linter.yml b/.github/workflows/pyright_linter.yml.disabled similarity index 100% rename from .github/workflows/pyright_linter.yml rename to .github/workflows/pyright_linter.yml.disabled diff --git a/.github/workflows/test_gpu.yml b/.github/workflows/test_gpu.yml.disabled similarity index 100% rename from .github/workflows/test_gpu.yml rename to .github/workflows/test_gpu.yml.disabled diff --git a/.github/workflows/win.yml b/.github/workflows/win.yml.disabled similarity index 100% rename from .github/workflows/win.yml rename to .github/workflows/win.yml.disabled