From 4c75de096beef1353c2f2c5a7079d7677fd9fbc6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 14:06:47 +0300 Subject: [PATCH 01/10] starting nvshmem4py --- PYTHON/hello-nvshmem.py | 54 +++++++ PYTHON/nstream-cupy-nvshmem.py | 258 +++++++++++++++++++++++++++++++++ 2 files changed, 312 insertions(+) create mode 100644 PYTHON/hello-nvshmem.py create mode 100755 PYTHON/nstream-cupy-nvshmem.py diff --git a/PYTHON/hello-nvshmem.py b/PYTHON/hello-nvshmem.py new file mode 100644 index 000000000..bf39b631f --- /dev/null +++ b/PYTHON/hello-nvshmem.py @@ -0,0 +1,54 @@ +import numpy +import nvshmem.core as nvshmem +from mpi4py import MPI +from cuda.core.experimental import Device +from cuda.core.experimental import system + +# Initialize MPI +comm = MPI.COMM_WORLD +me = comm.Get_rank() +np = comm.Get_size() + +# Initialize NVSHMEM with MPI +#dev = Device() +#dev.set_current() +#nvshmem.init(device=dev, mpi_comm=comm, initializer_method="mpi") + +uid = nvshmem.get_unique_id(empty=(me != 0)) +comm.Bcast(uid._data.view(numpy.int8), root=0) +dev = Device() +dev.set_current() +nvshmem.init(device=dev, uid=uid, rank=me, nranks=np, initializer_method="uid") + +#dev = Device(me % system.num_devices) +#dev.set_current() +#nvshmem.init(device=dev, mpi_comm=comm, initializer_method="emulated_mpi") + +stream = dev.create_stream() + +# Get information about the current PE +my_pe = nvshmem.my_pe() +n_pes = nvshmem.n_pes() + +# Allocate symmetric memory +# array() returns a CuPy NDArray object +#x = nvshmem.array((1024,), dtype="float32") +#y = nvshmem.array((1024,), dtype="float32") + +#if my_pe == 0: +# y[:] = 1.0 + +# Perform communication operations +# Put y from PE 0 into x on PE 1 +#if my_pe == 0: +# nvshmem.put(x, y, pe=1, stream=stream) + +# Synchronize PEs +stream.sync() + +# Clean up +#nvshmem.free_array(x) +#nvshmem.free_array(y) +nvshmem.finalize() +print('OK') + diff --git a/PYTHON/nstream-cupy-nvshmem.py b/PYTHON/nstream-cupy-nvshmem.py new file mode 100755 index 000000000..eb626b621 --- /dev/null +++ b/PYTHON/nstream-cupy-nvshmem.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, Intel Corporation +# Copyright (c) 2023, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +#******************************************************************* +# +# NAME: nstream +# +# PURPOSE: To compute memory bandwidth when adding a vector of a given +# number of double precision values to the scalar multiple of +# another vector of the same length, and storing the result in +# a third vector. +# +# USAGE: The program takes as input the number +# of iterations to loop over the triad vectors, the length of the +# vectors, and the offset between vectors +# +# <# iterations> +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# NOTES: Bandwidth is determined as the number of words read, plus the +# number of words written, times the size of the words, divided +# by the execution time. For a vector length of N, the total +# number of words read and written is 4*N*sizeof(double). +# +# +# HISTORY: This code is loosely based on the Stream benchmark by John +# McCalpin, but does not follow all the Stream rules. Hence, +# reported results should not be associated with Stream in +# external publications +# +# Converted to Python by Jeff Hammond, October 2017. +# Adapted for CuPy+NVSHMEM4Py, December 2024. +# +# ******************************************************************* + +import sys +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer + +from mpi4py import MPI + +import cupy + +print('=== CUDA Version Information ===') + +try: + # Get CUDA runtime version + runtime_version = cupy.cuda.runtime.runtimeGetVersion() + runtime_major = runtime_version // 1000 + runtime_minor = (runtime_version % 1000) // 10 + print(f'CUDA Runtime Version: {runtime_major}.{runtime_minor} (raw: {runtime_version})') + + # Get CUDA driver version + driver_version = cupy.cuda.runtime.driverGetVersion() + driver_major = driver_version // 1000 + driver_minor = (driver_version % 1000) // 10 + print(f'CUDA Driver Version: {driver_major}.{driver_minor} (raw: {driver_version})') + + print(f'Version compatibility: Driver {driver_major}.{driver_minor} vs Runtime {runtime_major}.{runtime_minor}') + + if driver_version < runtime_version: + print('WARNING: Driver version is older than runtime version!') + print('This can cause \"cudaErrorInsufficientDriver\" errors.') + print('Consider updating your NVIDIA drivers.') + else: + print('Driver and runtime versions are compatible.') + +except Exception as e: + print(f'Error: {e}') + print('This usually indicates CUDA driver/runtime compatibility issues.') + +from cuda.core.experimental import Device + +import nvshmem.core as nvshmem + +def main(): + + # Initialize MPI and CUDA device + comm = MPI.COMM_WORLD + local_rank = comm.Get_rank() % 8 # Assume max 8 GPUs per node + device = Device(local_rank) + device.set_current() + stream = device.create_stream() + + # Initialize NVSHMEM with MPI + nvshmem.init(device=device, mpi_comm=comm, initializer_method="mpi") + + me = nvshmem.my_pe() + np = nvshmem.n_pes() + + # 
******************************************************************** + # read and test input parameters + # ******************************************************************** + + if (me==0): + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python CuPy/NVSHMEM STREAM triad: A = B + scalar * C') + + if len(sys.argv) != 3: + if (me==0): + print('argument count = ', len(sys.argv)) + print("Usage: python nstream-cupy-nvshmem.py <# iterations> ") + nvshmem.finalize() + sys.exit() + + iterations = int(sys.argv[1]) + if iterations < 1: + if (me==0): + print("ERROR: iterations must be >= 1") + nvshmem.finalize() + sys.exit() + + total_length = int(sys.argv[2]) + if total_length < 1: + if (me==0): + print("ERROR: length must be positive") + nvshmem.finalize() + sys.exit() + + # Distribute work across GPUs/PEs + length = int(total_length / np) + remainder = total_length % np + if (remainder > 0): + if (me < remainder): + length += 1 + + if (me==0): + print('Number of PEs = ', np) + print('Number of iterations = ', iterations) + print('Vector length = ', total_length) + print('Local vector length = ', length) + + # Barrier using NVSHMEM + nvshmem.barrier(stream=stream) + stream.synchronize() + + # ******************************************************************** + # ** Allocate space for the input and execute STREAM triad + # ******************************************************************** + + # Allocate symmetric GPU arrays using NVSHMEM4Py interoperability with CuPy + A = nvshmem.interop.cupy.array((length,), dtype="float64") + B = nvshmem.interop.cupy.array((length,), dtype="float64") + C = nvshmem.interop.cupy.array((length,), dtype="float64") + + # Initialize arrays + A[:] = 0.0 + B[:] = 2.0 + C[:] = 2.0 + + scalar = 3.0 + + # Timing loop + for k in range(0, iterations+1): + + if k < 1: + nvshmem.barrier(stream=stream) + stream.synchronize() + t0 = timer() + + # STREAM triad operation on GPU using CuPy operations + A += B + scalar * C + + # Final synchronization + nvshmem.barrier(stream=stream) + stream.synchronize() + t1 = timer() + nstream_time = t1 - t0 + + # ******************************************************************** + # ** Analyze and output results. 
+ # ******************************************************************** + + # Calculate expected result + ar = 0.0 + br = 2.0 + cr = 2.0 + for k in range(0, iterations+1): + ar += br + scalar * cr + + ar *= total_length + + # Calculate local checksum + asum_local = cupy.linalg.norm(A, ord=1) + + # Create source and destination arrays for reduction + src = nvshmem.interop.cupy.array((1,), dtype="float64") + dst = nvshmem.interop.cupy.array((1,), dtype="float64") + src[0] = asum_local + dst[0] = 0.0 + + # Reduce across all PEs using NVSHMEM collective + nvshmem.reduce(dst, src, "sum", stream=stream) + stream.synchronize() + + asum_global = float(dst[0]) + + epsilon = 1.e-8 + if abs(ar - asum_global) / asum_global > epsilon: + if (me == 0): + print('Failed Validation on output array') + print(' Expected checksum: ', ar) + print(' Observed checksum: ', asum_global) + print("ERROR: solution did not validate") + else: + if (me == 0): + print('Solution validates') + avgtime = nstream_time / iterations + nbytes = 4.0 * total_length * 8 # 8 bytes per double + print('Rate (MB/s): ', 1.e-6 * nbytes / avgtime, ' Avg time (s): ', avgtime) + + # Free NVSHMEM arrays + nvshmem.free_array(A) + nvshmem.free_array(B) + nvshmem.free_array(C) + nvshmem.free_array(src) + nvshmem.free_array(dst) + + # Finalize NVSHMEM + nvshmem.finalize() + + +if __name__ == '__main__': + main() \ No newline at end of file From d9f65ed940decfff0ca5088e717f39a078057f10 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 14:11:03 +0300 Subject: [PATCH 02/10] update --- common/make.defs.cuda | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index 27633dc01..e22c05ca0 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -163,10 +163,10 @@ CBLASFLAG=${BLASFLAG} # Use appropriate arch or code is compiled to ancient features. #NVCC=${NVHPC_CBIN}nvc++ #NVCC=${NVHPC_CBIN}nvcc -NVCC=/usr/local/cuda-12.6/bin/nvcc +NVCC=/usr/local/cuda-12/bin/nvcc CUDAFLAGS=-g -O3 -std=c++20 CUDAFLAGS+=--extended-lambda -CUDAFLAGS+=--gpu-architecture=sm_89 +CUDAFLAGS+=--gpu-architecture=sm_90 #CUDAFLAGS+=-allow-unsupported-compiler #CUDAFLAGS+=-ccbin=g++-13 -lm #-lstdc++ #CUDAFLAGS+=--compiler-bindir=/opt/gcc/12.3.0/bin/ @@ -196,7 +196,7 @@ CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED #CUDAFLAGS+=-D_AVX512IFMAINTRIN_H_INCLUDED #CUDAFLAGS+=-D_AVX512IFMAVLINTRIN_H_INCLUDED #CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED -CUDAFLAGS += -I/usr/local/cuda-12.6/targets/x86_64-linux/include/cub/detail +CUDAFLAGS += -I/usr/local/cuda-12/targets/x86_64-linux/include/cub/detail # # NCCL # @@ -207,12 +207,15 @@ NCCLLIB=-L${NCCLDIR}/lib -lnccl # NVSHMEM (Apt packages not reliable...) 
# NVSHMEMFLAGS=-rdc=true -diag-suppress 3012,3013 -#NVSHMEMFLAGS+=-I/usr/include/nvshmem_12 -NVSHMEM_DIR=${HOME}/NVSHMEM/nvshmem/build/src +#NVSHMEM_DIR=${HOME}/NVSHMEM/nvshmem/build/src #NVSHMEM_DIR=/opt/nvidia/hpc_sdk/Linux_x86_64/24.11/comm_libs/12.6/nvshmem -NVSHMEMFLAGS+=-I${NVSHMEM_DIR}/include -NVSHMEMFLAGS+=-L${NVSHMEM_DIR}/lib -NVSHMEMFLAGS+=-Wl,-rpath=${NVSHMEM_DIR}/lib +#NVSHMEMFLAGS+=-I${NVSHMEM_DIR}/include +#NVSHMEMFLAGS+=-L${NVSHMEM_DIR}/lib +#NVSHMEMFLAGS+=-Wl,-rpath=${NVSHMEM_DIR}/lib +# apt or pip installs like this +NVSHMEMFLAGS+=-I/usr/include/nvshmem_12 +NVSHMEMFLAGS+=-L/usr/lib/x86_64-linux-gnu/nvshmem/12 +NVSHMEMFLAGS+=-Wl,-rpath=/usr/lib/x86_64-linux-gnu/nvshmem/12 NVSHMEMFLAGS+=-lnvshmem_device -lnvshmem_host # # CUDASTF From 299525f9d0553f5b863fccb666945ab51f6de801 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 14:13:28 +0300 Subject: [PATCH 03/10] docker --- common/make.defs.cuda | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index e22c05ca0..2d515e81f 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -163,7 +163,8 @@ CBLASFLAG=${BLASFLAG} # Use appropriate arch or code is compiled to ancient features. #NVCC=${NVHPC_CBIN}nvc++ #NVCC=${NVHPC_CBIN}nvcc -NVCC=/usr/local/cuda-12/bin/nvcc +#NVCC=/usr/local/cuda-12/bin/nvcc +NVCC=nvcc CUDAFLAGS=-g -O3 -std=c++20 CUDAFLAGS+=--extended-lambda CUDAFLAGS+=--gpu-architecture=sm_90 From c34610effabac127e9cfa006d2264620c3ce7131 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 14:20:30 +0300 Subject: [PATCH 04/10] rpath nvcc workaround --- common/make.defs.cuda | 1 + 1 file changed, 1 insertion(+) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index 2d515e81f..1017d6902 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -172,6 +172,7 @@ CUDAFLAGS+=--gpu-architecture=sm_90 #CUDAFLAGS+=-ccbin=g++-13 -lm #-lstdc++ #CUDAFLAGS+=--compiler-bindir=/opt/gcc/12.3.0/bin/ #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp +CUDAFLAGS+=--forward-unknown-to-host-compiler # rpath CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle' #CUDAFLAGS+=-I${NVHPC_PATH}/math_libs/12.6/targets/$$(uname -m)-linux/include #CUDAFLAGS+=-L${NVHPC_PATH}/math_libs/12.6/targets/$$(uname -m)-linux/lib From 49ba387b90ae55a29249df26d8203036eaf2f32a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 14:23:03 +0300 Subject: [PATCH 05/10] docker --- common/make.defs.cuda | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index 1017d6902..31f95d08b 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -1,7 +1,7 @@ # # This file shows the CUDA toolchain options # for both NVHPC and GCC. -NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/24.9 +NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/25.7 #NVHPC_PATH=/proj/nv/Linux_$$(uname -m)/21.11 #NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_$$(uname -m)/2021 NVHPC_CBIN=${NVHPC_PATH}/compilers/bin/ @@ -232,7 +232,7 @@ CUDASTF_CFLAGS+=-lcuda # # mpiicc wraps icc. mpicc and mpigcc wrap gcc. 
#MPIDIR=${NVHPC_PATH}/comm_libs/hpcx -MPIDIR=${NVHPC_PATH}/comm_libs/12.6/openmpi4/latest +MPIDIR=${NVHPC_PATH}/comm_libs/12.9/openmpi4/latest MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx MPIFORT=${MPIDIR}/bin/mpifort From ce6a308fe88504a1acdb606993471297571bc187 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 14:41:27 +0300 Subject: [PATCH 06/10] ugh --- PYTHON/cuda_version_check.cpp | 84 +++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 PYTHON/cuda_version_check.cpp diff --git a/PYTHON/cuda_version_check.cpp b/PYTHON/cuda_version_check.cpp new file mode 100644 index 000000000..caef05890 --- /dev/null +++ b/PYTHON/cuda_version_check.cpp @@ -0,0 +1,84 @@ +#include +#include +#include + +int main() { + std::cout << "=== CUDA Version Information ===" << std::endl; + + // Get CUDA Runtime Version + int runtimeVersion; + cudaError_t runtimeResult = cudaRuntimeGetVersion(&runtimeVersion); + if (runtimeResult == cudaSuccess) { + int runtimeMajor = runtimeVersion / 1000; + int runtimeMinor = (runtimeVersion % 1000) / 10; + std::cout << "CUDA Runtime Version: " << runtimeMajor << "." << runtimeMinor + << " (raw: " << runtimeVersion << ")" << std::endl; + } else { + std::cout << "Error getting CUDA Runtime version: " + << cudaGetErrorString(runtimeResult) << std::endl; + } + + // Get CUDA Driver Version + int driverVersion; + cudaError_t driverResult = cudaDriverGetVersion(&driverVersion); + if (driverResult == cudaSuccess) { + int driverMajor = driverVersion / 1000; + int driverMinor = (driverVersion % 1000) / 10; + std::cout << "CUDA Driver Version: " << driverMajor << "." << driverMinor + << " (raw: " << driverVersion << ")" << std::endl; + } else { + std::cout << "Error getting CUDA Driver version: " + << cudaGetErrorString(driverResult) << std::endl; + } + + // Check compatibility + if (driverResult == cudaSuccess && runtimeResult == cudaSuccess) { + std::cout << "\nCompatibility Check:" << std::endl; + if (driverVersion >= runtimeVersion) { + std::cout << "✓ Driver and runtime versions are compatible" << std::endl; + } else { + std::cout << "✗ WARNING: Driver version is older than runtime!" << std::endl; + std::cout << " This may cause cudaErrorInsufficientDriver errors" << std::endl; + } + } + + // Get device information + int deviceCount; + cudaError_t deviceResult = cudaGetDeviceCount(&deviceCount); + if (deviceResult == cudaSuccess) { + std::cout << "\n=== Device Information ===" << std::endl; + std::cout << "Number of CUDA devices: " << deviceCount << std::endl; + + for (int i = 0; i < deviceCount; i++) { + cudaDeviceProp prop; + cudaError_t propResult = cudaGetDeviceProperties(&prop, i); + if (propResult == cudaSuccess) { + std::cout << "Device " << i << ": " << prop.name << std::endl; + std::cout << " Compute Capability: " << prop.major << "." 
<< prop.minor << std::endl; + std::cout << " Total Memory: " << prop.totalGlobalMem / (1024*1024*1024) << " GB" << std::endl; + std::cout << " Multiprocessors: " << prop.multiProcessorCount << std::endl; + } + } + } else { + std::cout << "Error getting device count: " + << cudaGetErrorString(deviceResult) << std::endl; + } + + // Alternative method using CUDA Driver API directly + std::cout << "\n=== Alternative Driver API Check ===" << std::endl; + CUresult cuResult = cuInit(0); + if (cuResult == CUDA_SUCCESS) { + int cuDriverVersion; + cuResult = cuDriverGetVersion(&cuDriverVersion); + if (cuResult == CUDA_SUCCESS) { + int cuDriverMajor = cuDriverVersion / 1000; + int cuDriverMinor = (cuDriverVersion % 1000) / 10; + std::cout << "CUDA Driver Version (Driver API): " << cuDriverMajor << "." << cuDriverMinor + << " (raw: " << cuDriverVersion << ")" << std::endl; + } + } else { + std::cout << "Failed to initialize CUDA Driver API" << std::endl; + } + + return 0; +} From fce1578d864d411b697c66dccf950aa57d4ccd85 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 16:05:57 +0300 Subject: [PATCH 07/10] docker --- common/make.defs.cuda | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index 31f95d08b..169b4280c 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -232,14 +232,15 @@ CUDASTF_CFLAGS+=-lcuda # # mpiicc wraps icc. mpicc and mpigcc wrap gcc. #MPIDIR=${NVHPC_PATH}/comm_libs/hpcx -MPIDIR=${NVHPC_PATH}/comm_libs/12.9/openmpi4/latest +#MPIDIR=${NVHPC_PATH}/comm_libs/12.9/openmpi4/latest +MPIDIR=/usr/local MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx MPIFORT=${MPIDIR}/bin/mpifort MPIINC=-I${MPIDIR}/include MPILIB=-L${MPIDIR}/lib -lmpi #MPILIB+=-Wl,-rpath -Wl,${MPIDIR}/lib -Wl,--enable-new-dtags # NVCC chokes on -Wl -MPILIB+=-lopen-pal -lopen-rte +#MPILIB+=-lopen-pal -lopen-rte #MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi #MPIINC=-I/usr/include/mpich-3.2-$$(uname -m) #MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi From 81e1b866a1e019cc11a4aca4067238425b9c1b35 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 27 Aug 2025 10:56:05 +0300 Subject: [PATCH 08/10] fixes --- PYTHON/hello-nvshmem.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/PYTHON/hello-nvshmem.py b/PYTHON/hello-nvshmem.py index bf39b631f..d4b10dbb8 100644 --- a/PYTHON/hello-nvshmem.py +++ b/PYTHON/hello-nvshmem.py @@ -1,8 +1,8 @@ import numpy -import nvshmem.core as nvshmem from mpi4py import MPI from cuda.core.experimental import Device from cuda.core.experimental import system +import nvshmem.core as nvshmem # Initialize MPI comm = MPI.COMM_WORLD @@ -10,15 +10,15 @@ np = comm.Get_size() # Initialize NVSHMEM with MPI +dev = Device(me % system.num_devices) +dev.set_current() +nvshmem.init(device=dev, mpi_comm=comm, initializer_method="mpi") + +#uid = nvshmem.get_unique_id(empty=(me != 0)) +#comm.Bcast(uid._data.view(numpy.int8), root=0) #dev = Device() #dev.set_current() -#nvshmem.init(device=dev, mpi_comm=comm, initializer_method="mpi") - -uid = nvshmem.get_unique_id(empty=(me != 0)) -comm.Bcast(uid._data.view(numpy.int8), root=0) -dev = Device() -dev.set_current() -nvshmem.init(device=dev, uid=uid, rank=me, nranks=np, initializer_method="uid") +#nvshmem.init(device=dev, uid=uid, rank=me, nranks=np, initializer_method="uid") #dev = Device(me % system.num_devices) #dev.set_current() @@ -32,23 +32,24 @@ # Allocate symmetric memory # array() returns a CuPy NDArray 
object -#x = nvshmem.array((1024,), dtype="float32") -#y = nvshmem.array((1024,), dtype="float32") +x = nvshmem.array((1024,), dtype="float32") +y = nvshmem.array((1024,), dtype="float32") #if my_pe == 0: # y[:] = 1.0 # Perform communication operations # Put y from PE 0 into x on PE 1 -#if my_pe == 0: -# nvshmem.put(x, y, pe=1, stream=stream) +if my_pe == 0: + nvshmem.put(x, y, 1, stream=stream) # Synchronize PEs +nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream) stream.sync() # Clean up -#nvshmem.free_array(x) -#nvshmem.free_array(y) +nvshmem.free_array(x) +nvshmem.free_array(y) nvshmem.finalize() print('OK') From d7a63a01df97334e271fee1b903387db9de79055 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 27 Aug 2025 11:36:21 +0300 Subject: [PATCH 09/10] fix all the issues i hope --- PYTHON/nstream-cupy-nvshmem.py | 72 +++++++++++++++++----------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/PYTHON/nstream-cupy-nvshmem.py b/PYTHON/nstream-cupy-nvshmem.py index eb626b621..fbb2cf12a 100755 --- a/PYTHON/nstream-cupy-nvshmem.py +++ b/PYTHON/nstream-cupy-nvshmem.py @@ -75,35 +75,37 @@ import cupy -print('=== CUDA Version Information ===') - -try: - # Get CUDA runtime version - runtime_version = cupy.cuda.runtime.runtimeGetVersion() - runtime_major = runtime_version // 1000 - runtime_minor = (runtime_version % 1000) // 10 - print(f'CUDA Runtime Version: {runtime_major}.{runtime_minor} (raw: {runtime_version})') - - # Get CUDA driver version - driver_version = cupy.cuda.runtime.driverGetVersion() - driver_major = driver_version // 1000 - driver_minor = (driver_version % 1000) // 10 - print(f'CUDA Driver Version: {driver_major}.{driver_minor} (raw: {driver_version})') - - print(f'Version compatibility: Driver {driver_major}.{driver_minor} vs Runtime {runtime_major}.{runtime_minor}') - - if driver_version < runtime_version: - print('WARNING: Driver version is older than runtime version!') - print('This can cause \"cudaErrorInsufficientDriver\" errors.') - print('Consider updating your NVIDIA drivers.') - else: - print('Driver and runtime versions are compatible.') +if False: + print('=== CUDA Version Information ===') + + try: + # Get CUDA runtime version + runtime_version = cupy.cuda.runtime.runtimeGetVersion() + runtime_major = runtime_version // 1000 + runtime_minor = (runtime_version % 1000) // 10 + print(f'CUDA Runtime Version: {runtime_major}.{runtime_minor} (raw: {runtime_version})') + + # Get CUDA driver version + driver_version = cupy.cuda.runtime.driverGetVersion() + driver_major = driver_version // 1000 + driver_minor = (driver_version % 1000) // 10 + print(f'CUDA Driver Version: {driver_major}.{driver_minor} (raw: {driver_version})') + + print(f'Version compatibility: Driver {driver_major}.{driver_minor} vs Runtime {runtime_major}.{runtime_minor}') -except Exception as e: - print(f'Error: {e}') + if driver_version < runtime_version: + print('WARNING: Driver version is older than runtime version!') + print('This can cause \"cudaErrorInsufficientDriver\" errors.') + print('Consider updating your NVIDIA drivers.') + else: + print('Driver and runtime versions are compatible.') + + except Exception as e: + print(f'Error: {e}') print('This usually indicates CUDA driver/runtime compatibility issues.') from cuda.core.experimental import Device +from cuda.core.experimental import system import nvshmem.core as nvshmem @@ -111,7 +113,7 @@ def main(): # Initialize MPI and CUDA device comm = MPI.COMM_WORLD - local_rank = comm.Get_rank() % 8 # Assume max 8 GPUs per 
node + local_rank = comm.Get_rank() % system.num_devices device = Device(local_rank) device.set_current() stream = device.create_stream() @@ -165,8 +167,8 @@ def main(): print('Local vector length = ', length) # Barrier using NVSHMEM - nvshmem.barrier(stream=stream) - stream.synchronize() + nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream) + stream.sync() # ******************************************************************** # ** Allocate space for the input and execute STREAM triad @@ -188,16 +190,16 @@ def main(): for k in range(0, iterations+1): if k < 1: - nvshmem.barrier(stream=stream) - stream.synchronize() + nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream) + stream.sync() t0 = timer() # STREAM triad operation on GPU using CuPy operations A += B + scalar * C # Final synchronization - nvshmem.barrier(stream=stream) - stream.synchronize() + nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream) + stream.sync() t1 = timer() nstream_time = t1 - t0 @@ -224,8 +226,8 @@ def main(): dst[0] = 0.0 # Reduce across all PEs using NVSHMEM collective - nvshmem.reduce(dst, src, "sum", stream=stream) - stream.synchronize() + nvshmem.reduce(nvshmem.Teams.TEAM_WORLD, dst, src, op="sum", stream=stream) + stream.sync() asum_global = float(dst[0]) @@ -255,4 +257,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() From 409c868ae258e488708e0b33c31eb15babba8df0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 27 Aug 2025 12:05:50 +0300 Subject: [PATCH 10/10] hack to disable jit? --- PYTHON/nstream-cupy-nvshmem.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PYTHON/nstream-cupy-nvshmem.py b/PYTHON/nstream-cupy-nvshmem.py index fbb2cf12a..a5335999e 100755 --- a/PYTHON/nstream-cupy-nvshmem.py +++ b/PYTHON/nstream-cupy-nvshmem.py @@ -196,6 +196,8 @@ def main(): # STREAM triad operation on GPU using CuPy operations A += B + scalar * C + # it seems like this is required to get proper timings - maybe some weird JiT thing happening + nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream) # Final synchronization nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream) @@ -243,7 +245,7 @@ def main(): print('Solution validates') avgtime = nstream_time / iterations nbytes = 4.0 * total_length * 8 # 8 bytes per double - print('Rate (MB/s): ', 1.e-6 * nbytes / avgtime, ' Avg time (s): ', avgtime) + print('Rate (GB/s): ', 1.e-9 * nbytes / avgtime, ' Avg time (s): ', avgtime) # Free NVSHMEM arrays nvshmem.free_array(A)
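
A minimal sketch of the nvshmem4py pattern these patches converge on (MPI
bootstrap, team-scoped barrier, and a team-scoped sum reduction used for
validation). It reuses only calls that appear in the patches above, but the
exact nvshmem4py signatures may differ between releases, and the launch
command in the first comment is an assumption rather than something the
patches specify, so treat this as illustrative rather than authoritative.

    # launched as, e.g.:  mpiexec -n 2 python3 PYTHON/nstream-cupy-nvshmem.py 10 100000000
    from mpi4py import MPI
    from cuda.core.experimental import Device, system
    import nvshmem.core as nvshmem

    comm = MPI.COMM_WORLD
    dev = Device(comm.Get_rank() % system.num_devices)  # one GPU per local rank
    dev.set_current()
    nvshmem.init(device=dev, mpi_comm=comm, initializer_method="mpi")
    stream = dev.create_stream()

    # symmetric buffers backed by CuPy arrays
    src = nvshmem.interop.cupy.array((1,), dtype="float64")
    dst = nvshmem.interop.cupy.array((1,), dtype="float64")
    src[0] = float(nvshmem.my_pe())

    # team-scoped barrier and sum reduction, both enqueued on the stream
    nvshmem.barrier(nvshmem.Teams.TEAM_WORLD, stream=stream)
    nvshmem.reduce(nvshmem.Teams.TEAM_WORLD, dst, src, op="sum", stream=stream)
    stream.sync()

    nvshmem.free_array(src)
    nvshmem.free_array(dst)
    nvshmem.finalize()

For the reported rate, nstream counts 4*N words of 8 bytes per iteration
(read A, read B, read C, write A); with N = 1e8 that is 3.2 GB moved per
iteration, so an iteration time of 0.01 s corresponds to 320 GB/s.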