From 4c75de096beef1353c2f2c5a7079d7677fd9fbc6 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 14:06:47 +0300 Subject: [PATCH 01/10] starting nvshmem4py --- PYTHON/hello-nvshmem.py | 54 +++++++ PYTHON/nstream-cupy-nvshmem.py | 258 +++++++++++++++++++++++++++++++++ 2 files changed, 312 insertions(+) create mode 100644 PYTHON/hello-nvshmem.py create mode 100755 PYTHON/nstream-cupy-nvshmem.py diff --git a/PYTHON/hello-nvshmem.py b/PYTHON/hello-nvshmem.py new file mode 100644 index 000000000..bf39b631f --- /dev/null +++ b/PYTHON/hello-nvshmem.py @@ -0,0 +1,54 @@ +import numpy +import nvshmem.core as nvshmem +from mpi4py import MPI +from cuda.core.experimental import Device +from cuda.core.experimental import system + +# Initialize MPI +comm = MPI.COMM_WORLD +me = comm.Get_rank() +np = comm.Get_size() + +# Initialize NVSHMEM with MPI +#dev = Device() +#dev.set_current() +#nvshmem.init(device=dev, mpi_comm=comm, initializer_method="mpi") + +uid = nvshmem.get_unique_id(empty=(me != 0)) +comm.Bcast(uid._data.view(numpy.int8), root=0) +dev = Device() +dev.set_current() +nvshmem.init(device=dev, uid=uid, rank=me, nranks=np, initializer_method="uid") + +#dev = Device(me % system.num_devices) +#dev.set_current() +#nvshmem.init(device=dev, mpi_comm=comm, initializer_method="emulated_mpi") + +stream = dev.create_stream() + +# Get information about the current PE +my_pe = nvshmem.my_pe() +n_pes = nvshmem.n_pes() + +# Allocate symmetric memory +# array() returns a CuPy NDArray object +#x = nvshmem.array((1024,), dtype="float32") +#y = nvshmem.array((1024,), dtype="float32") + +#if my_pe == 0: +# y[:] = 1.0 + +# Perform communication operations +# Put y from PE 0 into x on PE 1 +#if my_pe == 0: +# nvshmem.put(x, y, pe=1, stream=stream) + +# Synchronize PEs +stream.sync() + +# Clean up +#nvshmem.free_array(x) +#nvshmem.free_array(y) +nvshmem.finalize() +print('OK') + diff --git a/PYTHON/nstream-cupy-nvshmem.py b/PYTHON/nstream-cupy-nvshmem.py new file mode 100755 index 000000000..eb626b621 --- /dev/null +++ b/PYTHON/nstream-cupy-nvshmem.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2020, Intel Corporation +# Copyright (c) 2023, NVIDIA +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of Intel Corporation nor the names of its +# contributors may be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +#******************************************************************* +# +# NAME: nstream +# +# PURPOSE: To compute memory bandwidth when adding a vector of a given +# number of double precision values to the scalar multiple of +# another vector of the same length, and storing the result in +# a third vector. +# +# USAGE: The program takes as input the number +# of iterations to loop over the triad vectors, the length of the +# vectors, and the offset between vectors +# +# <# iterations> +# +# The output consists of diagnostics to make sure the +# algorithm worked, and of timing statistics. +# +# NOTES: Bandwidth is determined as the number of words read, plus the +# number of words written, times the size of the words, divided +# by the execution time. For a vector length of N, the total +# number of words read and written is 4*N*sizeof(double). +# +# +# HISTORY: This code is loosely based on the Stream benchmark by John +# McCalpin, but does not follow all the Stream rules. Hence, +# reported results should not be associated with Stream in +# external publications +# +# Converted to Python by Jeff Hammond, October 2017. +# Adapted for CuPy+NVSHMEM4Py, December 2024. +# +# ******************************************************************* + +import sys +if sys.version_info >= (3, 3): + from time import process_time as timer +else: + from timeit import default_timer as timer + +from mpi4py import MPI + +import cupy + +print('=== CUDA Version Information ===') + +try: + # Get CUDA runtime version + runtime_version = cupy.cuda.runtime.runtimeGetVersion() + runtime_major = runtime_version // 1000 + runtime_minor = (runtime_version % 1000) // 10 + print(f'CUDA Runtime Version: {runtime_major}.{runtime_minor} (raw: {runtime_version})') + + # Get CUDA driver version + driver_version = cupy.cuda.runtime.driverGetVersion() + driver_major = driver_version // 1000 + driver_minor = (driver_version % 1000) // 10 + print(f'CUDA Driver Version: {driver_major}.{driver_minor} (raw: {driver_version})') + + print(f'Version compatibility: Driver {driver_major}.{driver_minor} vs Runtime {runtime_major}.{runtime_minor}') + + if driver_version < runtime_version: + print('WARNING: Driver version is older than runtime version!') + print('This can cause \"cudaErrorInsufficientDriver\" errors.') + print('Consider updating your NVIDIA drivers.') + else: + print('Driver and runtime versions are compatible.') + +except Exception as e: + print(f'Error: {e}') + print('This usually indicates CUDA driver/runtime compatibility issues.') + +from cuda.core.experimental import Device + +import nvshmem.core as nvshmem + +def main(): + + # Initialize MPI and CUDA device + comm = MPI.COMM_WORLD + local_rank = comm.Get_rank() % 8 # Assume max 8 GPUs per node + device = Device(local_rank) + device.set_current() + stream = device.create_stream() + + # Initialize NVSHMEM with MPI + nvshmem.init(device=device, mpi_comm=comm, initializer_method="mpi") + + me = nvshmem.my_pe() + np = nvshmem.n_pes() + + # 
******************************************************************** + # read and test input parameters + # ******************************************************************** + + if (me==0): + print('Parallel Research Kernels version ') #, PRKVERSION + print('Python CuPy/NVSHMEM STREAM triad: A = B + scalar * C') + + if len(sys.argv) != 3: + if (me==0): + print('argument count = ', len(sys.argv)) + print("Usage: python nstream-cupy-nvshmem.py <# iterations> ") + nvshmem.finalize() + sys.exit() + + iterations = int(sys.argv[1]) + if iterations < 1: + if (me==0): + print("ERROR: iterations must be >= 1") + nvshmem.finalize() + sys.exit() + + total_length = int(sys.argv[2]) + if total_length < 1: + if (me==0): + print("ERROR: length must be positive") + nvshmem.finalize() + sys.exit() + + # Distribute work across GPUs/PEs + length = int(total_length / np) + remainder = total_length % np + if (remainder > 0): + if (me < remainder): + length += 1 + + if (me==0): + print('Number of PEs = ', np) + print('Number of iterations = ', iterations) + print('Vector length = ', total_length) + print('Local vector length = ', length) + + # Barrier using NVSHMEM + nvshmem.barrier(stream=stream) + stream.synchronize() + + # ******************************************************************** + # ** Allocate space for the input and execute STREAM triad + # ******************************************************************** + + # Allocate symmetric GPU arrays using NVSHMEM4Py interoperability with CuPy + A = nvshmem.interop.cupy.array((length,), dtype="float64") + B = nvshmem.interop.cupy.array((length,), dtype="float64") + C = nvshmem.interop.cupy.array((length,), dtype="float64") + + # Initialize arrays + A[:] = 0.0 + B[:] = 2.0 + C[:] = 2.0 + + scalar = 3.0 + + # Timing loop + for k in range(0, iterations+1): + + if k < 1: + nvshmem.barrier(stream=stream) + stream.synchronize() + t0 = timer() + + # STREAM triad operation on GPU using CuPy operations + A += B + scalar * C + + # Final synchronization + nvshmem.barrier(stream=stream) + stream.synchronize() + t1 = timer() + nstream_time = t1 - t0 + + # ******************************************************************** + # ** Analyze and output results. 
+ # ******************************************************************** + + # Calculate expected result + ar = 0.0 + br = 2.0 + cr = 2.0 + for k in range(0, iterations+1): + ar += br + scalar * cr + + ar *= total_length + + # Calculate local checksum + asum_local = cupy.linalg.norm(A, ord=1) + + # Create source and destination arrays for reduction + src = nvshmem.interop.cupy.array((1,), dtype="float64") + dst = nvshmem.interop.cupy.array((1,), dtype="float64") + src[0] = asum_local + dst[0] = 0.0 + + # Reduce across all PEs using NVSHMEM collective + nvshmem.reduce(dst, src, "sum", stream=stream) + stream.synchronize() + + asum_global = float(dst[0]) + + epsilon = 1.e-8 + if abs(ar - asum_global) / asum_global > epsilon: + if (me == 0): + print('Failed Validation on output array') + print(' Expected checksum: ', ar) + print(' Observed checksum: ', asum_global) + print("ERROR: solution did not validate") + else: + if (me == 0): + print('Solution validates') + avgtime = nstream_time / iterations + nbytes = 4.0 * total_length * 8 # 8 bytes per double + print('Rate (MB/s): ', 1.e-6 * nbytes / avgtime, ' Avg time (s): ', avgtime) + + # Free NVSHMEM arrays + nvshmem.free_array(A) + nvshmem.free_array(B) + nvshmem.free_array(C) + nvshmem.free_array(src) + nvshmem.free_array(dst) + + # Finalize NVSHMEM + nvshmem.finalize() + + +if __name__ == '__main__': + main() \ No newline at end of file From d9f65ed940decfff0ca5088e717f39a078057f10 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 14:11:03 +0300 Subject: [PATCH 02/10] update --- common/make.defs.cuda | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index 27633dc01..e22c05ca0 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -163,10 +163,10 @@ CBLASFLAG=${BLASFLAG} # Use appropriate arch or code is compiled to ancient features. #NVCC=${NVHPC_CBIN}nvc++ #NVCC=${NVHPC_CBIN}nvcc -NVCC=/usr/local/cuda-12.6/bin/nvcc +NVCC=/usr/local/cuda-12/bin/nvcc CUDAFLAGS=-g -O3 -std=c++20 CUDAFLAGS+=--extended-lambda -CUDAFLAGS+=--gpu-architecture=sm_89 +CUDAFLAGS+=--gpu-architecture=sm_90 #CUDAFLAGS+=-allow-unsupported-compiler #CUDAFLAGS+=-ccbin=g++-13 -lm #-lstdc++ #CUDAFLAGS+=--compiler-bindir=/opt/gcc/12.3.0/bin/ @@ -196,7 +196,7 @@ CUDAFLAGS+=-D_X86INTRIN_H_INCLUDED #CUDAFLAGS+=-D_AVX512IFMAINTRIN_H_INCLUDED #CUDAFLAGS+=-D_AVX512IFMAVLINTRIN_H_INCLUDED #CUDAFLAGS+=-D_AVX512ERINTRIN_H_INCLUDED -CUDAFLAGS += -I/usr/local/cuda-12.6/targets/x86_64-linux/include/cub/detail +CUDAFLAGS += -I/usr/local/cuda-12/targets/x86_64-linux/include/cub/detail # # NCCL # @@ -207,12 +207,15 @@ NCCLLIB=-L${NCCLDIR}/lib -lnccl # NVSHMEM (Apt packages not reliable...) 
# NVSHMEMFLAGS=-rdc=true -diag-suppress 3012,3013 -#NVSHMEMFLAGS+=-I/usr/include/nvshmem_12 -NVSHMEM_DIR=${HOME}/NVSHMEM/nvshmem/build/src +#NVSHMEM_DIR=${HOME}/NVSHMEM/nvshmem/build/src #NVSHMEM_DIR=/opt/nvidia/hpc_sdk/Linux_x86_64/24.11/comm_libs/12.6/nvshmem -NVSHMEMFLAGS+=-I${NVSHMEM_DIR}/include -NVSHMEMFLAGS+=-L${NVSHMEM_DIR}/lib -NVSHMEMFLAGS+=-Wl,-rpath=${NVSHMEM_DIR}/lib +#NVSHMEMFLAGS+=-I${NVSHMEM_DIR}/include +#NVSHMEMFLAGS+=-L${NVSHMEM_DIR}/lib +#NVSHMEMFLAGS+=-Wl,-rpath=${NVSHMEM_DIR}/lib +# apt or pip installs like this +NVSHMEMFLAGS+=-I/usr/include/nvshmem_12 +NVSHMEMFLAGS+=-L/usr/lib/x86_64-linux-gnu/nvshmem/12 +NVSHMEMFLAGS+=-Wl,-rpath=/usr/lib/x86_64-linux-gnu/nvshmem/12 NVSHMEMFLAGS+=-lnvshmem_device -lnvshmem_host # # CUDASTF From 299525f9d0553f5b863fccb666945ab51f6de801 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 14:13:28 +0300 Subject: [PATCH 03/10] docker --- common/make.defs.cuda | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index e22c05ca0..2d515e81f 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -163,7 +163,8 @@ CBLASFLAG=${BLASFLAG} # Use appropriate arch or code is compiled to ancient features. #NVCC=${NVHPC_CBIN}nvc++ #NVCC=${NVHPC_CBIN}nvcc -NVCC=/usr/local/cuda-12/bin/nvcc +#NVCC=/usr/local/cuda-12/bin/nvcc +NVCC=nvcc CUDAFLAGS=-g -O3 -std=c++20 CUDAFLAGS+=--extended-lambda CUDAFLAGS+=--gpu-architecture=sm_90 From c34610effabac127e9cfa006d2264620c3ce7131 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 14:20:30 +0300 Subject: [PATCH 04/10] rpath nvcc workaround --- common/make.defs.cuda | 1 + 1 file changed, 1 insertion(+) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index 2d515e81f..1017d6902 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -172,6 +172,7 @@ CUDAFLAGS+=--gpu-architecture=sm_90 #CUDAFLAGS+=-ccbin=g++-13 -lm #-lstdc++ #CUDAFLAGS+=--compiler-bindir=/opt/gcc/12.3.0/bin/ #CUDAFLAGS+=-forward-unknown-to-host-compiler -fopenmp +CUDAFLAGS+=--forward-unknown-to-host-compiler # rpath CUDAFLAGS+=-rdc=true # FIXES ptxas fatal : Unresolved extern function 'cudaCGGetIntrinsicHandle' #CUDAFLAGS+=-I${NVHPC_PATH}/math_libs/12.6/targets/$$(uname -m)-linux/include #CUDAFLAGS+=-L${NVHPC_PATH}/math_libs/12.6/targets/$$(uname -m)-linux/lib From 49ba387b90ae55a29249df26d8203036eaf2f32a Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 14:23:03 +0300 Subject: [PATCH 05/10] docker --- common/make.defs.cuda | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index 1017d6902..31f95d08b 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -1,7 +1,7 @@ # # This file shows the CUDA toolchain options # for both NVHPC and GCC. -NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/24.9 +NVHPC_PATH=/opt/nvidia/hpc_sdk/Linux_$$(uname -m)/25.7 #NVHPC_PATH=/proj/nv/Linux_$$(uname -m)/21.11 #NVHPC_PATH=${HOME}/NVIDIA/hpc_sdk/Linux_$$(uname -m)/2021 NVHPC_CBIN=${NVHPC_PATH}/compilers/bin/ @@ -232,7 +232,7 @@ CUDASTF_CFLAGS+=-lcuda # # mpiicc wraps icc. mpicc and mpigcc wrap gcc. 
#MPIDIR=${NVHPC_PATH}/comm_libs/hpcx -MPIDIR=${NVHPC_PATH}/comm_libs/12.6/openmpi4/latest +MPIDIR=${NVHPC_PATH}/comm_libs/12.9/openmpi4/latest MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx MPIFORT=${MPIDIR}/bin/mpifort From ce6a308fe88504a1acdb606993471297571bc187 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 14:41:27 +0300 Subject: [PATCH 06/10] ugh --- PYTHON/cuda_version_check.cpp | 84 +++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 PYTHON/cuda_version_check.cpp diff --git a/PYTHON/cuda_version_check.cpp b/PYTHON/cuda_version_check.cpp new file mode 100644 index 000000000..caef05890 --- /dev/null +++ b/PYTHON/cuda_version_check.cpp @@ -0,0 +1,84 @@ +#include +#include +#include + +int main() { + std::cout << "=== CUDA Version Information ===" << std::endl; + + // Get CUDA Runtime Version + int runtimeVersion; + cudaError_t runtimeResult = cudaRuntimeGetVersion(&runtimeVersion); + if (runtimeResult == cudaSuccess) { + int runtimeMajor = runtimeVersion / 1000; + int runtimeMinor = (runtimeVersion % 1000) / 10; + std::cout << "CUDA Runtime Version: " << runtimeMajor << "." << runtimeMinor + << " (raw: " << runtimeVersion << ")" << std::endl; + } else { + std::cout << "Error getting CUDA Runtime version: " + << cudaGetErrorString(runtimeResult) << std::endl; + } + + // Get CUDA Driver Version + int driverVersion; + cudaError_t driverResult = cudaDriverGetVersion(&driverVersion); + if (driverResult == cudaSuccess) { + int driverMajor = driverVersion / 1000; + int driverMinor = (driverVersion % 1000) / 10; + std::cout << "CUDA Driver Version: " << driverMajor << "." << driverMinor + << " (raw: " << driverVersion << ")" << std::endl; + } else { + std::cout << "Error getting CUDA Driver version: " + << cudaGetErrorString(driverResult) << std::endl; + } + + // Check compatibility + if (driverResult == cudaSuccess && runtimeResult == cudaSuccess) { + std::cout << "\nCompatibility Check:" << std::endl; + if (driverVersion >= runtimeVersion) { + std::cout << "✓ Driver and runtime versions are compatible" << std::endl; + } else { + std::cout << "✗ WARNING: Driver version is older than runtime!" << std::endl; + std::cout << " This may cause cudaErrorInsufficientDriver errors" << std::endl; + } + } + + // Get device information + int deviceCount; + cudaError_t deviceResult = cudaGetDeviceCount(&deviceCount); + if (deviceResult == cudaSuccess) { + std::cout << "\n=== Device Information ===" << std::endl; + std::cout << "Number of CUDA devices: " << deviceCount << std::endl; + + for (int i = 0; i < deviceCount; i++) { + cudaDeviceProp prop; + cudaError_t propResult = cudaGetDeviceProperties(&prop, i); + if (propResult == cudaSuccess) { + std::cout << "Device " << i << ": " << prop.name << std::endl; + std::cout << " Compute Capability: " << prop.major << "." 
<< prop.minor << std::endl; + std::cout << " Total Memory: " << prop.totalGlobalMem / (1024*1024*1024) << " GB" << std::endl; + std::cout << " Multiprocessors: " << prop.multiProcessorCount << std::endl; + } + } + } else { + std::cout << "Error getting device count: " + << cudaGetErrorString(deviceResult) << std::endl; + } + + // Alternative method using CUDA Driver API directly + std::cout << "\n=== Alternative Driver API Check ===" << std::endl; + CUresult cuResult = cuInit(0); + if (cuResult == CUDA_SUCCESS) { + int cuDriverVersion; + cuResult = cuDriverGetVersion(&cuDriverVersion); + if (cuResult == CUDA_SUCCESS) { + int cuDriverMajor = cuDriverVersion / 1000; + int cuDriverMinor = (cuDriverVersion % 1000) / 10; + std::cout << "CUDA Driver Version (Driver API): " << cuDriverMajor << "." << cuDriverMinor + << " (raw: " << cuDriverVersion << ")" << std::endl; + } + } else { + std::cout << "Failed to initialize CUDA Driver API" << std::endl; + } + + return 0; +} From fce1578d864d411b697c66dccf950aa57d4ccd85 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Tue, 26 Aug 2025 16:05:57 +0300 Subject: [PATCH 07/10] docker --- common/make.defs.cuda | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/common/make.defs.cuda b/common/make.defs.cuda index 31f95d08b..169b4280c 100644 --- a/common/make.defs.cuda +++ b/common/make.defs.cuda @@ -232,14 +232,15 @@ CUDASTF_CFLAGS+=-lcuda # # mpiicc wraps icc. mpicc and mpigcc wrap gcc. #MPIDIR=${NVHPC_PATH}/comm_libs/hpcx -MPIDIR=${NVHPC_PATH}/comm_libs/12.9/openmpi4/latest +#MPIDIR=${NVHPC_PATH}/comm_libs/12.9/openmpi4/latest +MPIDIR=/usr/local MPICC=${MPIDIR}/bin/mpicc MPICXX=${MPIDIR}/bin/mpicxx MPIFORT=${MPIDIR}/bin/mpifort MPIINC=-I${MPIDIR}/include MPILIB=-L${MPIDIR}/lib -lmpi #MPILIB+=-Wl,-rpath -Wl,${MPIDIR}/lib -Wl,--enable-new-dtags # NVCC chokes on -Wl -MPILIB+=-lopen-pal -lopen-rte +#MPILIB+=-lopen-pal -lopen-rte #MPILIB=-L/usr/local/opt/libevent/lib -L${MPIDIR}/lib -lmpi #MPIINC=-I/usr/include/mpich-3.2-$$(uname -m) #MPILIB=-L/usr/lib64/mpich-3.2/lib -lmpi From 81e1b866a1e019cc11a4aca4067238425b9c1b35 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 27 Aug 2025 10:56:05 +0300 Subject: [PATCH 08/10] fixes --- PYTHON/hello-nvshmem.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/PYTHON/hello-nvshmem.py b/PYTHON/hello-nvshmem.py index bf39b631f..d4b10dbb8 100644 --- a/PYTHON/hello-nvshmem.py +++ b/PYTHON/hello-nvshmem.py @@ -1,8 +1,8 @@ import numpy -import nvshmem.core as nvshmem from mpi4py import MPI from cuda.core.experimental import Device from cuda.core.experimental import system +import nvshmem.core as nvshmem # Initialize MPI comm = MPI.COMM_WORLD @@ -10,15 +10,15 @@ np = comm.Get_size() # Initialize NVSHMEM with MPI +dev = Device(me % system.num_devices) +dev.set_current() +nvshmem.init(device=dev, mpi_comm=comm, initializer_method="mpi") + +#uid = nvshmem.get_unique_id(empty=(me != 0)) +#comm.Bcast(uid._data.view(numpy.int8), root=0) #dev = Device() #dev.set_current() -#nvshmem.init(device=dev, mpi_comm=comm, initializer_method="mpi") - -uid = nvshmem.get_unique_id(empty=(me != 0)) -comm.Bcast(uid._data.view(numpy.int8), root=0) -dev = Device() -dev.set_current() -nvshmem.init(device=dev, uid=uid, rank=me, nranks=np, initializer_method="uid") +#nvshmem.init(device=dev, uid=uid, rank=me, nranks=np, initializer_method="uid") #dev = Device(me % system.num_devices) #dev.set_current() @@ -32,23 +32,24 @@ # Allocate symmetric memory # array() returns a CuPy NDArray 
object -#x = nvshmem.array((1024,), dtype="float32") -#y = nvshmem.array((1024,), dtype="float32") +x = nvshmem.array((1024,), dtype="float32") +y = nvshmem.array((1024,), dtype="float32") #if my_pe == 0: # y[:] = 1.0 # Perform communication operations # Put y from PE 0 into x on PE 1 -#if my_pe == 0: -# nvshmem.put(x, y, pe=1, stream=stream) +if my_pe == 0: + nvshmem.put(x, y, 1, stream=stream) # Synchronize PEs +nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream) stream.sync() # Clean up -#nvshmem.free_array(x) -#nvshmem.free_array(y) +nvshmem.free_array(x) +nvshmem.free_array(y) nvshmem.finalize() print('OK') From d7a63a01df97334e271fee1b903387db9de79055 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 27 Aug 2025 11:36:21 +0300 Subject: [PATCH 09/10] fix all the issues i hope --- PYTHON/nstream-cupy-nvshmem.py | 72 +++++++++++++++++----------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/PYTHON/nstream-cupy-nvshmem.py b/PYTHON/nstream-cupy-nvshmem.py index eb626b621..fbb2cf12a 100755 --- a/PYTHON/nstream-cupy-nvshmem.py +++ b/PYTHON/nstream-cupy-nvshmem.py @@ -75,35 +75,37 @@ import cupy -print('=== CUDA Version Information ===') - -try: - # Get CUDA runtime version - runtime_version = cupy.cuda.runtime.runtimeGetVersion() - runtime_major = runtime_version // 1000 - runtime_minor = (runtime_version % 1000) // 10 - print(f'CUDA Runtime Version: {runtime_major}.{runtime_minor} (raw: {runtime_version})') - - # Get CUDA driver version - driver_version = cupy.cuda.runtime.driverGetVersion() - driver_major = driver_version // 1000 - driver_minor = (driver_version % 1000) // 10 - print(f'CUDA Driver Version: {driver_major}.{driver_minor} (raw: {driver_version})') - - print(f'Version compatibility: Driver {driver_major}.{driver_minor} vs Runtime {runtime_major}.{runtime_minor}') - - if driver_version < runtime_version: - print('WARNING: Driver version is older than runtime version!') - print('This can cause \"cudaErrorInsufficientDriver\" errors.') - print('Consider updating your NVIDIA drivers.') - else: - print('Driver and runtime versions are compatible.') +if False: + print('=== CUDA Version Information ===') + + try: + # Get CUDA runtime version + runtime_version = cupy.cuda.runtime.runtimeGetVersion() + runtime_major = runtime_version // 1000 + runtime_minor = (runtime_version % 1000) // 10 + print(f'CUDA Runtime Version: {runtime_major}.{runtime_minor} (raw: {runtime_version})') + + # Get CUDA driver version + driver_version = cupy.cuda.runtime.driverGetVersion() + driver_major = driver_version // 1000 + driver_minor = (driver_version % 1000) // 10 + print(f'CUDA Driver Version: {driver_major}.{driver_minor} (raw: {driver_version})') + + print(f'Version compatibility: Driver {driver_major}.{driver_minor} vs Runtime {runtime_major}.{runtime_minor}') -except Exception as e: - print(f'Error: {e}') + if driver_version < runtime_version: + print('WARNING: Driver version is older than runtime version!') + print('This can cause \"cudaErrorInsufficientDriver\" errors.') + print('Consider updating your NVIDIA drivers.') + else: + print('Driver and runtime versions are compatible.') + + except Exception as e: + print(f'Error: {e}') print('This usually indicates CUDA driver/runtime compatibility issues.') from cuda.core.experimental import Device +from cuda.core.experimental import system import nvshmem.core as nvshmem @@ -111,7 +113,7 @@ def main(): # Initialize MPI and CUDA device comm = MPI.COMM_WORLD - local_rank = comm.Get_rank() % 8 # Assume max 8 GPUs per 
node + local_rank = comm.Get_rank() % system.num_devices device = Device(local_rank) device.set_current() stream = device.create_stream() @@ -165,8 +167,8 @@ def main(): print('Local vector length = ', length) # Barrier using NVSHMEM - nvshmem.barrier(stream=stream) - stream.synchronize() + nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream) + stream.sync() # ******************************************************************** # ** Allocate space for the input and execute STREAM triad @@ -188,16 +190,16 @@ def main(): for k in range(0, iterations+1): if k < 1: - nvshmem.barrier(stream=stream) - stream.synchronize() + nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream) + stream.sync() t0 = timer() # STREAM triad operation on GPU using CuPy operations A += B + scalar * C # Final synchronization - nvshmem.barrier(stream=stream) - stream.synchronize() + nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream) + stream.sync() t1 = timer() nstream_time = t1 - t0 @@ -224,8 +226,8 @@ def main(): dst[0] = 0.0 # Reduce across all PEs using NVSHMEM collective - nvshmem.reduce(dst, src, "sum", stream=stream) - stream.synchronize() + nvshmem.reduce(nvshmem.Teams.TEAM_WORLD, dst, src, op="sum", stream=stream) + stream.sync() asum_global = float(dst[0]) @@ -255,4 +257,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() From 409c868ae258e488708e0b33c31eb15babba8df0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Wed, 27 Aug 2025 12:05:50 +0300 Subject: [PATCH 10/10] hack to disable jit? --- PYTHON/nstream-cupy-nvshmem.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/PYTHON/nstream-cupy-nvshmem.py b/PYTHON/nstream-cupy-nvshmem.py index fbb2cf12a..a5335999e 100755 --- a/PYTHON/nstream-cupy-nvshmem.py +++ b/PYTHON/nstream-cupy-nvshmem.py @@ -196,6 +196,8 @@ def main(): # STREAM triad operation on GPU using CuPy operations A += B + scalar * C + # it seems like this is required to get proper timings - maybe some weird JiT thing happening + nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream) # Final synchronization nvshmem.barrier(nvshmem.Teams.TEAM_WORLD,stream=stream) @@ -243,7 +245,7 @@ def main(): print('Solution validates') avgtime = nstream_time / iterations nbytes = 4.0 * total_length * 8 # 8 bytes per double - print('Rate (MB/s): ', 1.e-6 * nbytes / avgtime, ' Avg time (s): ', avgtime) + print('Rate (GB/s): ', 1.e-9 * nbytes / avgtime, ' Avg time (s): ', avgtime) # Free NVSHMEM arrays nvshmem.free_array(A)
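
A minimal sketch of the nvshmem4py pattern these patches converge on (MPI
bootstrap, team-scoped barrier, and a team-scoped sum reduction used for
validation). It reuses only calls that appear in the patches above, but the
exact nvshmem4py signatures may differ between releases, and the launch
command in the first comment is an assumption rather than something the
patches specify, so treat this as illustrative rather than authoritative.

    # launched as, e.g.:  mpiexec -n 2 python3 PYTHON/nstream-cupy-nvshmem.py 10 100000000
    from mpi4py import MPI
    from cuda.core.experimental import Device, system
    import nvshmem.core as nvshmem

    comm = MPI.COMM_WORLD
    dev = Device(comm.Get_rank() % system.num_devices)  # one GPU per local rank
    dev.set_current()
    nvshmem.init(device=dev, mpi_comm=comm, initializer_method="mpi")
    stream = dev.create_stream()

    # symmetric buffers backed by CuPy arrays
    src = nvshmem.interop.cupy.array((1,), dtype="float64")
    dst = nvshmem.interop.cupy.array((1,), dtype="float64")
    src[0] = float(nvshmem.my_pe())

    # team-scoped barrier and sum reduction, both enqueued on the stream
    nvshmem.barrier(nvshmem.Teams.TEAM_WORLD, stream=stream)
    nvshmem.reduce(nvshmem.Teams.TEAM_WORLD, dst, src, op="sum", stream=stream)
    stream.sync()

    nvshmem.free_array(src)
    nvshmem.free_array(dst)
    nvshmem.finalize()

For the reported rate, nstream counts 4*N words of 8 bytes per iteration
(read A, read B, read C, write A); with N = 1e8 that is 3.2 GB moved per
iteration, so an iteration time of 0.01 s corresponds to 320 GB/s.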