quantumlib · mhucka · Mar 19, 2026 · Feb 13, 2026 · Feb 13, 2026 · Feb 13, 2026
diff --git a/lib/simulator_cuda.h b/lib/simulator_cuda.h
@@ -344,13 +344,13 @@ class SimulatorCUDA final {
 
     unsigned k = 5 + G;
     unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
+    uint64_t size = uint64_t{1} << n;
     unsigned threads = 64U;
-    unsigned blocks = std::max(1U, size / 2);
+    uint64_t blocks = std::max(uint64_t{1}, size / 2);
 
     IndicesH<G> d_i(d_ws);
 
-    ApplyGateH_Kernel<G><<<blocks, threads>>>(
+    ApplyGateH_Kernel<G><<<CreateGrid(blocks), threads>>>(
         (fp_type*) d_ws, d_i.xss, d_i.ms, state.get());
   }
 
@@ -368,13 +368,13 @@ class SimulatorCUDA final {
 
     unsigned k = 5 + num_effective_qs;
     unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
+    uint64_t size = uint64_t{1} << n;
     unsigned threads = 32;
-    unsigned blocks = size;
+    uint64_t blocks = size;
 
     IndicesL<G> d_i(d_ws);
 
-    ApplyGateL_Kernel<G><<<blocks, threads>>>(
+    ApplyGateL_Kernel<G><<<CreateGrid(blocks), threads>>>(
         (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
         1 << num_effective_qs, state.get());
   }
@@ -401,13 +401,13 @@ class SimulatorCUDA final {
 
     unsigned k = 5 + G + cqs.size();
     unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
+    uint64_t size = uint64_t{1} << n;
     unsigned threads = 64U;
-    unsigned blocks = std::max(1U, size / 2);
+    uint64_t blocks = std::max(uint64_t{1}, size / 2);
 
     IndicesH<G> d_i(d_ws);
 
-    ApplyControlledGateH_Kernel<G><<<blocks, threads>>>(
+    ApplyControlledGateH_Kernel<G><<<CreateGrid(blocks), threads>>>(
         (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get());
   }
 
@@ -426,13 +426,13 @@ class SimulatorCUDA final {
 
     unsigned k = 5 + G + cqs.size();
     unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
+    uint64_t size = uint64_t{1} << n;
     unsigned threads = 32;
-    unsigned blocks = size;
+    uint64_t blocks = size;
 
     IndicesL<G> d_i(d_ws);
 
-    ApplyControlledGateLH_Kernel<G><<<blocks, threads>>>(
+    ApplyControlledGateLH_Kernel<G><<<CreateGrid(blocks), threads>>>(
         (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
         d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get());
   }
@@ -452,13 +452,13 @@ class SimulatorCUDA final {
 
     unsigned k = 5 + G + cqs.size();
     unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
+    uint64_t size = uint64_t{1} << n;
     unsigned threads = 32;
-    unsigned blocks = size;
+    uint64_t blocks = size;
 
     IndicesLC<G> d_i(d_ws);
 
-    ApplyControlledGateL_Kernel<G><<<blocks, threads>>>(
+    ApplyControlledGateL_Kernel<G><<<CreateGrid(blocks), threads>>>(
         (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis,
         d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs,
         1 << (5 - d.remaining_low_cqs), state.get());
@@ -479,11 +479,11 @@ class SimulatorCUDA final {
 
     unsigned k = 5 + G;
     unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
+    uint64_t size = uint64_t{1} << n;
 
     unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U);
     unsigned threads = 64U;
-    unsigned blocks = std::max(1U, (size / 2) >> s);
+    uint64_t blocks = std::max(uint64_t{1}, (size / 2) >> s);
     unsigned num_iterations_per_block = 1 << s;
 
     constexpr unsigned m = 16;
@@ -493,7 +493,7 @@ class SimulatorCUDA final {
 
     IndicesH<G> d_i(d_ws);
 
-    ExpectationValueH_Kernel<G><<<blocks, threads>>>(
+    ExpectationValueH_Kernel<G><<<CreateGrid(blocks), threads>>>(
         (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block,
         state.get(), Plus<double>(), d_res1);
 
@@ -517,11 +517,11 @@ class SimulatorCUDA final {
 
     unsigned k = 5 + num_effective_qs;
     unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
+    uint64_t size = uint64_t{1} << n;
 
     unsigned s = std::min(n >= 13 ? n - 13 : 0, 5U);
     unsigned threads = 32;
-    unsigned blocks = size >> s;
+    uint64_t blocks = size >> s;
     unsigned num_iterations_per_block = 1 << s;
 
     constexpr unsigned m = 16;
@@ -531,7 +531,7 @@ class SimulatorCUDA final {
 
     IndicesL<G> d_i(d_ws);
 
-    ExpectationValueL_Kernel<G><<<blocks, threads>>>(
+    ExpectationValueL_Kernel<G><<<CreateGrid(blocks), threads>>>(
         (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
         num_iterations_per_block, state.get(), Plus<double>(), d_res1);
 
@@ -542,18 +542,18 @@ class SimulatorCUDA final {
 
   template <unsigned m>
   std::complex<double> ExpectationValueReduceFinal(
-      unsigned blocks, double mul,
+      uint64_t blocks, double mul,
       const Complex* d_res1, Complex* d_res2) const {
     Complex res2[m];
 
     if (blocks <= 16) {
       ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex),
                             cudaMemcpyDeviceToHost));
     } else {
-      unsigned threads2 = std::min(1024U, blocks);
-      unsigned blocks2 = std::min(m, blocks / threads2);
+      unsigned threads2 = std::min(uint64_t{1024}, blocks);
+      uint64_t blocks2 = std::min(uint64_t{m}, blocks / threads2);
 
-      unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2));
+      unsigned dblocks = std::max(uint64_t{1}, blocks / (blocks2 * threads2));
       unsigned bytes = threads2 * sizeof(Complex);
 
       Reduce2Kernel<Complex><<<blocks2, threads2, bytes>>>(
@@ -568,7 +568,7 @@ class SimulatorCUDA final {
     double re = 0;
     double im = 0;
 
-    for (unsigned i = 0; i < blocks; ++i) {
+    for (uint64_t i = 0; i < blocks; ++i) {
       re += res2[i].re;
       im += res2[i].im;
     }

diff --git a/lib/simulator_cuda_kernels.h b/lib/simulator_cuda_kernels.h
@@ -18,13 +18,13 @@
 #ifdef __NVCC__
   #include <cuda.h>
   #include <cuda_runtime.h>
-
-  #include "util_cuda.h"
 #elif __HIP__
   #include <hip/hip_runtime.h>
   #include "cuda2hip.h"
 #endif
 
+#include "util_cuda.h"
+
 namespace qsim {
 
 template <unsigned G, typename fp_type, typename idx_type>
@@ -33,6 +33,8 @@ __global__ void ApplyGateH_Kernel(
     const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) {
   // blockDim.x must be equal to 64.
 
+  uint64_t blockId = GetBlockId();
+
   static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
 
   constexpr unsigned gsize = 1 << G;
@@ -61,7 +63,7 @@ __global__ void ApplyGateH_Kernel(
 
   __syncthreads();
 
-  idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
+  idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0;
   idx_type ii = i & mss[0];
   for (unsigned j = 1; j <= G; ++j) {
     i *= 2;
@@ -115,6 +117,8 @@ __global__ void ApplyGateL_Kernel(
     fp_type* __restrict__ rstate) {
   // blockDim.x must be equal to 32.
 
+  uint64_t blockId = GetBlockId();
+
   static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
 
   constexpr unsigned gsize = 1 << G;
@@ -137,7 +141,7 @@ __global__ void ApplyGateL_Kernel(
     }
   }
 
-  idx_type i = 32 * idx_type{blockIdx.x};
+  idx_type i = 32 * idx_type{blockId};
   idx_type ii = i & mss[0];
   for (unsigned j = 1; j <= G; ++j) {
     i *= 2;
@@ -204,6 +208,8 @@ __global__ void ApplyControlledGateH_Kernel(
     fp_type* __restrict__ rstate) {
   // blockDim.x must be equal to 64.
 
+  uint64_t blockId = GetBlockId();
+
   static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
 
   constexpr unsigned gsize = 1 << G;
@@ -232,7 +238,7 @@ __global__ void ApplyControlledGateH_Kernel(
 
   __syncthreads();
 
-  idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
+  idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0;
   idx_type ii = i & mss[0];
   for (unsigned j = 1; j < num_mss; ++j) {
     i *= 2;
@@ -288,6 +294,8 @@ __global__ void ApplyControlledGateLH_Kernel(
     unsigned esize, fp_type* __restrict__ rstate) {
   // blockDim.x must be equal to 32.
 
+  uint64_t blockId = GetBlockId();
+
   static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
 
   constexpr unsigned gsize = 1 << G;
@@ -300,7 +308,7 @@ __global__ void ApplyControlledGateLH_Kernel(
   __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
   __shared__ fp_type v[2 * gsize * rows];
 
-  idx_type i = 32 * idx_type{blockIdx.x};
+  idx_type i = 32 * idx_type{blockId};
   idx_type ii = i & mss[0];
   for (unsigned j = 1; j < num_mss; ++j) {
     i *= 2;
@@ -381,6 +389,8 @@ __global__ void ApplyControlledGateL_Kernel(
     fp_type* __restrict__ rstate) {
   // blockDim.x must be equal to 32.
 
+  uint64_t blockId = GetBlockId();
+
   static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
 
   constexpr unsigned gsize = 1 << G;
@@ -393,7 +403,7 @@ __global__ void ApplyControlledGateL_Kernel(
   __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
   __shared__ fp_type v[2 * gsize * rows];
 
-  idx_type i = 32 * idx_type{blockIdx.x};
+  idx_type i = 32 * idx_type{blockId};
   idx_type ii = i & mss[0];
   for (unsigned j = 1; j < num_mss; ++j) {
     i *= 2;
@@ -477,6 +487,8 @@ __global__ void ExpectationValueH_Kernel(
     const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
   // blockDim.x must be equal to 64.
 
+  uint64_t blockId = GetBlockId();
+
   static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
 
   constexpr unsigned gsize = 1 << G;
@@ -508,7 +520,7 @@ __global__ void ExpectationValueH_Kernel(
   double im = 0;
 
   for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) {
-    idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter;
+    idx_type b = num_iterations_per_block * idx_type{blockId} + iter;
 
     idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0;
     idx_type ii = i & mss[0];
@@ -573,8 +585,8 @@ __global__ void ExpectationValueH_Kernel(
   __syncthreads();
 
   if (threadIdx.x == 0) {
-    result[blockIdx.x].re = partial2[0].re + partial2[1].re;
-    result[blockIdx.x].im = partial2[0].im + partial2[1].im;
+    result[blockId].re = partial2[0].re + partial2[1].re;
+    result[blockId].im = partial2[0].im + partial2[1].im;
   }
 }
 
@@ -587,6 +599,8 @@ __global__ void ExpectationValueL_Kernel(
     const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
   // blockDim.x must be equal to 32.
 
+  uint64_t blockId = GetBlockId();
+
   static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
 
   constexpr unsigned gsize = 1 << G;
@@ -612,7 +626,7 @@ __global__ void ExpectationValueL_Kernel(
   double im = 0;
 
   for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) {
-    idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter);
+    idx_type i = 32 * (num_iterations_per_block * idx_type{blockId} + iter);
     idx_type ii = i & mss[0];
     for (unsigned j = 1; j <= G; ++j) {
       i *= 2;
@@ -673,8 +687,8 @@ __global__ void ExpectationValueL_Kernel(
   auto val = WarpReduce(partial[threadIdx.x], op);
 
   if (threadIdx.x == 0) {
-    result[blockIdx.x].re = val.re;
-    result[blockIdx.x].im = val.im;
+    result[blockId].re = val.re;
+    result[blockId].im = val.im;
   }
 }