From 4c272a15a8690aa053cc12f5d29babda0bab0b68 Mon Sep 17 00:00:00 2001 From: sun-9545sunoj Date: Fri, 13 Feb 2026 09:29:52 +0000 Subject: [PATCH 01/10] fix: resolve #993 - remove formatting noise and isolate MI300X scaling logic --- apps/qsimh_base_cuda.cu | 220 ++++++++++++++++++++++++++++++++++ lib/simulator_cuda.h | 38 +++--- lib/simulator_cuda_kernels.h | 43 +++++-- lib/statespace_cuda.h | 43 +++++-- lib/statespace_cuda_kernels.h | 27 +++-- lib/vectorspace_cuda.h | 47 +++++--- 6 files changed, 349 insertions(+), 69 deletions(-) create mode 100644 apps/qsimh_base_cuda.cu diff --git a/apps/qsimh_base_cuda.cu b/apps/qsimh_base_cuda.cu new file mode 100644 index 000000000..e82528212 --- /dev/null +++ b/apps/qsimh_base_cuda.cu @@ -0,0 +1,220 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include +#include + +#include "../lib/bitstring.h" +#include "../lib/circuit_qsim_parser.h" +#include "../lib/formux.h" +#include "../lib/fuser_basic.h" +#include "../lib/gates_qsim.h" +#include "../lib/io_file.h" +#include "../lib/run_qsimh.h" +#include "../lib/simmux.h" +#include "../lib/simulator_cuda.h" +#include "../lib/util.h" +#include "../lib/util_cpu.h" + +constexpr char usage[] = + "usage:\n ./qsimh_base_hip.x -c circuit_file " + "-d maximum_time -k part1_qubits " + "-w prefix -p num_prefix_gates -r num_root_gates " + "-t num_threads -n num_dblocks -v verbosity -z\n"; + +struct Options { + std::string circuit_file; + std::vector part1; + uint64_t prefix; + unsigned maxtime = std::numeric_limits::max(); + unsigned num_prefix_gatexs = 0; + unsigned num_root_gatexs = 0; + unsigned num_threads = 256; + unsigned num_dblocks = 16; + unsigned verbosity = 0; + bool denormals_are_zeros = false; +}; + +Options GetOptions(int argc, char* argv[]) { + Options opt; + + int k; + + auto to_int = [](const std::string& word) -> unsigned { + return std::stoul(word); + }; + + while ((k = getopt(argc, argv, "c:d:k:w:p:r:t:n:v:z")) != -1) { + switch (k) { + case 'c': + opt.circuit_file = optarg; + break; + case 'd': + opt.maxtime = std::stoul(optarg); + break; + case 'k': + qsim::SplitString(optarg, ',', to_int, opt.part1); + break; + case 'w': + opt.prefix = std::stoull(optarg); + break; + case 'p': + opt.num_prefix_gatexs = std::stoul(optarg); + break; + case 'r': + opt.num_root_gatexs = std::stoul(optarg); + break; + case 't': + opt.num_threads = std::stoul(optarg); + break; + case 'n': + opt.num_dblocks = std::stoul(optarg); + break; + case 'v': + opt.verbosity = std::stoul(optarg); + break; + case 'z': + opt.denormals_are_zeros = true; + break; + default: + qsim::IO::errorf(usage); + exit(1); + } + } + + return opt; +} + +bool ValidateOptions(const Options& opt) { + if (opt.circuit_file.empty()) { + 
qsim::IO::errorf("circuit file is not provided.\n"); + qsim::IO::errorf(usage); + return false; + } + + return true; +} + +bool ValidatePart1(unsigned num_qubits, const std::vector& part1) { + for (std::size_t i = 0; i < part1.size(); ++i) { + if (part1[i] >= num_qubits) { + qsim::IO::errorf("part 1 qubit indices are too large.\n"); + return false; + } + } + + return true; +} + +std::vector GetParts( + unsigned num_qubits, const std::vector& part1) { + std::vector parts(num_qubits, 0); + + for (std::size_t i = 0; i < part1.size(); ++i) { + parts[part1[i]] = 1; + } + + return parts; +} + +int main(int argc, char* argv[]) { + using namespace qsim; + + auto opt = GetOptions(argc, argv); + if (!ValidateOptions(opt)) { + return 1; + } + + Circuit> circuit; + if (!CircuitQsimParser::FromFile( + opt.maxtime, opt.circuit_file, circuit)) { + return 1; + } + + if (!ValidatePart1(circuit.num_qubits, opt.part1)) { + return 1; + } + auto parts = GetParts(circuit.num_qubits, opt.part1); + + if (opt.denormals_are_zeros) { + SetFlushToZeroAndDenormalsAreZeros(); + } + + uint64_t num_bitstrings = + std::min(uint64_t{8}, uint64_t{1} << circuit.num_qubits); + + std::vector bitstrings; + bitstrings.reserve(num_bitstrings); + for (std::size_t i = 0; i < num_bitstrings; ++i) { + bitstrings.push_back(i); + } + + struct Factory { + using Simulator = qsim::SimulatorCUDA; + using StateSpace = Simulator::StateSpace; + using fp_type = Simulator::fp_type; + + Factory(const StateSpace::Parameter& param) : param(param) {} + + StateSpace CreateStateSpace() const { return StateSpace(param); } + + Simulator CreateSimulator() const { return Simulator(); } + + const StateSpace::Parameter& param; + }; + + using HybridSimulator = + HybridSimulator, BasicGateFuser, For>; + using Runner = QSimHRunner; + + Runner::Parameter param; + param.prefix = opt.prefix; + param.num_prefix_gatexs = opt.num_prefix_gatexs; + param.num_root_gatexs = opt.num_root_gatexs; + param.num_threads = + opt.num_threads; // This 
is reused for StateSpaceCUDA params implicitly + // if not careful, but here we separate. + param.verbosity = opt.verbosity; + + std::vector> results(num_bitstrings, 0); + + // Setup CUDA parameters + Factory::StateSpace::Parameter cuda_param; + cuda_param.num_threads = opt.num_threads; + cuda_param.num_dblocks = opt.num_dblocks; + + Factory factory(cuda_param); + + if (Runner::Run(param, factory, circuit, parts, bitstrings, results)) { + static constexpr char const* bits[8] = { + "000", "001", "010", "011", "100", "101", "110", "111", + }; + + unsigned s = 3 - std::min(unsigned{3}, circuit.num_qubits); + + for (std::size_t i = 0; i < num_bitstrings; ++i) { + const auto& a = results[i]; + qsim::IO::messagef( + "%s:%16.8g%16.8g%16.8g\n", bits[i] + s, std::real(a), std::imag(a), + std::norm(a)); + } + } + + return 0; +} diff --git a/lib/simulator_cuda.h b/lib/simulator_cuda.h index 5743bea8b..de9106bcf 100644 --- a/lib/simulator_cuda.h +++ b/lib/simulator_cuda.h @@ -350,8 +350,8 @@ class SimulatorCUDA final { IndicesH d_i(d_ws); - ApplyGateH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, state.get()); + ApplyGateH_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, state.get()); } template @@ -374,8 +374,8 @@ class SimulatorCUDA final { IndicesL d_i(d_ws); - ApplyGateL_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + ApplyGateL_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, 1 << num_effective_qs, state.get()); } @@ -407,8 +407,8 @@ class SimulatorCUDA final { IndicesH d_i(d_ws); - ApplyControlledGateH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); + ApplyControlledGateH_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); } template @@ -432,9 +432,9 @@ class SimulatorCUDA final { IndicesL d_i(d_ws); - ApplyControlledGateLH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, - d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get()); + 
ApplyControlledGateLH_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d.num_aqs + 1, + d.cvalsh, 1 << d.num_effective_qs, state.get()); } template @@ -458,8 +458,8 @@ class SimulatorCUDA final { IndicesLC d_i(d_ws); - ApplyControlledGateL_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, + ApplyControlledGateL_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, 1 << (5 - d.remaining_low_cqs), state.get()); } @@ -493,9 +493,9 @@ class SimulatorCUDA final { IndicesH d_i(d_ws); - ExpectationValueH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block, - state.get(), Plus(), d_res1); + ExpectationValueH_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, num_iterations_per_block, state.get(), + Plus(), d_res1); double mul = size == 1 ? 0.5 : 1.0; @@ -531,8 +531,8 @@ class SimulatorCUDA final { IndicesL d_i(d_ws); - ExpectationValueL_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + ExpectationValueL_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, num_iterations_per_block, state.get(), Plus(), d_res1); double mul = double(1 << (5 + num_effective_qs - G)) / 32; @@ -895,6 +895,12 @@ class SimulatorCUDA final { return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs}; } + static dim3 CreateGrid(uint64_t blocks) { + if (blocks <= 65535) return dim3(blocks); + uint32_t x = 65535; + uint32_t y = (blocks + x - 1) / x; + return dim3(x, y); + } void* AllocScratch(uint64_t size) const { if (size > scratch_size_) { diff --git a/lib/simulator_cuda_kernels.h b/lib/simulator_cuda_kernels.h index e21a9d62e..cfc9419b1 100644 --- a/lib/simulator_cuda_kernels.h +++ b/lib/simulator_cuda_kernels.h @@ -33,6 +33,9 @@ __global__ void ApplyGateH_Kernel( const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) { // blockDim.x must be equal to 64. 
+ uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -61,7 +64,7 @@ __global__ void ApplyGateH_Kernel( __syncthreads(); - idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; + idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0; idx_type ii = i & mss[0]; for (unsigned j = 1; j <= G; ++j) { i *= 2; @@ -115,6 +118,9 @@ __global__ void ApplyGateL_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. + uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -137,7 +143,7 @@ __global__ void ApplyGateL_Kernel( } } - idx_type i = 32 * idx_type{blockIdx.x}; + idx_type i = 32 * idx_type{blockId}; idx_type ii = i & mss[0]; for (unsigned j = 1; j <= G; ++j) { i *= 2; @@ -204,6 +210,9 @@ __global__ void ApplyControlledGateH_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 64. + uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -232,7 +241,7 @@ __global__ void ApplyControlledGateH_Kernel( __syncthreads(); - idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; + idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0; idx_type ii = i & mss[0]; for (unsigned j = 1; j < num_mss; ++j) { i *= 2; @@ -288,6 +297,9 @@ __global__ void ApplyControlledGateLH_Kernel( unsigned esize, fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. 
+ uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -300,7 +312,7 @@ __global__ void ApplyControlledGateLH_Kernel( __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; __shared__ fp_type v[2 * gsize * rows]; - idx_type i = 32 * idx_type{blockIdx.x}; + idx_type i = 32 * idx_type{blockId}; idx_type ii = i & mss[0]; for (unsigned j = 1; j < num_mss; ++j) { i *= 2; @@ -381,6 +393,9 @@ __global__ void ApplyControlledGateL_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. + uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -393,7 +408,7 @@ __global__ void ApplyControlledGateL_Kernel( __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; __shared__ fp_type v[2 * gsize * rows]; - idx_type i = 32 * idx_type{blockIdx.x}; + idx_type i = 32 * idx_type{blockId}; idx_type ii = i & mss[0]; for (unsigned j = 1; j < num_mss; ++j) { i *= 2; @@ -477,6 +492,9 @@ __global__ void ExpectationValueH_Kernel( const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { // blockDim.x must be equal to 64. 
+ uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -508,7 +526,7 @@ __global__ void ExpectationValueH_Kernel( double im = 0; for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) { - idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter; + idx_type b = num_iterations_per_block * idx_type{blockId} + iter; idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0; idx_type ii = i & mss[0]; @@ -573,8 +591,8 @@ __global__ void ExpectationValueH_Kernel( __syncthreads(); if (threadIdx.x == 0) { - result[blockIdx.x].re = partial2[0].re + partial2[1].re; - result[blockIdx.x].im = partial2[0].im + partial2[1].im; + result[blockId].re = partial2[0].re + partial2[1].re; + result[blockId].im = partial2[0].im + partial2[1].im; } } @@ -587,6 +605,9 @@ __global__ void ExpectationValueL_Kernel( const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { // blockDim.x must be equal to 32. 
+ uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -612,7 +633,7 @@ __global__ void ExpectationValueL_Kernel( double im = 0; for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) { - idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter); + idx_type i = 32 * (num_iterations_per_block * idx_type{blockId} + iter); idx_type ii = i & mss[0]; for (unsigned j = 1; j <= G; ++j) { i *= 2; @@ -673,8 +694,8 @@ __global__ void ExpectationValueL_Kernel( auto val = WarpReduce(partial[threadIdx.x], op); if (threadIdx.x == 0) { - result[blockIdx.x].re = val.re; - result[blockIdx.x].im = val.im; + result[blockId].re = val.re; + result[blockId].im = val.im; } } diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h index 660db074c..865ea006b 100644 --- a/lib/statespace_cuda.h +++ b/lib/statespace_cuda.h @@ -50,6 +50,7 @@ class StateSpaceCUDA : unsigned threads; unsigned dblocks; unsigned blocks; + dim3 device_grid; }; public: @@ -82,6 +83,16 @@ class StateSpaceCUDA : return std::max(uint64_t{64}, 2 * (uint64_t{1} << num_qubits)); }; + bool IsNull(const State& state) const { return state.get() == nullptr; } + + void DeviceSync() const { ErrorCheck(cudaDeviceSynchronize()); } + + void Copy(const State& src, State& dest) const { + ErrorCheck(cudaMemcpy( + dest.get(), src.get(), MinSize(src.num_qubits()) * sizeof(fp_type), + cudaMemcpyDeviceToDevice)); + } + void InternalToNormalOrder(State& state) const { uint64_t size = MinSize(state.num_qubits()) / 2; @@ -89,7 +100,8 @@ class StateSpaceCUDA : unsigned blocks = size / threads; unsigned bytes = 2 * threads * sizeof(fp_type); - InternalToNormalOrderKernel<<>>(state.get()); + InternalToNormalOrderKernel<<>>( + state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -101,7 +113,8 @@ 
class StateSpaceCUDA : unsigned blocks = size / threads; unsigned bytes = 2 * threads * sizeof(fp_type); - NormalToInternalOrderKernel<<>>(state.get()); + NormalToInternalOrderKernel<<>>( + state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -121,7 +134,8 @@ class StateSpaceCUDA : fp_type v = double{1} / std::sqrt(hsize); - SetStateUniformKernel<<>>(v, hsize, state.get()); + SetStateUniformKernel<<>>( + v, hsize, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -180,7 +194,7 @@ class StateSpaceCUDA : unsigned threads = std::min(size, uint64_t{param_.num_threads}); unsigned blocks = size / threads; - BulkSetAmplKernel<<>>( + BulkSetAmplKernel<<>>( mask, bits, re, im, exclude, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -197,7 +211,7 @@ class StateSpaceCUDA : unsigned threads = std::min(size, uint64_t{param_.num_threads}); unsigned blocks = size / threads; - AddKernel<<>>(src.get(), dest.get()); + AddKernel<<>>(src.get(), dest.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -211,7 +225,7 @@ class StateSpaceCUDA : unsigned threads = std::min(size, uint64_t{param_.num_threads}); unsigned blocks = size / threads; - MultiplyKernel<<>>(a, state.get()); + MultiplyKernel<<>>(a, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -262,7 +276,7 @@ class StateSpaceCUDA : auto op1 = RealProduct(); auto op2 = Plus(); - Reduce1Kernel<<>>( + Reduce1Kernel<<>>( g1.dblocks, op1, op2, op2, state.get(), state.get(), d_res1); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -278,7 +292,7 @@ class StateSpaceCUDA : auto op3 = Plus(); - Reduce2Kernel<<>>( + Reduce2Kernel<<>>( g2.dblocks, g1.blocks, op3, op3, d_res1, d_res2); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -321,7 +335,8 @@ class StateSpaceCUDA : unsigned threads = 
std::min(size, uint64_t{param_.num_threads}); unsigned blocks = size / threads; - CollapseKernel<<>>(mr.mask, mr.bits, renorm, state.get()); + CollapseKernel<<>>( + mr.mask, mr.bits, renorm, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -337,7 +352,7 @@ class StateSpaceCUDA : auto op1 = RealProduct(); auto op2 = Plus(); - Reduce1Kernel<<>>( + Reduce1Kernel<<>>( g.dblocks, op1, op2, op2, state.get(), state.get(), d_res); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -391,6 +406,7 @@ class StateSpaceCUDA : grid.threads = std::min(size, uint64_t{param_.num_threads}); grid.dblocks = std::min(size / grid.threads, uint64_t{param_.num_dblocks}); grid.blocks = size / (grid.threads * grid.dblocks); + grid.device_grid = CreateGrid(grid.blocks); return grid; } @@ -401,6 +417,7 @@ class StateSpaceCUDA : grid.threads = std::min(param_.num_threads, std::max(32U, size)); grid.dblocks = std::max(1U, size / grid.threads); grid.blocks = 1; + grid.device_grid = dim3(1, 1, 1); return grid; } @@ -426,10 +443,10 @@ class StateSpaceCUDA : auto op3 = Plus::type>(); if (mask == 0) { - Reduce1Kernel<<>>( + Reduce1Kernel<<>>( g1.dblocks, op1, op2, op3, state1.get(), state2.get(), d_res1); } else { - Reduce1MaskedKernel<<>>( + Reduce1MaskedKernel<<>>( g1.dblocks, mask, bits, op1, op2, op3, state1.get(), state2.get(), d_res1); } @@ -448,7 +465,7 @@ class StateSpaceCUDA : auto op2 = Plus(); auto op3 = Plus::type>(); - Reduce2Kernel<<>>( + Reduce2Kernel<<>>( g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); diff --git a/lib/statespace_cuda_kernels.h b/lib/statespace_cuda_kernels.h index 0bc4ba706..88ad89911 100644 --- a/lib/statespace_cuda_kernels.h +++ b/lib/statespace_cuda_kernels.h @@ -39,7 +39,7 @@ __device__ __forceinline__ FP1 BlockReduce1( unsigned warp = threadIdx.x / warp_size; unsigned lane = threadIdx.x % warp_size; - uint64_t k0 = 2 * n * 
blockIdx.x * blockDim.x + 2 * tid - lane; + uint64_t k0 = 2 * n * qsim::GetBlockId() * blockDim.x + 2 * tid - lane; uint64_t k1 = k0 + 2 * n * blockDim.x; FP1 r; @@ -88,7 +88,7 @@ __device__ __forceinline__ FP1 BlockReduce1Masked( unsigned warp = threadIdx.x / warp_size; unsigned lane = threadIdx.x % warp_size; - uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; + uint64_t k0 = 2 * n * qsim::GetBlockId() * blockDim.x + 2 * tid - lane; uint64_t k1 = k0 + 2 * n * blockDim.x; FP1 r = 0; @@ -137,7 +137,7 @@ __device__ __forceinline__ FP1 BlockReduce2( FP1* partial1 = (FP1*) shared; unsigned tid = threadIdx.x; - uint64_t k0 = n * blockIdx.x * blockDim.x + tid; + uint64_t k0 = n * qsim::GetBlockId() * blockDim.x + tid; uint64_t k1 = k0 + n * blockDim.x; FP1 r = 0; @@ -185,7 +185,7 @@ __global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3, FP1 sum = detail::BlockReduce1(n, op1, op2, op3, s1, s2); if (threadIdx.x == 0) { - result[blockIdx.x] = sum; + result[qsim::GetBlockId()] = sum; } } @@ -198,7 +198,7 @@ __global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits, detail::BlockReduce1Masked(n, mask, bits, op1, op2, op3, s1, s2); if (threadIdx.x == 0) { - result[blockIdx.x] = sum; + result[qsim::GetBlockId()] = sum; } } @@ -209,7 +209,7 @@ __global__ void Reduce2Kernel( FP1 sum = detail::BlockReduce2(n, size, op2, op3, s); if (threadIdx.x == 0) { - result[blockIdx.x] = sum; + result[qsim::GetBlockId()] = sum; } } @@ -217,7 +217,7 @@ template __global__ void InternalToNormalOrderKernel(FP* state) { unsigned lane = threadIdx.x % warp_size; unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; + uint64_t k = 2 * uint64_t{qsim::GetBlockId()} * blockDim.x + l; extern __shared__ float shared[]; FP* buf = (FP*) shared; @@ -235,7 +235,7 @@ template __global__ void NormalToInternalOrderKernel(FP* state) { unsigned lane = threadIdx.x % warp_size; unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 
2 * uint64_t{blockIdx.x} * blockDim.x + l; + uint64_t k = 2 * uint64_t{qsim::GetBlockId()} * blockDim.x + l; extern __shared__ float shared[]; FP* buf = (FP*) shared; @@ -252,7 +252,8 @@ __global__ void NormalToInternalOrderKernel(FP* state) { template __global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { unsigned lane = threadIdx.x % warp_size; - uint64_t k = 2 * (uint64_t{blockIdx.x} * blockDim.x + threadIdx.x) - lane; + uint64_t k = + 2 * (uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x) - lane; state[k] = lane < size ? v : 0; state[k + warp_size] = 0; @@ -260,19 +261,19 @@ __global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { template __global__ void AddKernel(const FP* state1, FP* state2) { - uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + uint64_t k = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; state2[k] += state1[k]; } template __global__ void MultiplyKernel(FP a, FP* state) { - uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + uint64_t k = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; state[k] *= a; } template __global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { - uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + uint64_t k1 = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; if ((k1 & mask) == bits) { @@ -287,7 +288,7 @@ __global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { template __global__ void BulkSetAmplKernel( uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) { - uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + uint64_t k1 = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; bool set = ((k1 & mask) == bits) ^ exclude; diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h index 5cfd4e834..dd67661e2 100644 --- a/lib/vectorspace_cuda.h +++ 
b/lib/vectorspace_cuda.h @@ -28,24 +28,30 @@ namespace qsim { -namespace detail { +// Use a unique detail namespace to avoid collision with vectorspace.h +namespace cuda_detail { inline void do_not_free(void*) {} - inline void free(void* ptr) { - ErrorCheck(cudaFree(ptr)); + if (ptr != nullptr) { +#ifdef __NVCC__ + ErrorCheck(cudaFree(ptr)); +#elif __HIP__ + // Using the qsim ErrorCheck wrapper for HIP + ErrorCheck(hipFree(ptr)); +#endif + } } -} // namespace detail +} // namespace cuda_detail // Routines for vector manipulations. template class VectorSpaceCUDA { public: using fp_type = FP; - - private: - using Pointer = std::unique_ptr; + // Define Pointer with a clear function pointer type for the deleter + using Pointer = std::unique_ptr; public: class Vector { @@ -86,11 +92,24 @@ class VectorSpaceCUDA { static Vector Create(unsigned num_qubits) { fp_type* p; - auto size = sizeof(fp_type) * Impl::MinSize(num_qubits); + // Use explicit 64-bit calculation for large simulations (>30 qubits) to prevent overflow. + // Otherwise, keep original behavior to minimize impact. + uint64_t size; + if (num_qubits > 30) { + size = uint64_t{sizeof(fp_type)} * Impl::MinSize(num_qubits); + } else { + size = sizeof(fp_type) * Impl::MinSize(num_qubits); + } + +// Ensure we use the correct API based on the compiler +#ifdef __NVCC__ auto rc = cudaMalloc(&p, size); +#elif __HIP__ + auto rc = hipMalloc(&p, size); +#endif - if (rc == cudaSuccess) { - return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; + if (rc == 0) { // Success + return Vector{Pointer{(fp_type*)p, &cuda_detail::free}, num_qubits}; } else { return Null(); } @@ -99,15 +118,11 @@ class VectorSpaceCUDA { // It is the client's responsibility to make sure that p has at least // Impl::MinSize(num_qubits) elements. 
static Vector Create(fp_type* p, unsigned num_qubits) { - return Vector{Pointer{p, &detail::do_not_free}, num_qubits}; + return Vector{Pointer{p, &cuda_detail::do_not_free}, num_qubits}; } static Vector Null() { - return Vector{Pointer{nullptr, &detail::free}, 0}; - } - - static bool IsNull(const Vector& vector) { - return vector.get() == nullptr; + return Vector{Pointer{nullptr, &cuda_detail::free}, 0}; } static void Free(fp_type* ptr) { From 8edca1d99636231a4e07601fe734a4f597ff59e4 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Sat, 14 Feb 2026 00:30:07 +0530 Subject: [PATCH 02/10] Update copyright year and usage message for CUDA --- apps/qsimh_base_cuda.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/qsimh_base_cuda.cu b/apps/qsimh_base_cuda.cu index e82528212..951ad7688 100644 --- a/apps/qsimh_base_cuda.cu +++ b/apps/qsimh_base_cuda.cu @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC. All Rights Reserved. +// Copyright 2026 Google LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ #include "../lib/util_cpu.h" constexpr char usage[] = - "usage:\n ./qsimh_base_hip.x -c circuit_file " + "usage:\n ./qsimh_base_cuda.x -c circuit_file " "-d maximum_time -k part1_qubits " "-w prefix -p num_prefix_gates -r num_root_gates " "-t num_threads -n num_dblocks -v verbosity -z\n"; From de169cc48de09b7e2824bf360943a60d24572527 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Sat, 14 Feb 2026 00:58:41 +0530 Subject: [PATCH 03/10] Refactor memory management namespace and functions requested changes are done! 
--- lib/vectorspace_cuda.h | 44 ++++++++++++------------------------------ 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h index dd67661e2..d9ccff76e 100644 --- a/lib/vectorspace_cuda.h +++ b/lib/vectorspace_cuda.h @@ -28,30 +28,24 @@ namespace qsim { -// Use a unique detail namespace to avoid collision with vectorspace.h -namespace cuda_detail { +namespace detail { inline void do_not_free(void*) {} + inline void free(void* ptr) { - if (ptr != nullptr) { -#ifdef __NVCC__ - ErrorCheck(cudaFree(ptr)); -#elif __HIP__ - // Using the qsim ErrorCheck wrapper for HIP - ErrorCheck(hipFree(ptr)); -#endif - } + ErrorCheck(cudaFree(ptr)); } -} // namespace cuda_detail +} // namespace detail // Routines for vector manipulations. template class VectorSpaceCUDA { public: using fp_type = FP; - // Define Pointer with a clear function pointer type for the deleter - using Pointer = std::unique_ptr; + + private: + using Pointer = std::unique_ptr; public: class Vector { @@ -92,24 +86,10 @@ class VectorSpaceCUDA { static Vector Create(unsigned num_qubits) { fp_type* p; - // Use explicit 64-bit calculation for large simulations (>30 qubits) to prevent overflow. - // Otherwise, keep original behavior to minimize impact. 
- uint64_t size; - if (num_qubits > 30) { - size = uint64_t{sizeof(fp_type)} * Impl::MinSize(num_qubits); - } else { - size = sizeof(fp_type) * Impl::MinSize(num_qubits); - } - -// Ensure we use the correct API based on the compiler -#ifdef __NVCC__ + uint64_t size = uint64_t{sizeof(fp_type)} * Impl::MinSize(num_qubits); auto rc = cudaMalloc(&p, size); -#elif __HIP__ - auto rc = hipMalloc(&p, size); -#endif - - if (rc == 0) { // Success - return Vector{Pointer{(fp_type*)p, &cuda_detail::free}, num_qubits}; + if (rc == cudaSuccess) { + return Vector{Pointer{(fp_type*)p, &detail::free}, num_qubits}; } else { return Null(); } @@ -118,11 +98,11 @@ class VectorSpaceCUDA { // It is the client's responsibility to make sure that p has at least // Impl::MinSize(num_qubits) elements. static Vector Create(fp_type* p, unsigned num_qubits) { - return Vector{Pointer{p, &cuda_detail::do_not_free}, num_qubits}; + return Vector{Pointer{p, &detail::do_not_free}, num_qubits}; } static Vector Null() { - return Vector{Pointer{nullptr, &cuda_detail::free}, 0}; + return Vector{Pointer{nullptr, &detail::free}, 0}; } static void Free(fp_type* ptr) { From 663226094855250be42e0313d9054343d5bb2376 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Sat, 14 Feb 2026 01:04:09 +0530 Subject: [PATCH 04/10] Refactor memory management namespace and functions requested changes are done! 
--- lib/vectorspace_cuda.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h index d9ccff76e..f3652a527 100644 --- a/lib/vectorspace_cuda.h +++ b/lib/vectorspace_cuda.h @@ -105,6 +105,10 @@ class VectorSpaceCUDA { return Vector{Pointer{nullptr, &detail::free}, 0}; } + static bool IsNull(const Vector& vector) { + return vector.get() == nullptr; + } + static void Free(fp_type* ptr) { detail::free(ptr); } From 93b4aa2aebbb56a691980fa5954c94f4afc76403 Mon Sep 17 00:00:00 2001 From: sun-9545sunoj Date: Tue, 10 Mar 2026 01:12:49 +0530 Subject: [PATCH 05/10] Requested changes --- lib/simulator_cuda.h | 58 ++++++++++------------ lib/simulator_cuda_kernels.h | 25 ++++------ lib/statespace_cuda.h | 92 +++++++++++++++-------------------- lib/statespace_cuda_kernels.h | 31 ++++++------ lib/util_cuda.h | 19 ++++++++ lib/vectorspace_cuda.h | 6 +-- 6 files changed, 111 insertions(+), 120 deletions(-) diff --git a/lib/simulator_cuda.h b/lib/simulator_cuda.h index de9106bcf..3772834fa 100644 --- a/lib/simulator_cuda.h +++ b/lib/simulator_cuda.h @@ -344,14 +344,14 @@ class SimulatorCUDA final { unsigned k = 5 + G; unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned threads = 64U; - unsigned blocks = std::max(1U, size / 2); + uint64_t blocks = std::max(uint64_t{1}, size / 2); IndicesH d_i(d_ws); ApplyGateH_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, state.get()); + (fp_type*) d_ws, d_i.xss, d_i.ms, state.get()); } template @@ -368,14 +368,14 @@ class SimulatorCUDA final { unsigned k = 5 + num_effective_qs; unsigned n = num_qubits > k ? 
num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned threads = 32; - unsigned blocks = size; + uint64_t blocks = size; IndicesL d_i(d_ws); ApplyGateL_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, 1 << num_effective_qs, state.get()); } @@ -401,14 +401,14 @@ class SimulatorCUDA final { unsigned k = 5 + G + cqs.size(); unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned threads = 64U; - unsigned blocks = std::max(1U, size / 2); + uint64_t blocks = std::max(uint64_t{1}, size / 2); IndicesH d_i(d_ws); ApplyControlledGateH_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); + (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); } template @@ -426,15 +426,15 @@ class SimulatorCUDA final { unsigned k = 5 + G + cqs.size(); unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned threads = 32; - unsigned blocks = size; + uint64_t blocks = size; IndicesL d_i(d_ws); ApplyControlledGateLH_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d.num_aqs + 1, - d.cvalsh, 1 << d.num_effective_qs, state.get()); + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get()); } template @@ -452,14 +452,14 @@ class SimulatorCUDA final { unsigned k = 5 + G + cqs.size(); unsigned n = num_qubits > k ? 
num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned threads = 32; - unsigned blocks = size; + uint64_t blocks = size; IndicesLC d_i(d_ws); ApplyControlledGateL_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, 1 << (5 - d.remaining_low_cqs), state.get()); } @@ -483,7 +483,7 @@ class SimulatorCUDA final { unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U); unsigned threads = 64U; - unsigned blocks = std::max(1U, (size / 2) >> s); + uint64_t blocks = std::max(uint64_t{1}, (uint64_t{size} / 2) >> s); unsigned num_iterations_per_block = 1 << s; constexpr unsigned m = 16; @@ -494,8 +494,8 @@ class SimulatorCUDA final { IndicesH d_i(d_ws); ExpectationValueH_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, num_iterations_per_block, state.get(), - Plus(), d_res1); + (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block, + state.get(), Plus(), d_res1); double mul = size == 1 ? 0.5 : 1.0; @@ -521,7 +521,7 @@ class SimulatorCUDA final { unsigned s = std::min(n >= 13 ? 
n - 13 : 0, 5U); unsigned threads = 32; - unsigned blocks = size >> s; + uint64_t blocks = uint64_t{size} >> s; unsigned num_iterations_per_block = 1 << s; constexpr unsigned m = 16; @@ -532,7 +532,7 @@ class SimulatorCUDA final { IndicesL d_i(d_ws); ExpectationValueL_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, num_iterations_per_block, state.get(), Plus(), d_res1); double mul = double(1 << (5 + num_effective_qs - G)) / 32; @@ -542,7 +542,7 @@ class SimulatorCUDA final { template std::complex ExpectationValueReduceFinal( - unsigned blocks, double mul, + uint64_t blocks, double mul, const Complex* d_res1, Complex* d_res2) const { Complex res2[m]; @@ -550,10 +550,10 @@ class SimulatorCUDA final { ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex), cudaMemcpyDeviceToHost)); } else { - unsigned threads2 = std::min(1024U, blocks); - unsigned blocks2 = std::min(m, blocks / threads2); + unsigned threads2 = std::min(uint64_t{1024}, blocks); + uint64_t blocks2 = std::min(uint64_t{m}, blocks / threads2); - unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2)); + unsigned dblocks = std::max(uint64_t{1}, blocks / (blocks2 * threads2)); unsigned bytes = threads2 * sizeof(Complex); Reduce2Kernel<<>>( @@ -568,7 +568,7 @@ class SimulatorCUDA final { double re = 0; double im = 0; - for (unsigned i = 0; i < blocks; ++i) { + for (uint64_t i = 0; i < blocks; ++i) { re += res2[i].re; im += res2[i].im; } @@ -895,12 +895,6 @@ class SimulatorCUDA final { return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs}; } - static dim3 CreateGrid(uint64_t blocks) { - if (blocks <= 65535) return dim3(blocks); - uint32_t x = 65535; - uint32_t y = (blocks + x - 1) / x; - return dim3(x, y); - } void* AllocScratch(uint64_t size) const { if (size > scratch_size_) { diff --git a/lib/simulator_cuda_kernels.h b/lib/simulator_cuda_kernels.h index cfc9419b1..d39661c56 100644 --- a/lib/simulator_cuda_kernels.h 
+++ b/lib/simulator_cuda_kernels.h @@ -18,13 +18,13 @@ #ifdef __NVCC__ #include #include - - #include "util_cuda.h" #elif __HIP__ #include #include "cuda2hip.h" #endif +#include "util_cuda.h" + namespace qsim { template @@ -33,8 +33,7 @@ __global__ void ApplyGateH_Kernel( const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) { // blockDim.x must be equal to 64. - uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); @@ -118,8 +117,7 @@ __global__ void ApplyGateL_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. - uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); @@ -210,8 +208,7 @@ __global__ void ApplyControlledGateH_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 64. - uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); @@ -297,8 +294,7 @@ __global__ void ApplyControlledGateLH_Kernel( unsigned esize, fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. - uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); @@ -393,8 +389,7 @@ __global__ void ApplyControlledGateL_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. 
- uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); @@ -492,8 +487,7 @@ __global__ void ExpectationValueH_Kernel( const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { // blockDim.x must be equal to 64. - uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); @@ -605,8 +599,7 @@ __global__ void ExpectationValueL_Kernel( const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { // blockDim.x must be equal to 32. - uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h index 865ea006b..8f76fe25e 100644 --- a/lib/statespace_cuda.h +++ b/lib/statespace_cuda.h @@ -49,10 +49,10 @@ class StateSpaceCUDA : struct Grid { unsigned threads; unsigned dblocks; - unsigned blocks; - dim3 device_grid; + dim3 blocks; }; + public: using State = typename Base::State; using fp_type = typename Base::fp_type; @@ -83,25 +83,14 @@ class StateSpaceCUDA : return std::max(uint64_t{64}, 2 * (uint64_t{1} << num_qubits)); }; - bool IsNull(const State& state) const { return state.get() == nullptr; } - - void DeviceSync() const { ErrorCheck(cudaDeviceSynchronize()); } - - void Copy(const State& src, State& dest) const { - ErrorCheck(cudaMemcpy( - dest.get(), src.get(), MinSize(src.num_qubits()) * sizeof(fp_type), - cudaMemcpyDeviceToDevice)); - } - void InternalToNormalOrder(State& state) const { uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, 
uint64_t{param_.num_threads}); - unsigned blocks = size / threads; + uint64_t blocks = size / threads; unsigned bytes = 2 * threads * sizeof(fp_type); - InternalToNormalOrderKernel<<>>( - state.get()); + InternalToNormalOrderKernel<<>>(state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -110,11 +99,10 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; + uint64_t blocks = size / threads; unsigned bytes = 2 * threads * sizeof(fp_type); - NormalToInternalOrderKernel<<>>( - state.get()); + NormalToInternalOrderKernel<<>>(state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -130,12 +118,11 @@ class StateSpaceCUDA : uint64_t hsize = uint64_t{1} << state.num_qubits(); unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; + uint64_t blocks = size / threads; fp_type v = double{1} / std::sqrt(hsize); - SetStateUniformKernel<<>>( - v, hsize, state.get()); + SetStateUniformKernel<<>>(v, hsize, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -192,7 +179,7 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; + uint64_t blocks = size / threads; BulkSetAmplKernel<<>>( mask, bits, re, im, exclude, state.get()); @@ -209,7 +196,7 @@ class StateSpaceCUDA : uint64_t size = MinSize(src.num_qubits()); unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; + uint64_t blocks = size / threads; AddKernel<<>>(src.get(), dest.get()); ErrorCheck(cudaPeekAtLastError()); @@ -223,7 +210,7 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()); unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = 
size / threads; + uint64_t blocks = size / threads; MultiplyKernel<<>>(a, state.get()); ErrorCheck(cudaPeekAtLastError()); @@ -262,38 +249,40 @@ class StateSpaceCUDA : if (num_samples > 0) { Grid g1 = GetGrid1(MinSize(state.num_qubits()) / 2); unsigned bytes = g1.threads * sizeof(double); + uint64_t num_blocks1 = + MinSize(state.num_qubits()) / 2 / (g1.threads * g1.dblocks); - unsigned scratch_size = (g1.blocks + 1) * sizeof(double) + unsigned scratch_size = (num_blocks1 + 1) * sizeof(double) + num_samples * (sizeof(uint64_t) + sizeof(DistrRealType)); void* scratch = AllocScratch(scratch_size); double* d_res2 = (double*) scratch; double* d_res1 = d_res2 + 1; - uint64_t* d_bitstrings = (uint64_t*) (d_res1 + g1.blocks); + uint64_t* d_bitstrings = (uint64_t*) (d_res1 + num_blocks1); DistrRealType* d_rs = (DistrRealType *) (d_bitstrings + num_samples); auto op1 = RealProduct(); auto op2 = Plus(); - Reduce1Kernel<<>>( + Reduce1Kernel<<>>( g1.dblocks, op1, op2, op2, state.get(), state.get(), d_res1); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); double norm; - if (g1.blocks == 1) { + if (num_blocks1 == 1) { ErrorCheck( cudaMemcpy(&norm, d_res1, sizeof(double), cudaMemcpyDeviceToHost)); } else { - Grid g2 = GetGrid2(g1.blocks); + Grid g2 = GetGrid2(num_blocks1); unsigned bytes = g2.threads * sizeof(double); auto op3 = Plus(); - Reduce2Kernel<<>>( - g2.dblocks, g1.blocks, op3, op3, d_res1, d_res2); + Reduce2Kernel<<>>( + g2.dblocks, num_blocks1, op3, op3, d_res1, d_res2); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -308,7 +297,7 @@ class StateSpaceCUDA : num_samples * sizeof(DistrRealType), cudaMemcpyHostToDevice)); - SampleKernel<<<1, g1.threads>>>(g1.blocks, g1.dblocks, num_samples, + SampleKernel<<<1, g1.threads>>>(num_blocks1, g1.dblocks, num_samples, d_rs, d_res1, state.get(), d_bitstrings); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -333,18 +322,19 @@ class StateSpaceCUDA : 
uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; + uint64_t blocks = size / threads; - CollapseKernel<<>>( - mr.mask, mr.bits, renorm, state.get()); + CollapseKernel<<>>(mr.mask, mr.bits, renorm, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } std::vector PartialNorms(const State& state) const { Grid g = GetGrid1(MinSize(state.num_qubits()) / 2); + uint64_t num_blocks = + MinSize(state.num_qubits()) / 2 / (g.threads * g.dblocks); - unsigned scratch_size = g.blocks * sizeof(double); + unsigned scratch_size = num_blocks * sizeof(double); unsigned bytes = g.threads * sizeof(double); double* d_res = (double*) AllocScratch(scratch_size); @@ -352,12 +342,12 @@ class StateSpaceCUDA : auto op1 = RealProduct(); auto op2 = Plus(); - Reduce1Kernel<<>>( + Reduce1Kernel<<>>( g.dblocks, op1, op2, op2, state.get(), state.get(), d_res); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); - std::vector norms(g.blocks); + std::vector norms(num_blocks); ErrorCheck( cudaMemcpy(norms.data(), d_res, scratch_size, cudaMemcpyDeviceToHost)); @@ -402,23 +392,18 @@ class StateSpaceCUDA : Grid GetGrid1(uint64_t size) const { Grid grid; - grid.threads = std::min(size, uint64_t{param_.num_threads}); grid.dblocks = std::min(size / grid.threads, uint64_t{param_.num_dblocks}); - grid.blocks = size / (grid.threads * grid.dblocks); - grid.device_grid = CreateGrid(grid.blocks); - + uint64_t num_blocks = size / (grid.threads * grid.dblocks); + grid.blocks = CreateGrid(num_blocks); return grid; } Grid GetGrid2(unsigned size) const { Grid grid; - grid.threads = std::min(param_.num_threads, std::max(32U, size)); grid.dblocks = std::max(1U, size / grid.threads); - grid.blocks = 1; - grid.device_grid = dim3(1, 1, 1); - + grid.blocks = dim3(1, 1, 1); return grid; } @@ -434,8 +419,9 @@ class StateSpaceCUDA : Grid g1 = GetGrid1(size); unsigned bytes = 
g1.threads * sizeof(FP1); + uint64_t num_blocks1 = size / (g1.threads * g1.dblocks); - FP2* d_res2 = (FP2*) AllocScratch((g1.blocks + 1) * sizeof(FP2)); + FP2* d_res2 = (FP2*) AllocScratch((num_blocks1 + 1) * sizeof(FP2)); FP2* d_res1 = d_res2 + 1; auto op1 = Op(); @@ -443,10 +429,10 @@ class StateSpaceCUDA : auto op3 = Plus::type>(); if (mask == 0) { - Reduce1Kernel<<>>( + Reduce1Kernel<<>>( g1.dblocks, op1, op2, op3, state1.get(), state2.get(), d_res1); } else { - Reduce1MaskedKernel<<>>( + Reduce1MaskedKernel<<>>( g1.dblocks, mask, bits, op1, op2, op3, state1.get(), state2.get(), d_res1); } @@ -455,18 +441,18 @@ class StateSpaceCUDA : FP2 result; - if (g1.blocks == 1) { + if (num_blocks1 == 1) { ErrorCheck( cudaMemcpy(&result, d_res1, sizeof(FP2), cudaMemcpyDeviceToHost)); } else { - Grid g2 = GetGrid2(g1.blocks); + Grid g2 = GetGrid2(num_blocks1); unsigned bytes = g2.threads * sizeof(FP2); auto op2 = Plus(); auto op3 = Plus::type>(); - Reduce2Kernel<<>>( - g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2); + Reduce2Kernel<<>>( + g2.dblocks, num_blocks1, op2, op3, d_res1, d_res2); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); diff --git a/lib/statespace_cuda_kernels.h b/lib/statespace_cuda_kernels.h index 88ad89911..c762dfbb0 100644 --- a/lib/statespace_cuda_kernels.h +++ b/lib/statespace_cuda_kernels.h @@ -39,7 +39,7 @@ __device__ __forceinline__ FP1 BlockReduce1( unsigned warp = threadIdx.x / warp_size; unsigned lane = threadIdx.x % warp_size; - uint64_t k0 = 2 * n * qsim::GetBlockId() * blockDim.x + 2 * tid - lane; + uint64_t k0 = 2 * n * GetBlockId() * blockDim.x + 2 * tid - lane; uint64_t k1 = k0 + 2 * n * blockDim.x; FP1 r; @@ -88,7 +88,7 @@ __device__ __forceinline__ FP1 BlockReduce1Masked( unsigned warp = threadIdx.x / warp_size; unsigned lane = threadIdx.x % warp_size; - uint64_t k0 = 2 * n * qsim::GetBlockId() * blockDim.x + 2 * tid - lane; + uint64_t k0 = 2 * n * GetBlockId() * blockDim.x + 2 * tid - lane; uint64_t k1 = k0 + 2 
* n * blockDim.x; FP1 r = 0; @@ -137,7 +137,7 @@ __device__ __forceinline__ FP1 BlockReduce2( FP1* partial1 = (FP1*) shared; unsigned tid = threadIdx.x; - uint64_t k0 = n * qsim::GetBlockId() * blockDim.x + tid; + uint64_t k0 = n * GetBlockId() * blockDim.x + tid; uint64_t k1 = k0 + n * blockDim.x; FP1 r = 0; @@ -185,7 +185,7 @@ __global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3, FP1 sum = detail::BlockReduce1(n, op1, op2, op3, s1, s2); if (threadIdx.x == 0) { - result[qsim::GetBlockId()] = sum; + result[GetBlockId()] = sum; } } @@ -198,7 +198,7 @@ __global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits, detail::BlockReduce1Masked(n, mask, bits, op1, op2, op3, s1, s2); if (threadIdx.x == 0) { - result[qsim::GetBlockId()] = sum; + result[GetBlockId()] = sum; } } @@ -209,7 +209,7 @@ __global__ void Reduce2Kernel( FP1 sum = detail::BlockReduce2(n, size, op2, op3, s); if (threadIdx.x == 0) { - result[qsim::GetBlockId()] = sum; + result[GetBlockId()] = sum; } } @@ -217,7 +217,7 @@ template __global__ void InternalToNormalOrderKernel(FP* state) { unsigned lane = threadIdx.x % warp_size; unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * uint64_t{qsim::GetBlockId()} * blockDim.x + l; + uint64_t k = 2 * GetBlockId() * blockDim.x + l; extern __shared__ float shared[]; FP* buf = (FP*) shared; @@ -235,7 +235,7 @@ template __global__ void NormalToInternalOrderKernel(FP* state) { unsigned lane = threadIdx.x % warp_size; unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * uint64_t{qsim::GetBlockId()} * blockDim.x + l; + uint64_t k = 2 * GetBlockId() * blockDim.x + l; extern __shared__ float shared[]; FP* buf = (FP*) shared; @@ -252,8 +252,7 @@ __global__ void NormalToInternalOrderKernel(FP* state) { template __global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { unsigned lane = threadIdx.x % warp_size; - uint64_t k = - 2 * (uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x) - lane; + uint64_t k = 2 * 
(GetBlockId() * blockDim.x + threadIdx.x) - lane; state[k] = lane < size ? v : 0; state[k + warp_size] = 0; @@ -261,19 +260,19 @@ __global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { template __global__ void AddKernel(const FP* state1, FP* state2) { - uint64_t k = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; + uint64_t k = GetBlockId() * blockDim.x + threadIdx.x; state2[k] += state1[k]; } template __global__ void MultiplyKernel(FP a, FP* state) { - uint64_t k = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; + uint64_t k = GetBlockId() * blockDim.x + threadIdx.x; state[k] *= a; } template __global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { - uint64_t k1 = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; + uint64_t k1 = GetBlockId() * blockDim.x + threadIdx.x; uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; if ((k1 & mask) == bits) { @@ -288,7 +287,7 @@ __global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { template __global__ void BulkSetAmplKernel( uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) { - uint64_t k1 = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; + uint64_t k1 = GetBlockId() * blockDim.x + threadIdx.x; uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; bool set = ((k1 & mask) == bits) ^ exclude; @@ -300,7 +299,7 @@ __global__ void BulkSetAmplKernel( } template -__global__ void SampleKernel(unsigned num_blocks, +__global__ void SampleKernel(uint64_t num_blocks, uint64_t n, uint64_t num_samples, const FP1* rs, const FP2* ps, const FP3* state, uint64_t *bitstrings) { @@ -309,7 +308,7 @@ __global__ void SampleKernel(unsigned num_blocks, uint64_t m = 0; double csum = 0; - for (unsigned block_id = 0; block_id < num_blocks; ++block_id) { + for (uint64_t block_id = 0; block_id < num_blocks; ++block_id) { uint64_t km = n * blockDim.x; uint64_t k0 = block_id * km; diff --git a/lib/util_cuda.h b/lib/util_cuda.h index 
b34292753..a7880a527 100644 --- a/lib/util_cuda.h +++ b/lib/util_cuda.h @@ -130,6 +130,25 @@ __device__ __forceinline__ FP1 WarpReduce(FP1 val, Op op) { return val; } +__device__ __forceinline__ uint64_t GetBlockId() { + return (uint64_t{blockIdx.z} * gridDim.y + uint64_t{blockIdx.y}) + * gridDim.x + blockIdx.x; +} + +inline dim3 CreateGrid(uint64_t blocks) { + if (blocks <= 65535) { + return dim3((uint32_t)blocks); + } + uint32_t x = 65535; + uint64_t rem = (blocks + x - 1) / x; + if (rem <= 65535) { + return dim3(x, (uint32_t)rem); + } + uint32_t y = 65535; + uint32_t z = (uint32_t)((rem + y - 1) / y); + return dim3(x, y, z); +} + template __device__ __forceinline__ Complex WarpReduce(Complex val, Op op) { for (unsigned i = warp_size / 2; i > 0; i /= 2) { diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h index f3652a527..f1efdd51d 100644 --- a/lib/vectorspace_cuda.h +++ b/lib/vectorspace_cuda.h @@ -86,10 +86,10 @@ class VectorSpaceCUDA { static Vector Create(unsigned num_qubits) { fp_type* p; - uint64_t size = uint64_t{sizeof(fp_type)} * Impl::MinSize(num_qubits); + uint64_t size = sizeof(fp_type) * Impl::MinSize(num_qubits); auto rc = cudaMalloc(&p, size); if (rc == cudaSuccess) { - return Vector{Pointer{(fp_type*)p, &detail::free}, num_qubits}; + return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; } else { return Null(); } @@ -168,4 +168,4 @@ class VectorSpaceCUDA { } // namespace qsim -#endif // VECTORSPACE_CUDA_H_ +#endif // VECTORSPACE_CUDA_H_ \ No newline at end of file From 8be37484cd234fe9d26377b7a921bba74cc709c3 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Tue, 10 Mar 2026 01:14:21 +0530 Subject: [PATCH 06/10] Delete apps/qsimh_base_cuda.cu --- apps/qsimh_base_cuda.cu | 220 ---------------------------------------- 1 file changed, 220 deletions(-) delete mode 100644 apps/qsimh_base_cuda.cu diff --git a/apps/qsimh_base_cuda.cu b/apps/qsimh_base_cuda.cu deleted file mode 100644 index 
951ad7688..000000000 --- a/apps/qsimh_base_cuda.cu +++ /dev/null @@ -1,220 +0,0 @@ -// Copyright 2026 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "../lib/bitstring.h" -#include "../lib/circuit_qsim_parser.h" -#include "../lib/formux.h" -#include "../lib/fuser_basic.h" -#include "../lib/gates_qsim.h" -#include "../lib/io_file.h" -#include "../lib/run_qsimh.h" -#include "../lib/simmux.h" -#include "../lib/simulator_cuda.h" -#include "../lib/util.h" -#include "../lib/util_cpu.h" - -constexpr char usage[] = - "usage:\n ./qsimh_base_cuda.x -c circuit_file " - "-d maximum_time -k part1_qubits " - "-w prefix -p num_prefix_gates -r num_root_gates " - "-t num_threads -n num_dblocks -v verbosity -z\n"; - -struct Options { - std::string circuit_file; - std::vector part1; - uint64_t prefix; - unsigned maxtime = std::numeric_limits::max(); - unsigned num_prefix_gatexs = 0; - unsigned num_root_gatexs = 0; - unsigned num_threads = 256; - unsigned num_dblocks = 16; - unsigned verbosity = 0; - bool denormals_are_zeros = false; -}; - -Options GetOptions(int argc, char* argv[]) { - Options opt; - - int k; - - auto to_int = [](const std::string& word) -> unsigned { - return std::stoul(word); - }; - - while ((k = getopt(argc, argv, "c:d:k:w:p:r:t:n:v:z")) != -1) { - switch (k) { - case 'c': - opt.circuit_file = optarg; - break; - case 'd': - opt.maxtime = 
std::stoul(optarg); - break; - case 'k': - qsim::SplitString(optarg, ',', to_int, opt.part1); - break; - case 'w': - opt.prefix = std::stoull(optarg); - break; - case 'p': - opt.num_prefix_gatexs = std::stoul(optarg); - break; - case 'r': - opt.num_root_gatexs = std::stoul(optarg); - break; - case 't': - opt.num_threads = std::stoul(optarg); - break; - case 'n': - opt.num_dblocks = std::stoul(optarg); - break; - case 'v': - opt.verbosity = std::stoul(optarg); - break; - case 'z': - opt.denormals_are_zeros = true; - break; - default: - qsim::IO::errorf(usage); - exit(1); - } - } - - return opt; -} - -bool ValidateOptions(const Options& opt) { - if (opt.circuit_file.empty()) { - qsim::IO::errorf("circuit file is not provided.\n"); - qsim::IO::errorf(usage); - return false; - } - - return true; -} - -bool ValidatePart1(unsigned num_qubits, const std::vector& part1) { - for (std::size_t i = 0; i < part1.size(); ++i) { - if (part1[i] >= num_qubits) { - qsim::IO::errorf("part 1 qubit indices are too large.\n"); - return false; - } - } - - return true; -} - -std::vector GetParts( - unsigned num_qubits, const std::vector& part1) { - std::vector parts(num_qubits, 0); - - for (std::size_t i = 0; i < part1.size(); ++i) { - parts[part1[i]] = 1; - } - - return parts; -} - -int main(int argc, char* argv[]) { - using namespace qsim; - - auto opt = GetOptions(argc, argv); - if (!ValidateOptions(opt)) { - return 1; - } - - Circuit> circuit; - if (!CircuitQsimParser::FromFile( - opt.maxtime, opt.circuit_file, circuit)) { - return 1; - } - - if (!ValidatePart1(circuit.num_qubits, opt.part1)) { - return 1; - } - auto parts = GetParts(circuit.num_qubits, opt.part1); - - if (opt.denormals_are_zeros) { - SetFlushToZeroAndDenormalsAreZeros(); - } - - uint64_t num_bitstrings = - std::min(uint64_t{8}, uint64_t{1} << circuit.num_qubits); - - std::vector bitstrings; - bitstrings.reserve(num_bitstrings); - for (std::size_t i = 0; i < num_bitstrings; ++i) { - bitstrings.push_back(i); - } - - 
struct Factory { - using Simulator = qsim::SimulatorCUDA; - using StateSpace = Simulator::StateSpace; - using fp_type = Simulator::fp_type; - - Factory(const StateSpace::Parameter& param) : param(param) {} - - StateSpace CreateStateSpace() const { return StateSpace(param); } - - Simulator CreateSimulator() const { return Simulator(); } - - const StateSpace::Parameter& param; - }; - - using HybridSimulator = - HybridSimulator, BasicGateFuser, For>; - using Runner = QSimHRunner; - - Runner::Parameter param; - param.prefix = opt.prefix; - param.num_prefix_gatexs = opt.num_prefix_gatexs; - param.num_root_gatexs = opt.num_root_gatexs; - param.num_threads = - opt.num_threads; // This is reused for StateSpaceCUDA params implicitly - // if not careful, but here we separate. - param.verbosity = opt.verbosity; - - std::vector> results(num_bitstrings, 0); - - // Setup CUDA parameters - Factory::StateSpace::Parameter cuda_param; - cuda_param.num_threads = opt.num_threads; - cuda_param.num_dblocks = opt.num_dblocks; - - Factory factory(cuda_param); - - if (Runner::Run(param, factory, circuit, parts, bitstrings, results)) { - static constexpr char const* bits[8] = { - "000", "001", "010", "011", "100", "101", "110", "111", - }; - - unsigned s = 3 - std::min(unsigned{3}, circuit.num_qubits); - - for (std::size_t i = 0; i < num_bitstrings; ++i) { - const auto& a = results[i]; - qsim::IO::messagef( - "%s:%16.8g%16.8g%16.8g\n", bits[i] + s, std::real(a), std::imag(a), - std::norm(a)); - } - } - - return 0; -} From 6f1ac8d3585f7eb22d7493419c8c5a312b2af9e1 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:14:37 +0530 Subject: [PATCH 07/10] variables changes requested --- lib/simulator_cuda.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/simulator_cuda.h b/lib/simulator_cuda.h index 3772834fa..2a5e651e4 100644 --- a/lib/simulator_cuda.h +++ b/lib/simulator_cuda.h @@ -479,11 +479,11 @@ class 
SimulatorCUDA final { unsigned k = 5 + G; unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U); unsigned threads = 64U; - uint64_t blocks = std::max(uint64_t{1}, (uint64_t{size} / 2) >> s); + uint64_t blocks = std::max(uint64_t{1}, (size / 2) >> s); unsigned num_iterations_per_block = 1 << s; constexpr unsigned m = 16; @@ -517,11 +517,11 @@ class SimulatorCUDA final { unsigned k = 5 + num_effective_qs; unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned s = std::min(n >= 13 ? n - 13 : 0, 5U); unsigned threads = 32; - uint64_t blocks = uint64_t{size} >> s; + uint64_t blocks = size >> s; unsigned num_iterations_per_block = 1 << s; constexpr unsigned m = 16; From 3262c73b7dd4d187d393a344e733cd7d51837714 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:15:26 +0530 Subject: [PATCH 08/10] Remove unnecessary blank line in statespace_cuda.h --- lib/statespace_cuda.h | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h index 8f76fe25e..e0f3cfe03 100644 --- a/lib/statespace_cuda.h +++ b/lib/statespace_cuda.h @@ -52,7 +52,6 @@ class StateSpaceCUDA : dim3 blocks; }; - public: using State = typename Base::State; using fp_type = typename Base::fp_type; From 1008c97c904d6b5df38d4d791d1c3297da8c0477 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:29:19 +0530 Subject: [PATCH 09/10] Increase block limit in CreateGrid function --- lib/util_cuda.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/util_cuda.h b/lib/util_cuda.h index a7880a527..514b054d8 100644 --- a/lib/util_cuda.h +++ b/lib/util_cuda.h @@ -136,16 +136,16 @@ __device__ __forceinline__ uint64_t GetBlockId() { } inline 
dim3 CreateGrid(uint64_t blocks) { - if (blocks <= 65535) { - return dim3((uint32_t)blocks); + if (blocks <= 65536) { + return dim3((uint32_t) blocks); } - uint32_t x = 65535; - uint64_t rem = (blocks + x - 1) / x; - if (rem <= 65535) { - return dim3(x, (uint32_t)rem); + uint32_t x = 65536; + uint64_t rem = blocks / x; + if (rem <= 65536) { + return dim3(x, (uint32_t) rem); } - uint32_t y = 65535; - uint32_t z = (uint32_t)((rem + y - 1) / y); + uint32_t y = 65536; + uint32_t z = (uint32_t) (rem / y); return dim3(x, y, z); } From 28a6022f58ec834c712184376cbdab8027f8bba9 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Wed, 18 Mar 2026 09:06:44 +0530 Subject: [PATCH 10/10] Adjust block size limits in util_cuda.h --- lib/util_cuda.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/util_cuda.h b/lib/util_cuda.h index 514b054d8..004ddc11b 100644 --- a/lib/util_cuda.h +++ b/lib/util_cuda.h @@ -141,10 +141,10 @@ inline dim3 CreateGrid(uint64_t blocks) { } uint32_t x = 65536; uint64_t rem = blocks / x; - if (rem <= 65536) { + if (rem <= 32768) { return dim3(x, (uint32_t) rem); } - uint32_t y = 65536; + uint32_t y = 32768; uint32_t z = (uint32_t) (rem / y); return dim3(x, y, z); }