From 4c272a15a8690aa053cc12f5d29babda0bab0b68 Mon Sep 17 00:00:00 2001 From: sun-9545sunoj Date: Fri, 13 Feb 2026 09:29:52 +0000 Subject: [PATCH 01/10] fix: resolve #993 - remove formatting noise and isolate MI300X scaling logic --- apps/qsimh_base_cuda.cu | 220 ++++++++++++++++++++++++++++++++++ lib/simulator_cuda.h | 38 +++--- lib/simulator_cuda_kernels.h | 43 +++++-- lib/statespace_cuda.h | 43 +++++-- lib/statespace_cuda_kernels.h | 27 +++-- lib/vectorspace_cuda.h | 47 +++++--- 6 files changed, 349 insertions(+), 69 deletions(-) create mode 100644 apps/qsimh_base_cuda.cu diff --git a/apps/qsimh_base_cuda.cu b/apps/qsimh_base_cuda.cu new file mode 100644 index 000000000..e82528212 --- /dev/null +++ b/apps/qsimh_base_cuda.cu @@ -0,0 +1,220 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include +#include + +#include "../lib/bitstring.h" +#include "../lib/circuit_qsim_parser.h" +#include "../lib/formux.h" +#include "../lib/fuser_basic.h" +#include "../lib/gates_qsim.h" +#include "../lib/io_file.h" +#include "../lib/run_qsimh.h" +#include "../lib/simmux.h" +#include "../lib/simulator_cuda.h" +#include "../lib/util.h" +#include "../lib/util_cpu.h" + +constexpr char usage[] = + "usage:\n ./qsimh_base_hip.x -c circuit_file " + "-d maximum_time -k part1_qubits " + "-w prefix -p num_prefix_gates -r num_root_gates " + "-t num_threads -n num_dblocks -v verbosity -z\n"; + +struct Options { + std::string circuit_file; + std::vector part1; + uint64_t prefix; + unsigned maxtime = std::numeric_limits::max(); + unsigned num_prefix_gatexs = 0; + unsigned num_root_gatexs = 0; + unsigned num_threads = 256; + unsigned num_dblocks = 16; + unsigned verbosity = 0; + bool denormals_are_zeros = false; +}; + +Options GetOptions(int argc, char* argv[]) { + Options opt; + + int k; + + auto to_int = [](const std::string& word) -> unsigned { + return std::stoul(word); + }; + + while ((k = getopt(argc, argv, "c:d:k:w:p:r:t:n:v:z")) != -1) { + switch (k) { + case 'c': + opt.circuit_file = optarg; + break; + case 'd': + opt.maxtime = std::stoul(optarg); + break; + case 'k': + qsim::SplitString(optarg, ',', to_int, opt.part1); + break; + case 'w': + opt.prefix = std::stoull(optarg); + break; + case 'p': + opt.num_prefix_gatexs = std::stoul(optarg); + break; + case 'r': + opt.num_root_gatexs = std::stoul(optarg); + break; + case 't': + opt.num_threads = std::stoul(optarg); + break; + case 'n': + opt.num_dblocks = std::stoul(optarg); + break; + case 'v': + opt.verbosity = std::stoul(optarg); + break; + case 'z': + opt.denormals_are_zeros = true; + break; + default: + qsim::IO::errorf(usage); + exit(1); + } + } + + return opt; +} + +bool ValidateOptions(const Options& opt) { + if (opt.circuit_file.empty()) { + 
qsim::IO::errorf("circuit file is not provided.\n"); + qsim::IO::errorf(usage); + return false; + } + + return true; +} + +bool ValidatePart1(unsigned num_qubits, const std::vector& part1) { + for (std::size_t i = 0; i < part1.size(); ++i) { + if (part1[i] >= num_qubits) { + qsim::IO::errorf("part 1 qubit indices are too large.\n"); + return false; + } + } + + return true; +} + +std::vector GetParts( + unsigned num_qubits, const std::vector& part1) { + std::vector parts(num_qubits, 0); + + for (std::size_t i = 0; i < part1.size(); ++i) { + parts[part1[i]] = 1; + } + + return parts; +} + +int main(int argc, char* argv[]) { + using namespace qsim; + + auto opt = GetOptions(argc, argv); + if (!ValidateOptions(opt)) { + return 1; + } + + Circuit> circuit; + if (!CircuitQsimParser::FromFile( + opt.maxtime, opt.circuit_file, circuit)) { + return 1; + } + + if (!ValidatePart1(circuit.num_qubits, opt.part1)) { + return 1; + } + auto parts = GetParts(circuit.num_qubits, opt.part1); + + if (opt.denormals_are_zeros) { + SetFlushToZeroAndDenormalsAreZeros(); + } + + uint64_t num_bitstrings = + std::min(uint64_t{8}, uint64_t{1} << circuit.num_qubits); + + std::vector bitstrings; + bitstrings.reserve(num_bitstrings); + for (std::size_t i = 0; i < num_bitstrings; ++i) { + bitstrings.push_back(i); + } + + struct Factory { + using Simulator = qsim::SimulatorCUDA; + using StateSpace = Simulator::StateSpace; + using fp_type = Simulator::fp_type; + + Factory(const StateSpace::Parameter& param) : param(param) {} + + StateSpace CreateStateSpace() const { return StateSpace(param); } + + Simulator CreateSimulator() const { return Simulator(); } + + const StateSpace::Parameter& param; + }; + + using HybridSimulator = + HybridSimulator, BasicGateFuser, For>; + using Runner = QSimHRunner; + + Runner::Parameter param; + param.prefix = opt.prefix; + param.num_prefix_gatexs = opt.num_prefix_gatexs; + param.num_root_gatexs = opt.num_root_gatexs; + param.num_threads = + opt.num_threads; // This 
is reused for StateSpaceCUDA params implicitly + // if not careful, but here we separate. + param.verbosity = opt.verbosity; + + std::vector> results(num_bitstrings, 0); + + // Setup CUDA parameters + Factory::StateSpace::Parameter cuda_param; + cuda_param.num_threads = opt.num_threads; + cuda_param.num_dblocks = opt.num_dblocks; + + Factory factory(cuda_param); + + if (Runner::Run(param, factory, circuit, parts, bitstrings, results)) { + static constexpr char const* bits[8] = { + "000", "001", "010", "011", "100", "101", "110", "111", + }; + + unsigned s = 3 - std::min(unsigned{3}, circuit.num_qubits); + + for (std::size_t i = 0; i < num_bitstrings; ++i) { + const auto& a = results[i]; + qsim::IO::messagef( + "%s:%16.8g%16.8g%16.8g\n", bits[i] + s, std::real(a), std::imag(a), + std::norm(a)); + } + } + + return 0; +} diff --git a/lib/simulator_cuda.h b/lib/simulator_cuda.h index 5743bea8b..de9106bcf 100644 --- a/lib/simulator_cuda.h +++ b/lib/simulator_cuda.h @@ -350,8 +350,8 @@ class SimulatorCUDA final { IndicesH d_i(d_ws); - ApplyGateH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, state.get()); + ApplyGateH_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, state.get()); } template @@ -374,8 +374,8 @@ class SimulatorCUDA final { IndicesL d_i(d_ws); - ApplyGateL_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + ApplyGateL_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, 1 << num_effective_qs, state.get()); } @@ -407,8 +407,8 @@ class SimulatorCUDA final { IndicesH d_i(d_ws); - ApplyControlledGateH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); + ApplyControlledGateH_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); } template @@ -432,9 +432,9 @@ class SimulatorCUDA final { IndicesL d_i(d_ws); - ApplyControlledGateLH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, - d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get()); + 
ApplyControlledGateLH_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d.num_aqs + 1, + d.cvalsh, 1 << d.num_effective_qs, state.get()); } template @@ -458,8 +458,8 @@ class SimulatorCUDA final { IndicesLC d_i(d_ws); - ApplyControlledGateL_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, + ApplyControlledGateL_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, 1 << (5 - d.remaining_low_cqs), state.get()); } @@ -493,9 +493,9 @@ class SimulatorCUDA final { IndicesH d_i(d_ws); - ExpectationValueH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block, - state.get(), Plus(), d_res1); + ExpectationValueH_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, num_iterations_per_block, state.get(), + Plus(), d_res1); double mul = size == 1 ? 0.5 : 1.0; @@ -531,8 +531,8 @@ class SimulatorCUDA final { IndicesL d_i(d_ws); - ExpectationValueL_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + ExpectationValueL_Kernel<<>>( + (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, num_iterations_per_block, state.get(), Plus(), d_res1); double mul = double(1 << (5 + num_effective_qs - G)) / 32; @@ -895,6 +895,12 @@ class SimulatorCUDA final { return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs}; } + static dim3 CreateGrid(uint64_t blocks) { + if (blocks <= 65535) return dim3(blocks); + uint32_t x = 65535; + uint32_t y = (blocks + x - 1) / x; + return dim3(x, y); + } void* AllocScratch(uint64_t size) const { if (size > scratch_size_) { diff --git a/lib/simulator_cuda_kernels.h b/lib/simulator_cuda_kernels.h index e21a9d62e..cfc9419b1 100644 --- a/lib/simulator_cuda_kernels.h +++ b/lib/simulator_cuda_kernels.h @@ -33,6 +33,9 @@ __global__ void ApplyGateH_Kernel( const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) { // blockDim.x must be equal to 64. 
+ uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -61,7 +64,7 @@ __global__ void ApplyGateH_Kernel( __syncthreads(); - idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; + idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0; idx_type ii = i & mss[0]; for (unsigned j = 1; j <= G; ++j) { i *= 2; @@ -115,6 +118,9 @@ __global__ void ApplyGateL_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. + uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -137,7 +143,7 @@ __global__ void ApplyGateL_Kernel( } } - idx_type i = 32 * idx_type{blockIdx.x}; + idx_type i = 32 * idx_type{blockId}; idx_type ii = i & mss[0]; for (unsigned j = 1; j <= G; ++j) { i *= 2; @@ -204,6 +210,9 @@ __global__ void ApplyControlledGateH_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 64. + uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -232,7 +241,7 @@ __global__ void ApplyControlledGateH_Kernel( __syncthreads(); - idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; + idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0; idx_type ii = i & mss[0]; for (unsigned j = 1; j < num_mss; ++j) { i *= 2; @@ -288,6 +297,9 @@ __global__ void ApplyControlledGateLH_Kernel( unsigned esize, fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. 
+ uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -300,7 +312,7 @@ __global__ void ApplyControlledGateLH_Kernel( __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; __shared__ fp_type v[2 * gsize * rows]; - idx_type i = 32 * idx_type{blockIdx.x}; + idx_type i = 32 * idx_type{blockId}; idx_type ii = i & mss[0]; for (unsigned j = 1; j < num_mss; ++j) { i *= 2; @@ -381,6 +393,9 @@ __global__ void ApplyControlledGateL_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. + uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -393,7 +408,7 @@ __global__ void ApplyControlledGateL_Kernel( __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; __shared__ fp_type v[2 * gsize * rows]; - idx_type i = 32 * idx_type{blockIdx.x}; + idx_type i = 32 * idx_type{blockId}; idx_type ii = i & mss[0]; for (unsigned j = 1; j < num_mss; ++j) { i *= 2; @@ -477,6 +492,9 @@ __global__ void ExpectationValueH_Kernel( const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { // blockDim.x must be equal to 64. 
+ uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -508,7 +526,7 @@ __global__ void ExpectationValueH_Kernel( double im = 0; for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) { - idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter; + idx_type b = num_iterations_per_block * idx_type{blockId} + iter; idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0; idx_type ii = i & mss[0]; @@ -573,8 +591,8 @@ __global__ void ExpectationValueH_Kernel( __syncthreads(); if (threadIdx.x == 0) { - result[blockIdx.x].re = partial2[0].re + partial2[1].re; - result[blockIdx.x].im = partial2[0].im + partial2[1].im; + result[blockId].re = partial2[0].re + partial2[1].re; + result[blockId].im = partial2[0].im + partial2[1].im; } } @@ -587,6 +605,9 @@ __global__ void ExpectationValueL_Kernel( const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { // blockDim.x must be equal to 32. 
+ uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + + uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); constexpr unsigned gsize = 1 << G; @@ -612,7 +633,7 @@ __global__ void ExpectationValueL_Kernel( double im = 0; for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) { - idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter); + idx_type i = 32 * (num_iterations_per_block * idx_type{blockId} + iter); idx_type ii = i & mss[0]; for (unsigned j = 1; j <= G; ++j) { i *= 2; @@ -673,8 +694,8 @@ __global__ void ExpectationValueL_Kernel( auto val = WarpReduce(partial[threadIdx.x], op); if (threadIdx.x == 0) { - result[blockIdx.x].re = val.re; - result[blockIdx.x].im = val.im; + result[blockId].re = val.re; + result[blockId].im = val.im; } } diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h index 660db074c..865ea006b 100644 --- a/lib/statespace_cuda.h +++ b/lib/statespace_cuda.h @@ -50,6 +50,7 @@ class StateSpaceCUDA : unsigned threads; unsigned dblocks; unsigned blocks; + dim3 device_grid; }; public: @@ -82,6 +83,16 @@ class StateSpaceCUDA : return std::max(uint64_t{64}, 2 * (uint64_t{1} << num_qubits)); }; + bool IsNull(const State& state) const { return state.get() == nullptr; } + + void DeviceSync() const { ErrorCheck(cudaDeviceSynchronize()); } + + void Copy(const State& src, State& dest) const { + ErrorCheck(cudaMemcpy( + dest.get(), src.get(), MinSize(src.num_qubits()) * sizeof(fp_type), + cudaMemcpyDeviceToDevice)); + } + void InternalToNormalOrder(State& state) const { uint64_t size = MinSize(state.num_qubits()) / 2; @@ -89,7 +100,8 @@ class StateSpaceCUDA : unsigned blocks = size / threads; unsigned bytes = 2 * threads * sizeof(fp_type); - InternalToNormalOrderKernel<<>>(state.get()); + InternalToNormalOrderKernel<<>>( + state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -101,7 +113,8 @@ 
class StateSpaceCUDA : unsigned blocks = size / threads; unsigned bytes = 2 * threads * sizeof(fp_type); - NormalToInternalOrderKernel<<>>(state.get()); + NormalToInternalOrderKernel<<>>( + state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -121,7 +134,8 @@ class StateSpaceCUDA : fp_type v = double{1} / std::sqrt(hsize); - SetStateUniformKernel<<>>(v, hsize, state.get()); + SetStateUniformKernel<<>>( + v, hsize, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -180,7 +194,7 @@ class StateSpaceCUDA : unsigned threads = std::min(size, uint64_t{param_.num_threads}); unsigned blocks = size / threads; - BulkSetAmplKernel<<>>( + BulkSetAmplKernel<<>>( mask, bits, re, im, exclude, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -197,7 +211,7 @@ class StateSpaceCUDA : unsigned threads = std::min(size, uint64_t{param_.num_threads}); unsigned blocks = size / threads; - AddKernel<<>>(src.get(), dest.get()); + AddKernel<<>>(src.get(), dest.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -211,7 +225,7 @@ class StateSpaceCUDA : unsigned threads = std::min(size, uint64_t{param_.num_threads}); unsigned blocks = size / threads; - MultiplyKernel<<>>(a, state.get()); + MultiplyKernel<<>>(a, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -262,7 +276,7 @@ class StateSpaceCUDA : auto op1 = RealProduct(); auto op2 = Plus(); - Reduce1Kernel<<>>( + Reduce1Kernel<<>>( g1.dblocks, op1, op2, op2, state.get(), state.get(), d_res1); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -278,7 +292,7 @@ class StateSpaceCUDA : auto op3 = Plus(); - Reduce2Kernel<<>>( + Reduce2Kernel<<>>( g2.dblocks, g1.blocks, op3, op3, d_res1, d_res2); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -321,7 +335,8 @@ class StateSpaceCUDA : unsigned threads = 
std::min(size, uint64_t{param_.num_threads}); unsigned blocks = size / threads; - CollapseKernel<<>>(mr.mask, mr.bits, renorm, state.get()); + CollapseKernel<<>>( + mr.mask, mr.bits, renorm, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -337,7 +352,7 @@ class StateSpaceCUDA : auto op1 = RealProduct(); auto op2 = Plus(); - Reduce1Kernel<<>>( + Reduce1Kernel<<>>( g.dblocks, op1, op2, op2, state.get(), state.get(), d_res); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -391,6 +406,7 @@ class StateSpaceCUDA : grid.threads = std::min(size, uint64_t{param_.num_threads}); grid.dblocks = std::min(size / grid.threads, uint64_t{param_.num_dblocks}); grid.blocks = size / (grid.threads * grid.dblocks); + grid.device_grid = CreateGrid(grid.blocks); return grid; } @@ -401,6 +417,7 @@ class StateSpaceCUDA : grid.threads = std::min(param_.num_threads, std::max(32U, size)); grid.dblocks = std::max(1U, size / grid.threads); grid.blocks = 1; + grid.device_grid = dim3(1, 1, 1); return grid; } @@ -426,10 +443,10 @@ class StateSpaceCUDA : auto op3 = Plus::type>(); if (mask == 0) { - Reduce1Kernel<<>>( + Reduce1Kernel<<>>( g1.dblocks, op1, op2, op3, state1.get(), state2.get(), d_res1); } else { - Reduce1MaskedKernel<<>>( + Reduce1MaskedKernel<<>>( g1.dblocks, mask, bits, op1, op2, op3, state1.get(), state2.get(), d_res1); } @@ -448,7 +465,7 @@ class StateSpaceCUDA : auto op2 = Plus(); auto op3 = Plus::type>(); - Reduce2Kernel<<>>( + Reduce2Kernel<<>>( g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); diff --git a/lib/statespace_cuda_kernels.h b/lib/statespace_cuda_kernels.h index 0bc4ba706..88ad89911 100644 --- a/lib/statespace_cuda_kernels.h +++ b/lib/statespace_cuda_kernels.h @@ -39,7 +39,7 @@ __device__ __forceinline__ FP1 BlockReduce1( unsigned warp = threadIdx.x / warp_size; unsigned lane = threadIdx.x % warp_size; - uint64_t k0 = 2 * n * 
blockIdx.x * blockDim.x + 2 * tid - lane; + uint64_t k0 = 2 * n * qsim::GetBlockId() * blockDim.x + 2 * tid - lane; uint64_t k1 = k0 + 2 * n * blockDim.x; FP1 r; @@ -88,7 +88,7 @@ __device__ __forceinline__ FP1 BlockReduce1Masked( unsigned warp = threadIdx.x / warp_size; unsigned lane = threadIdx.x % warp_size; - uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; + uint64_t k0 = 2 * n * qsim::GetBlockId() * blockDim.x + 2 * tid - lane; uint64_t k1 = k0 + 2 * n * blockDim.x; FP1 r = 0; @@ -137,7 +137,7 @@ __device__ __forceinline__ FP1 BlockReduce2( FP1* partial1 = (FP1*) shared; unsigned tid = threadIdx.x; - uint64_t k0 = n * blockIdx.x * blockDim.x + tid; + uint64_t k0 = n * qsim::GetBlockId() * blockDim.x + tid; uint64_t k1 = k0 + n * blockDim.x; FP1 r = 0; @@ -185,7 +185,7 @@ __global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3, FP1 sum = detail::BlockReduce1(n, op1, op2, op3, s1, s2); if (threadIdx.x == 0) { - result[blockIdx.x] = sum; + result[qsim::GetBlockId()] = sum; } } @@ -198,7 +198,7 @@ __global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits, detail::BlockReduce1Masked(n, mask, bits, op1, op2, op3, s1, s2); if (threadIdx.x == 0) { - result[blockIdx.x] = sum; + result[qsim::GetBlockId()] = sum; } } @@ -209,7 +209,7 @@ __global__ void Reduce2Kernel( FP1 sum = detail::BlockReduce2(n, size, op2, op3, s); if (threadIdx.x == 0) { - result[blockIdx.x] = sum; + result[qsim::GetBlockId()] = sum; } } @@ -217,7 +217,7 @@ template __global__ void InternalToNormalOrderKernel(FP* state) { unsigned lane = threadIdx.x % warp_size; unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; + uint64_t k = 2 * uint64_t{qsim::GetBlockId()} * blockDim.x + l; extern __shared__ float shared[]; FP* buf = (FP*) shared; @@ -235,7 +235,7 @@ template __global__ void NormalToInternalOrderKernel(FP* state) { unsigned lane = threadIdx.x % warp_size; unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 
2 * uint64_t{blockIdx.x} * blockDim.x + l; + uint64_t k = 2 * uint64_t{qsim::GetBlockId()} * blockDim.x + l; extern __shared__ float shared[]; FP* buf = (FP*) shared; @@ -252,7 +252,8 @@ __global__ void NormalToInternalOrderKernel(FP* state) { template __global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { unsigned lane = threadIdx.x % warp_size; - uint64_t k = 2 * (uint64_t{blockIdx.x} * blockDim.x + threadIdx.x) - lane; + uint64_t k = + 2 * (uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x) - lane; state[k] = lane < size ? v : 0; state[k + warp_size] = 0; @@ -260,19 +261,19 @@ __global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { template __global__ void AddKernel(const FP* state1, FP* state2) { - uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + uint64_t k = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; state2[k] += state1[k]; } template __global__ void MultiplyKernel(FP a, FP* state) { - uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + uint64_t k = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; state[k] *= a; } template __global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { - uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + uint64_t k1 = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; if ((k1 & mask) == bits) { @@ -287,7 +288,7 @@ __global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { template __global__ void BulkSetAmplKernel( uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) { - uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + uint64_t k1 = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; bool set = ((k1 & mask) == bits) ^ exclude; diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h index 5cfd4e834..dd67661e2 100644 --- a/lib/vectorspace_cuda.h +++ 
b/lib/vectorspace_cuda.h @@ -28,24 +28,30 @@ namespace qsim { -namespace detail { +// Use a unique detail namespace to avoid collision with vectorspace.h +namespace cuda_detail { inline void do_not_free(void*) {} - inline void free(void* ptr) { - ErrorCheck(cudaFree(ptr)); + if (ptr != nullptr) { +#ifdef __NVCC__ + ErrorCheck(cudaFree(ptr)); +#elif __HIP__ + // Using the qsim ErrorCheck wrapper for HIP + ErrorCheck(hipFree(ptr)); +#endif + } } -} // namespace detail +} // namespace cuda_detail // Routines for vector manipulations. template class VectorSpaceCUDA { public: using fp_type = FP; - - private: - using Pointer = std::unique_ptr; + // Define Pointer with a clear function pointer type for the deleter + using Pointer = std::unique_ptr; public: class Vector { @@ -86,11 +92,24 @@ class VectorSpaceCUDA { static Vector Create(unsigned num_qubits) { fp_type* p; - auto size = sizeof(fp_type) * Impl::MinSize(num_qubits); + // Use explicit 64-bit calculation for large simulations (>30 qubits) to prevent overflow. + // Otherwise, keep original behavior to minimize impact. + uint64_t size; + if (num_qubits > 30) { + size = uint64_t{sizeof(fp_type)} * Impl::MinSize(num_qubits); + } else { + size = sizeof(fp_type) * Impl::MinSize(num_qubits); + } + +// Ensure we use the correct API based on the compiler +#ifdef __NVCC__ auto rc = cudaMalloc(&p, size); +#elif __HIP__ + auto rc = hipMalloc(&p, size); +#endif - if (rc == cudaSuccess) { - return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; + if (rc == 0) { // Success + return Vector{Pointer{(fp_type*)p, &cuda_detail::free}, num_qubits}; } else { return Null(); } @@ -99,15 +118,11 @@ class VectorSpaceCUDA { // It is the client's responsibility to make sure that p has at least // Impl::MinSize(num_qubits) elements. 
static Vector Create(fp_type* p, unsigned num_qubits) { - return Vector{Pointer{p, &detail::do_not_free}, num_qubits}; + return Vector{Pointer{p, &cuda_detail::do_not_free}, num_qubits}; } static Vector Null() { - return Vector{Pointer{nullptr, &detail::free}, 0}; - } - - static bool IsNull(const Vector& vector) { - return vector.get() == nullptr; + return Vector{Pointer{nullptr, &cuda_detail::free}, 0}; } static void Free(fp_type* ptr) { From 8edca1d99636231a4e07601fe734a4f597ff59e4 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Sat, 14 Feb 2026 00:30:07 +0530 Subject: [PATCH 02/10] Update copyright year and usage message for CUDA --- apps/qsimh_base_cuda.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/qsimh_base_cuda.cu b/apps/qsimh_base_cuda.cu index e82528212..951ad7688 100644 --- a/apps/qsimh_base_cuda.cu +++ b/apps/qsimh_base_cuda.cu @@ -1,4 +1,4 @@ -// Copyright 2019 Google LLC. All Rights Reserved. +// Copyright 2026 Google LLC. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ #include "../lib/util_cpu.h" constexpr char usage[] = - "usage:\n ./qsimh_base_hip.x -c circuit_file " + "usage:\n ./qsimh_base_cuda.x -c circuit_file " "-d maximum_time -k part1_qubits " "-w prefix -p num_prefix_gates -r num_root_gates " "-t num_threads -n num_dblocks -v verbosity -z\n"; From de169cc48de09b7e2824bf360943a60d24572527 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Sat, 14 Feb 2026 00:58:41 +0530 Subject: [PATCH 03/10] Refactor memory management namespace and functions requested changes are done! 
--- lib/vectorspace_cuda.h | 44 ++++++++++++------------------------------ 1 file changed, 12 insertions(+), 32 deletions(-) diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h index dd67661e2..d9ccff76e 100644 --- a/lib/vectorspace_cuda.h +++ b/lib/vectorspace_cuda.h @@ -28,30 +28,24 @@ namespace qsim { -// Use a unique detail namespace to avoid collision with vectorspace.h -namespace cuda_detail { +namespace detail { inline void do_not_free(void*) {} + inline void free(void* ptr) { - if (ptr != nullptr) { -#ifdef __NVCC__ - ErrorCheck(cudaFree(ptr)); -#elif __HIP__ - // Using the qsim ErrorCheck wrapper for HIP - ErrorCheck(hipFree(ptr)); -#endif - } + ErrorCheck(cudaFree(ptr)); } -} // namespace cuda_detail +} // namespace detail // Routines for vector manipulations. template class VectorSpaceCUDA { public: using fp_type = FP; - // Define Pointer with a clear function pointer type for the deleter - using Pointer = std::unique_ptr; + + private: + using Pointer = std::unique_ptr; public: class Vector { @@ -92,24 +86,10 @@ class VectorSpaceCUDA { static Vector Create(unsigned num_qubits) { fp_type* p; - // Use explicit 64-bit calculation for large simulations (>30 qubits) to prevent overflow. - // Otherwise, keep original behavior to minimize impact. 
- uint64_t size; - if (num_qubits > 30) { - size = uint64_t{sizeof(fp_type)} * Impl::MinSize(num_qubits); - } else { - size = sizeof(fp_type) * Impl::MinSize(num_qubits); - } - -// Ensure we use the correct API based on the compiler -#ifdef __NVCC__ + uint64_t size = uint64_t{sizeof(fp_type)} * Impl::MinSize(num_qubits); auto rc = cudaMalloc(&p, size); -#elif __HIP__ - auto rc = hipMalloc(&p, size); -#endif - - if (rc == 0) { // Success - return Vector{Pointer{(fp_type*)p, &cuda_detail::free}, num_qubits}; + if (rc == cudaSuccess) { + return Vector{Pointer{(fp_type*)p, &detail::free}, num_qubits}; } else { return Null(); } @@ -118,11 +98,11 @@ class VectorSpaceCUDA { // It is the client's responsibility to make sure that p has at least // Impl::MinSize(num_qubits) elements. static Vector Create(fp_type* p, unsigned num_qubits) { - return Vector{Pointer{p, &cuda_detail::do_not_free}, num_qubits}; + return Vector{Pointer{p, &detail::do_not_free}, num_qubits}; } static Vector Null() { - return Vector{Pointer{nullptr, &cuda_detail::free}, 0}; + return Vector{Pointer{nullptr, &detail::free}, 0}; } static void Free(fp_type* ptr) { From 663226094855250be42e0313d9054343d5bb2376 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Sat, 14 Feb 2026 01:04:09 +0530 Subject: [PATCH 04/10] Refactor memory management namespace and functions requested changes are done! 
--- lib/vectorspace_cuda.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h index d9ccff76e..f3652a527 100644 --- a/lib/vectorspace_cuda.h +++ b/lib/vectorspace_cuda.h @@ -105,6 +105,10 @@ class VectorSpaceCUDA { return Vector{Pointer{nullptr, &detail::free}, 0}; } + static bool IsNull(const Vector& vector) { + return vector.get() == nullptr; + } + static void Free(fp_type* ptr) { detail::free(ptr); } From 93b4aa2aebbb56a691980fa5954c94f4afc76403 Mon Sep 17 00:00:00 2001 From: sun-9545sunoj Date: Tue, 10 Mar 2026 01:12:49 +0530 Subject: [PATCH 05/10] Requested changes --- lib/simulator_cuda.h | 58 ++++++++++------------ lib/simulator_cuda_kernels.h | 25 ++++------ lib/statespace_cuda.h | 92 +++++++++++++++-------------------- lib/statespace_cuda_kernels.h | 31 ++++++------ lib/util_cuda.h | 19 ++++++++ lib/vectorspace_cuda.h | 6 +-- 6 files changed, 111 insertions(+), 120 deletions(-) diff --git a/lib/simulator_cuda.h b/lib/simulator_cuda.h index de9106bcf..3772834fa 100644 --- a/lib/simulator_cuda.h +++ b/lib/simulator_cuda.h @@ -344,14 +344,14 @@ class SimulatorCUDA final { unsigned k = 5 + G; unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned threads = 64U; - unsigned blocks = std::max(1U, size / 2); + uint64_t blocks = std::max(uint64_t{1}, size / 2); IndicesH d_i(d_ws); ApplyGateH_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, state.get()); + (fp_type*) d_ws, d_i.xss, d_i.ms, state.get()); } template @@ -368,14 +368,14 @@ class SimulatorCUDA final { unsigned k = 5 + num_effective_qs; unsigned n = num_qubits > k ? 
num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned threads = 32; - unsigned blocks = size; + uint64_t blocks = size; IndicesL d_i(d_ws); ApplyGateL_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, 1 << num_effective_qs, state.get()); } @@ -401,14 +401,14 @@ class SimulatorCUDA final { unsigned k = 5 + G + cqs.size(); unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned threads = 64U; - unsigned blocks = std::max(1U, size / 2); + uint64_t blocks = std::max(uint64_t{1}, size / 2); IndicesH d_i(d_ws); ApplyControlledGateH_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); + (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); } template @@ -426,15 +426,15 @@ class SimulatorCUDA final { unsigned k = 5 + G + cqs.size(); unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned threads = 32; - unsigned blocks = size; + uint64_t blocks = size; IndicesL d_i(d_ws); ApplyControlledGateLH_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d.num_aqs + 1, - d.cvalsh, 1 << d.num_effective_qs, state.get()); + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get()); } template @@ -452,14 +452,14 @@ class SimulatorCUDA final { unsigned k = 5 + G + cqs.size(); unsigned n = num_qubits > k ? 
num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned threads = 32; - unsigned blocks = size; + uint64_t blocks = size; IndicesLC d_i(d_ws); ApplyControlledGateL_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, 1 << (5 - d.remaining_low_cqs), state.get()); } @@ -483,7 +483,7 @@ class SimulatorCUDA final { unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U); unsigned threads = 64U; - unsigned blocks = std::max(1U, (size / 2) >> s); + uint64_t blocks = std::max(uint64_t{1}, (uint64_t{size} / 2) >> s); unsigned num_iterations_per_block = 1 << s; constexpr unsigned m = 16; @@ -494,8 +494,8 @@ class SimulatorCUDA final { IndicesH d_i(d_ws); ExpectationValueH_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, num_iterations_per_block, state.get(), - Plus(), d_res1); + (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block, + state.get(), Plus(), d_res1); double mul = size == 1 ? 0.5 : 1.0; @@ -521,7 +521,7 @@ class SimulatorCUDA final { unsigned s = std::min(n >= 13 ? 
n - 13 : 0, 5U); unsigned threads = 32; - unsigned blocks = size >> s; + uint64_t blocks = uint64_t{size} >> s; unsigned num_iterations_per_block = 1 << s; constexpr unsigned m = 16; @@ -532,7 +532,7 @@ class SimulatorCUDA final { IndicesL d_i(d_ws); ExpectationValueL_Kernel<<>>( - (fp_type*)d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, num_iterations_per_block, state.get(), Plus(), d_res1); double mul = double(1 << (5 + num_effective_qs - G)) / 32; @@ -542,7 +542,7 @@ class SimulatorCUDA final { template std::complex ExpectationValueReduceFinal( - unsigned blocks, double mul, + uint64_t blocks, double mul, const Complex* d_res1, Complex* d_res2) const { Complex res2[m]; @@ -550,10 +550,10 @@ class SimulatorCUDA final { ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex), cudaMemcpyDeviceToHost)); } else { - unsigned threads2 = std::min(1024U, blocks); - unsigned blocks2 = std::min(m, blocks / threads2); + unsigned threads2 = std::min(uint64_t{1024}, blocks); + uint64_t blocks2 = std::min(uint64_t{m}, blocks / threads2); - unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2)); + unsigned dblocks = std::max(uint64_t{1}, blocks / (blocks2 * threads2)); unsigned bytes = threads2 * sizeof(Complex); Reduce2Kernel<<>>( @@ -568,7 +568,7 @@ class SimulatorCUDA final { double re = 0; double im = 0; - for (unsigned i = 0; i < blocks; ++i) { + for (uint64_t i = 0; i < blocks; ++i) { re += res2[i].re; im += res2[i].im; } @@ -895,12 +895,6 @@ class SimulatorCUDA final { return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs}; } - static dim3 CreateGrid(uint64_t blocks) { - if (blocks <= 65535) return dim3(blocks); - uint32_t x = 65535; - uint32_t y = (blocks + x - 1) / x; - return dim3(x, y); - } void* AllocScratch(uint64_t size) const { if (size > scratch_size_) { diff --git a/lib/simulator_cuda_kernels.h b/lib/simulator_cuda_kernels.h index cfc9419b1..d39661c56 100644 --- a/lib/simulator_cuda_kernels.h 
+++ b/lib/simulator_cuda_kernels.h @@ -18,13 +18,13 @@ #ifdef __NVCC__ #include #include - - #include "util_cuda.h" #elif __HIP__ #include #include "cuda2hip.h" #endif +#include "util_cuda.h" + namespace qsim { template @@ -33,8 +33,7 @@ __global__ void ApplyGateH_Kernel( const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) { // blockDim.x must be equal to 64. - uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); @@ -118,8 +117,7 @@ __global__ void ApplyGateL_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. - uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); @@ -210,8 +208,7 @@ __global__ void ApplyControlledGateH_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 64. - uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); @@ -297,8 +294,7 @@ __global__ void ApplyControlledGateLH_Kernel( unsigned esize, fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. - uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); @@ -393,8 +389,7 @@ __global__ void ApplyControlledGateL_Kernel( fp_type* __restrict__ rstate) { // blockDim.x must be equal to 32. 
- uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); @@ -492,8 +487,7 @@ __global__ void ExpectationValueH_Kernel( const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { // blockDim.x must be equal to 64. - uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); @@ -605,8 +599,7 @@ __global__ void ExpectationValueL_Kernel( const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { // blockDim.x must be equal to 32. - uint64_t blockId = uint64_t{blockIdx.z} * gridDim.x * gridDim.y + - uint64_t{blockIdx.y} * gridDim.x + blockIdx.x; + uint64_t blockId = GetBlockId(); static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h index 865ea006b..8f76fe25e 100644 --- a/lib/statespace_cuda.h +++ b/lib/statespace_cuda.h @@ -49,10 +49,10 @@ class StateSpaceCUDA : struct Grid { unsigned threads; unsigned dblocks; - unsigned blocks; - dim3 device_grid; + dim3 blocks; }; + public: using State = typename Base::State; using fp_type = typename Base::fp_type; @@ -83,25 +83,14 @@ class StateSpaceCUDA : return std::max(uint64_t{64}, 2 * (uint64_t{1} << num_qubits)); }; - bool IsNull(const State& state) const { return state.get() == nullptr; } - - void DeviceSync() const { ErrorCheck(cudaDeviceSynchronize()); } - - void Copy(const State& src, State& dest) const { - ErrorCheck(cudaMemcpy( - dest.get(), src.get(), MinSize(src.num_qubits()) * sizeof(fp_type), - cudaMemcpyDeviceToDevice)); - } - void InternalToNormalOrder(State& state) const { uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, 
uint64_t{param_.num_threads}); - unsigned blocks = size / threads; + uint64_t blocks = size / threads; unsigned bytes = 2 * threads * sizeof(fp_type); - InternalToNormalOrderKernel<<>>( - state.get()); + InternalToNormalOrderKernel<<>>(state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -110,11 +99,10 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; + uint64_t blocks = size / threads; unsigned bytes = 2 * threads * sizeof(fp_type); - NormalToInternalOrderKernel<<>>( - state.get()); + NormalToInternalOrderKernel<<>>(state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -130,12 +118,11 @@ class StateSpaceCUDA : uint64_t hsize = uint64_t{1} << state.num_qubits(); unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; + uint64_t blocks = size / threads; fp_type v = double{1} / std::sqrt(hsize); - SetStateUniformKernel<<>>( - v, hsize, state.get()); + SetStateUniformKernel<<>>(v, hsize, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } @@ -192,7 +179,7 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; + uint64_t blocks = size / threads; BulkSetAmplKernel<<>>( mask, bits, re, im, exclude, state.get()); @@ -209,7 +196,7 @@ class StateSpaceCUDA : uint64_t size = MinSize(src.num_qubits()); unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; + uint64_t blocks = size / threads; AddKernel<<>>(src.get(), dest.get()); ErrorCheck(cudaPeekAtLastError()); @@ -223,7 +210,7 @@ class StateSpaceCUDA : uint64_t size = MinSize(state.num_qubits()); unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = 
size / threads; + uint64_t blocks = size / threads; MultiplyKernel<<>>(a, state.get()); ErrorCheck(cudaPeekAtLastError()); @@ -262,38 +249,40 @@ class StateSpaceCUDA : if (num_samples > 0) { Grid g1 = GetGrid1(MinSize(state.num_qubits()) / 2); unsigned bytes = g1.threads * sizeof(double); + uint64_t num_blocks1 = + MinSize(state.num_qubits()) / 2 / (g1.threads * g1.dblocks); - unsigned scratch_size = (g1.blocks + 1) * sizeof(double) + unsigned scratch_size = (num_blocks1 + 1) * sizeof(double) + num_samples * (sizeof(uint64_t) + sizeof(DistrRealType)); void* scratch = AllocScratch(scratch_size); double* d_res2 = (double*) scratch; double* d_res1 = d_res2 + 1; - uint64_t* d_bitstrings = (uint64_t*) (d_res1 + g1.blocks); + uint64_t* d_bitstrings = (uint64_t*) (d_res1 + num_blocks1); DistrRealType* d_rs = (DistrRealType *) (d_bitstrings + num_samples); auto op1 = RealProduct(); auto op2 = Plus(); - Reduce1Kernel<<>>( + Reduce1Kernel<<>>( g1.dblocks, op1, op2, op2, state.get(), state.get(), d_res1); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); double norm; - if (g1.blocks == 1) { + if (num_blocks1 == 1) { ErrorCheck( cudaMemcpy(&norm, d_res1, sizeof(double), cudaMemcpyDeviceToHost)); } else { - Grid g2 = GetGrid2(g1.blocks); + Grid g2 = GetGrid2(num_blocks1); unsigned bytes = g2.threads * sizeof(double); auto op3 = Plus(); - Reduce2Kernel<<>>( - g2.dblocks, g1.blocks, op3, op3, d_res1, d_res2); + Reduce2Kernel<<>>( + g2.dblocks, num_blocks1, op3, op3, d_res1, d_res2); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -308,7 +297,7 @@ class StateSpaceCUDA : num_samples * sizeof(DistrRealType), cudaMemcpyHostToDevice)); - SampleKernel<<<1, g1.threads>>>(g1.blocks, g1.dblocks, num_samples, + SampleKernel<<<1, g1.threads>>>(num_blocks1, g1.dblocks, num_samples, d_rs, d_res1, state.get(), d_bitstrings); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); @@ -333,18 +322,19 @@ class StateSpaceCUDA : 
uint64_t size = MinSize(state.num_qubits()) / 2; unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; + uint64_t blocks = size / threads; - CollapseKernel<<>>( - mr.mask, mr.bits, renorm, state.get()); + CollapseKernel<<>>(mr.mask, mr.bits, renorm, state.get()); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); } std::vector PartialNorms(const State& state) const { Grid g = GetGrid1(MinSize(state.num_qubits()) / 2); + uint64_t num_blocks = + MinSize(state.num_qubits()) / 2 / (g.threads * g.dblocks); - unsigned scratch_size = g.blocks * sizeof(double); + unsigned scratch_size = num_blocks * sizeof(double); unsigned bytes = g.threads * sizeof(double); double* d_res = (double*) AllocScratch(scratch_size); @@ -352,12 +342,12 @@ class StateSpaceCUDA : auto op1 = RealProduct(); auto op2 = Plus(); - Reduce1Kernel<<>>( + Reduce1Kernel<<>>( g.dblocks, op1, op2, op2, state.get(), state.get(), d_res); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); - std::vector norms(g.blocks); + std::vector norms(num_blocks); ErrorCheck( cudaMemcpy(norms.data(), d_res, scratch_size, cudaMemcpyDeviceToHost)); @@ -402,23 +392,18 @@ class StateSpaceCUDA : Grid GetGrid1(uint64_t size) const { Grid grid; - grid.threads = std::min(size, uint64_t{param_.num_threads}); grid.dblocks = std::min(size / grid.threads, uint64_t{param_.num_dblocks}); - grid.blocks = size / (grid.threads * grid.dblocks); - grid.device_grid = CreateGrid(grid.blocks); - + uint64_t num_blocks = size / (grid.threads * grid.dblocks); + grid.blocks = CreateGrid(num_blocks); return grid; } Grid GetGrid2(unsigned size) const { Grid grid; - grid.threads = std::min(param_.num_threads, std::max(32U, size)); grid.dblocks = std::max(1U, size / grid.threads); - grid.blocks = 1; - grid.device_grid = dim3(1, 1, 1); - + grid.blocks = dim3(1, 1, 1); return grid; } @@ -434,8 +419,9 @@ class StateSpaceCUDA : Grid g1 = GetGrid1(size); unsigned bytes = 
g1.threads * sizeof(FP1); + uint64_t num_blocks1 = size / (g1.threads * g1.dblocks); - FP2* d_res2 = (FP2*) AllocScratch((g1.blocks + 1) * sizeof(FP2)); + FP2* d_res2 = (FP2*) AllocScratch((num_blocks1 + 1) * sizeof(FP2)); FP2* d_res1 = d_res2 + 1; auto op1 = Op(); @@ -443,10 +429,10 @@ class StateSpaceCUDA : auto op3 = Plus::type>(); if (mask == 0) { - Reduce1Kernel<<>>( + Reduce1Kernel<<>>( g1.dblocks, op1, op2, op3, state1.get(), state2.get(), d_res1); } else { - Reduce1MaskedKernel<<>>( + Reduce1MaskedKernel<<>>( g1.dblocks, mask, bits, op1, op2, op3, state1.get(), state2.get(), d_res1); } @@ -455,18 +441,18 @@ class StateSpaceCUDA : FP2 result; - if (g1.blocks == 1) { + if (num_blocks1 == 1) { ErrorCheck( cudaMemcpy(&result, d_res1, sizeof(FP2), cudaMemcpyDeviceToHost)); } else { - Grid g2 = GetGrid2(g1.blocks); + Grid g2 = GetGrid2(num_blocks1); unsigned bytes = g2.threads * sizeof(FP2); auto op2 = Plus(); auto op3 = Plus::type>(); - Reduce2Kernel<<>>( - g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2); + Reduce2Kernel<<>>( + g2.dblocks, num_blocks1, op2, op3, d_res1, d_res2); ErrorCheck(cudaPeekAtLastError()); ErrorCheck(cudaDeviceSynchronize()); diff --git a/lib/statespace_cuda_kernels.h b/lib/statespace_cuda_kernels.h index 88ad89911..c762dfbb0 100644 --- a/lib/statespace_cuda_kernels.h +++ b/lib/statespace_cuda_kernels.h @@ -39,7 +39,7 @@ __device__ __forceinline__ FP1 BlockReduce1( unsigned warp = threadIdx.x / warp_size; unsigned lane = threadIdx.x % warp_size; - uint64_t k0 = 2 * n * qsim::GetBlockId() * blockDim.x + 2 * tid - lane; + uint64_t k0 = 2 * n * GetBlockId() * blockDim.x + 2 * tid - lane; uint64_t k1 = k0 + 2 * n * blockDim.x; FP1 r; @@ -88,7 +88,7 @@ __device__ __forceinline__ FP1 BlockReduce1Masked( unsigned warp = threadIdx.x / warp_size; unsigned lane = threadIdx.x % warp_size; - uint64_t k0 = 2 * n * qsim::GetBlockId() * blockDim.x + 2 * tid - lane; + uint64_t k0 = 2 * n * GetBlockId() * blockDim.x + 2 * tid - lane; uint64_t k1 = k0 + 2 
* n * blockDim.x; FP1 r = 0; @@ -137,7 +137,7 @@ __device__ __forceinline__ FP1 BlockReduce2( FP1* partial1 = (FP1*) shared; unsigned tid = threadIdx.x; - uint64_t k0 = n * qsim::GetBlockId() * blockDim.x + tid; + uint64_t k0 = n * GetBlockId() * blockDim.x + tid; uint64_t k1 = k0 + n * blockDim.x; FP1 r = 0; @@ -185,7 +185,7 @@ __global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3, FP1 sum = detail::BlockReduce1(n, op1, op2, op3, s1, s2); if (threadIdx.x == 0) { - result[qsim::GetBlockId()] = sum; + result[GetBlockId()] = sum; } } @@ -198,7 +198,7 @@ __global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits, detail::BlockReduce1Masked(n, mask, bits, op1, op2, op3, s1, s2); if (threadIdx.x == 0) { - result[qsim::GetBlockId()] = sum; + result[GetBlockId()] = sum; } } @@ -209,7 +209,7 @@ __global__ void Reduce2Kernel( FP1 sum = detail::BlockReduce2(n, size, op2, op3, s); if (threadIdx.x == 0) { - result[qsim::GetBlockId()] = sum; + result[GetBlockId()] = sum; } } @@ -217,7 +217,7 @@ template __global__ void InternalToNormalOrderKernel(FP* state) { unsigned lane = threadIdx.x % warp_size; unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * uint64_t{qsim::GetBlockId()} * blockDim.x + l; + uint64_t k = 2 * GetBlockId() * blockDim.x + l; extern __shared__ float shared[]; FP* buf = (FP*) shared; @@ -235,7 +235,7 @@ template __global__ void NormalToInternalOrderKernel(FP* state) { unsigned lane = threadIdx.x % warp_size; unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * uint64_t{qsim::GetBlockId()} * blockDim.x + l; + uint64_t k = 2 * GetBlockId() * blockDim.x + l; extern __shared__ float shared[]; FP* buf = (FP*) shared; @@ -252,8 +252,7 @@ __global__ void NormalToInternalOrderKernel(FP* state) { template __global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { unsigned lane = threadIdx.x % warp_size; - uint64_t k = - 2 * (uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x) - lane; + uint64_t k = 2 * 
(GetBlockId() * blockDim.x + threadIdx.x) - lane; state[k] = lane < size ? v : 0; state[k + warp_size] = 0; @@ -261,19 +260,19 @@ __global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { template __global__ void AddKernel(const FP* state1, FP* state2) { - uint64_t k = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; + uint64_t k = GetBlockId() * blockDim.x + threadIdx.x; state2[k] += state1[k]; } template __global__ void MultiplyKernel(FP a, FP* state) { - uint64_t k = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; + uint64_t k = GetBlockId() * blockDim.x + threadIdx.x; state[k] *= a; } template __global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { - uint64_t k1 = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; + uint64_t k1 = GetBlockId() * blockDim.x + threadIdx.x; uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; if ((k1 & mask) == bits) { @@ -288,7 +287,7 @@ __global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { template __global__ void BulkSetAmplKernel( uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) { - uint64_t k1 = uint64_t{qsim::GetBlockId()} * blockDim.x + threadIdx.x; + uint64_t k1 = GetBlockId() * blockDim.x + threadIdx.x; uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; bool set = ((k1 & mask) == bits) ^ exclude; @@ -300,7 +299,7 @@ __global__ void BulkSetAmplKernel( } template -__global__ void SampleKernel(unsigned num_blocks, +__global__ void SampleKernel(uint64_t num_blocks, uint64_t n, uint64_t num_samples, const FP1* rs, const FP2* ps, const FP3* state, uint64_t *bitstrings) { @@ -309,7 +308,7 @@ __global__ void SampleKernel(unsigned num_blocks, uint64_t m = 0; double csum = 0; - for (unsigned block_id = 0; block_id < num_blocks; ++block_id) { + for (uint64_t block_id = 0; block_id < num_blocks; ++block_id) { uint64_t km = n * blockDim.x; uint64_t k0 = block_id * km; diff --git a/lib/util_cuda.h b/lib/util_cuda.h index 
b34292753..a7880a527 100644 --- a/lib/util_cuda.h +++ b/lib/util_cuda.h @@ -130,6 +130,25 @@ __device__ __forceinline__ FP1 WarpReduce(FP1 val, Op op) { return val; } +__device__ __forceinline__ uint64_t GetBlockId() { + return (uint64_t{blockIdx.z} * gridDim.y + uint64_t{blockIdx.y}) + * gridDim.x + blockIdx.x; +} + +inline dim3 CreateGrid(uint64_t blocks) { + if (blocks <= 65535) { + return dim3((uint32_t)blocks); + } + uint32_t x = 65535; + uint64_t rem = (blocks + x - 1) / x; + if (rem <= 65535) { + return dim3(x, (uint32_t)rem); + } + uint32_t y = 65535; + uint32_t z = (uint32_t)((rem + y - 1) / y); + return dim3(x, y, z); +} + template __device__ __forceinline__ Complex WarpReduce(Complex val, Op op) { for (unsigned i = warp_size / 2; i > 0; i /= 2) { diff --git a/lib/vectorspace_cuda.h b/lib/vectorspace_cuda.h index f3652a527..f1efdd51d 100644 --- a/lib/vectorspace_cuda.h +++ b/lib/vectorspace_cuda.h @@ -86,10 +86,10 @@ class VectorSpaceCUDA { static Vector Create(unsigned num_qubits) { fp_type* p; - uint64_t size = uint64_t{sizeof(fp_type)} * Impl::MinSize(num_qubits); + uint64_t size = sizeof(fp_type) * Impl::MinSize(num_qubits); auto rc = cudaMalloc(&p, size); if (rc == cudaSuccess) { - return Vector{Pointer{(fp_type*)p, &detail::free}, num_qubits}; + return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; } else { return Null(); } @@ -168,4 +168,4 @@ class VectorSpaceCUDA { } // namespace qsim -#endif // VECTORSPACE_CUDA_H_ +#endif // VECTORSPACE_CUDA_H_ \ No newline at end of file From 8be37484cd234fe9d26377b7a921bba74cc709c3 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Tue, 10 Mar 2026 01:14:21 +0530 Subject: [PATCH 06/10] Delete apps/qsimh_base_cuda.cu --- apps/qsimh_base_cuda.cu | 220 ---------------------------------------- 1 file changed, 220 deletions(-) delete mode 100644 apps/qsimh_base_cuda.cu diff --git a/apps/qsimh_base_cuda.cu b/apps/qsimh_base_cuda.cu deleted file mode 100644 index 
951ad7688..000000000 --- a/apps/qsimh_base_cuda.cu +++ /dev/null @@ -1,220 +0,0 @@ -// Copyright 2026 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include -#include -#include - -#include "../lib/bitstring.h" -#include "../lib/circuit_qsim_parser.h" -#include "../lib/formux.h" -#include "../lib/fuser_basic.h" -#include "../lib/gates_qsim.h" -#include "../lib/io_file.h" -#include "../lib/run_qsimh.h" -#include "../lib/simmux.h" -#include "../lib/simulator_cuda.h" -#include "../lib/util.h" -#include "../lib/util_cpu.h" - -constexpr char usage[] = - "usage:\n ./qsimh_base_cuda.x -c circuit_file " - "-d maximum_time -k part1_qubits " - "-w prefix -p num_prefix_gates -r num_root_gates " - "-t num_threads -n num_dblocks -v verbosity -z\n"; - -struct Options { - std::string circuit_file; - std::vector part1; - uint64_t prefix; - unsigned maxtime = std::numeric_limits::max(); - unsigned num_prefix_gatexs = 0; - unsigned num_root_gatexs = 0; - unsigned num_threads = 256; - unsigned num_dblocks = 16; - unsigned verbosity = 0; - bool denormals_are_zeros = false; -}; - -Options GetOptions(int argc, char* argv[]) { - Options opt; - - int k; - - auto to_int = [](const std::string& word) -> unsigned { - return std::stoul(word); - }; - - while ((k = getopt(argc, argv, "c:d:k:w:p:r:t:n:v:z")) != -1) { - switch (k) { - case 'c': - opt.circuit_file = optarg; - break; - case 'd': - opt.maxtime = 
std::stoul(optarg); - break; - case 'k': - qsim::SplitString(optarg, ',', to_int, opt.part1); - break; - case 'w': - opt.prefix = std::stoull(optarg); - break; - case 'p': - opt.num_prefix_gatexs = std::stoul(optarg); - break; - case 'r': - opt.num_root_gatexs = std::stoul(optarg); - break; - case 't': - opt.num_threads = std::stoul(optarg); - break; - case 'n': - opt.num_dblocks = std::stoul(optarg); - break; - case 'v': - opt.verbosity = std::stoul(optarg); - break; - case 'z': - opt.denormals_are_zeros = true; - break; - default: - qsim::IO::errorf(usage); - exit(1); - } - } - - return opt; -} - -bool ValidateOptions(const Options& opt) { - if (opt.circuit_file.empty()) { - qsim::IO::errorf("circuit file is not provided.\n"); - qsim::IO::errorf(usage); - return false; - } - - return true; -} - -bool ValidatePart1(unsigned num_qubits, const std::vector& part1) { - for (std::size_t i = 0; i < part1.size(); ++i) { - if (part1[i] >= num_qubits) { - qsim::IO::errorf("part 1 qubit indices are too large.\n"); - return false; - } - } - - return true; -} - -std::vector GetParts( - unsigned num_qubits, const std::vector& part1) { - std::vector parts(num_qubits, 0); - - for (std::size_t i = 0; i < part1.size(); ++i) { - parts[part1[i]] = 1; - } - - return parts; -} - -int main(int argc, char* argv[]) { - using namespace qsim; - - auto opt = GetOptions(argc, argv); - if (!ValidateOptions(opt)) { - return 1; - } - - Circuit> circuit; - if (!CircuitQsimParser::FromFile( - opt.maxtime, opt.circuit_file, circuit)) { - return 1; - } - - if (!ValidatePart1(circuit.num_qubits, opt.part1)) { - return 1; - } - auto parts = GetParts(circuit.num_qubits, opt.part1); - - if (opt.denormals_are_zeros) { - SetFlushToZeroAndDenormalsAreZeros(); - } - - uint64_t num_bitstrings = - std::min(uint64_t{8}, uint64_t{1} << circuit.num_qubits); - - std::vector bitstrings; - bitstrings.reserve(num_bitstrings); - for (std::size_t i = 0; i < num_bitstrings; ++i) { - bitstrings.push_back(i); - } - - 
struct Factory { - using Simulator = qsim::SimulatorCUDA; - using StateSpace = Simulator::StateSpace; - using fp_type = Simulator::fp_type; - - Factory(const StateSpace::Parameter& param) : param(param) {} - - StateSpace CreateStateSpace() const { return StateSpace(param); } - - Simulator CreateSimulator() const { return Simulator(); } - - const StateSpace::Parameter& param; - }; - - using HybridSimulator = - HybridSimulator, BasicGateFuser, For>; - using Runner = QSimHRunner; - - Runner::Parameter param; - param.prefix = opt.prefix; - param.num_prefix_gatexs = opt.num_prefix_gatexs; - param.num_root_gatexs = opt.num_root_gatexs; - param.num_threads = - opt.num_threads; // This is reused for StateSpaceCUDA params implicitly - // if not careful, but here we separate. - param.verbosity = opt.verbosity; - - std::vector> results(num_bitstrings, 0); - - // Setup CUDA parameters - Factory::StateSpace::Parameter cuda_param; - cuda_param.num_threads = opt.num_threads; - cuda_param.num_dblocks = opt.num_dblocks; - - Factory factory(cuda_param); - - if (Runner::Run(param, factory, circuit, parts, bitstrings, results)) { - static constexpr char const* bits[8] = { - "000", "001", "010", "011", "100", "101", "110", "111", - }; - - unsigned s = 3 - std::min(unsigned{3}, circuit.num_qubits); - - for (std::size_t i = 0; i < num_bitstrings; ++i) { - const auto& a = results[i]; - qsim::IO::messagef( - "%s:%16.8g%16.8g%16.8g\n", bits[i] + s, std::real(a), std::imag(a), - std::norm(a)); - } - } - - return 0; -} From 6f1ac8d3585f7eb22d7493419c8c5a312b2af9e1 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:14:37 +0530 Subject: [PATCH 07/10] variables changes requested --- lib/simulator_cuda.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/simulator_cuda.h b/lib/simulator_cuda.h index 3772834fa..2a5e651e4 100644 --- a/lib/simulator_cuda.h +++ b/lib/simulator_cuda.h @@ -479,11 +479,11 @@ class 
SimulatorCUDA final { unsigned k = 5 + G; unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U); unsigned threads = 64U; - uint64_t blocks = std::max(uint64_t{1}, (uint64_t{size} / 2) >> s); + uint64_t blocks = std::max(uint64_t{1}, (size / 2) >> s); unsigned num_iterations_per_block = 1 << s; constexpr unsigned m = 16; @@ -517,11 +517,11 @@ class SimulatorCUDA final { unsigned k = 5 + num_effective_qs; unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; + uint64_t size = uint64_t{1} << n; unsigned s = std::min(n >= 13 ? n - 13 : 0, 5U); unsigned threads = 32; - uint64_t blocks = uint64_t{size} >> s; + uint64_t blocks = size >> s; unsigned num_iterations_per_block = 1 << s; constexpr unsigned m = 16; From 3262c73b7dd4d187d393a344e733cd7d51837714 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:15:26 +0530 Subject: [PATCH 08/10] Remove unnecessary blank line in statespace_cuda.h --- lib/statespace_cuda.h | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/statespace_cuda.h b/lib/statespace_cuda.h index 8f76fe25e..e0f3cfe03 100644 --- a/lib/statespace_cuda.h +++ b/lib/statespace_cuda.h @@ -52,7 +52,6 @@ class StateSpaceCUDA : dim3 blocks; }; - public: using State = typename Base::State; using fp_type = typename Base::fp_type; From 1008c97c904d6b5df38d4d791d1c3297da8c0477 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Mon, 16 Mar 2026 10:29:19 +0530 Subject: [PATCH 09/10] Increase block limit in CreateGrid function --- lib/util_cuda.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lib/util_cuda.h b/lib/util_cuda.h index a7880a527..514b054d8 100644 --- a/lib/util_cuda.h +++ b/lib/util_cuda.h @@ -136,16 +136,16 @@ __device__ __forceinline__ uint64_t GetBlockId() { } inline 
dim3 CreateGrid(uint64_t blocks) { - if (blocks <= 65535) { - return dim3((uint32_t)blocks); + if (blocks <= 65536) { + return dim3((uint32_t) blocks); } - uint32_t x = 65535; - uint64_t rem = (blocks + x - 1) / x; - if (rem <= 65535) { - return dim3(x, (uint32_t)rem); + uint32_t x = 65536; + uint64_t rem = blocks / x; + if (rem <= 65536) { + return dim3(x, (uint32_t) rem); } - uint32_t y = 65535; - uint32_t z = (uint32_t)((rem + y - 1) / y); + uint32_t y = 65536; + uint32_t z = (uint32_t) (rem / y); return dim3(x, y, z); } From 28a6022f58ec834c712184376cbdab8027f8bba9 Mon Sep 17 00:00:00 2001 From: sunoj <156280523+sun-9545sunoj@users.noreply.github.com> Date: Wed, 18 Mar 2026 09:06:44 +0530 Subject: [PATCH 10/10] Adjust block size limits in util_cuda.h --- lib/util_cuda.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/util_cuda.h b/lib/util_cuda.h index 514b054d8..004ddc11b 100644 --- a/lib/util_cuda.h +++ b/lib/util_cuda.h @@ -141,10 +141,10 @@ inline dim3 CreateGrid(uint64_t blocks) { } uint32_t x = 65536; uint64_t rem = blocks / x; - if (rem <= 65536) { + if (rem <= 32768) { return dim3(x, (uint32_t) rem); } - uint32_t y = 65536; + uint32_t y = 32768; uint32_t z = (uint32_t) (rem / y); return dim3(x, y, z); }