Skip to content
Merged
52 changes: 26 additions & 26 deletions lib/simulator_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -344,13 +344,13 @@ class SimulatorCUDA final {

unsigned k = 5 + G;
unsigned n = num_qubits > k ? num_qubits - k : 0;
unsigned size = unsigned{1} << n;
uint64_t size = uint64_t{1} << n;
unsigned threads = 64U;
unsigned blocks = std::max(1U, size / 2);
uint64_t blocks = std::max(uint64_t{1}, size / 2);

IndicesH<G> d_i(d_ws);

ApplyGateH_Kernel<G><<<blocks, threads>>>(
ApplyGateH_Kernel<G><<<CreateGrid(blocks), threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, state.get());
}

Expand All @@ -368,13 +368,13 @@ class SimulatorCUDA final {

unsigned k = 5 + num_effective_qs;
unsigned n = num_qubits > k ? num_qubits - k : 0;
unsigned size = unsigned{1} << n;
uint64_t size = uint64_t{1} << n;
unsigned threads = 32;
unsigned blocks = size;
uint64_t blocks = size;

IndicesL<G> d_i(d_ws);

ApplyGateL_Kernel<G><<<blocks, threads>>>(
ApplyGateL_Kernel<G><<<CreateGrid(blocks), threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
1 << num_effective_qs, state.get());
}
Expand All @@ -401,13 +401,13 @@ class SimulatorCUDA final {

unsigned k = 5 + G + cqs.size();
unsigned n = num_qubits > k ? num_qubits - k : 0;
unsigned size = unsigned{1} << n;
uint64_t size = uint64_t{1} << n;
unsigned threads = 64U;
unsigned blocks = std::max(1U, size / 2);
uint64_t blocks = std::max(uint64_t{1}, size / 2);

IndicesH<G> d_i(d_ws);

ApplyControlledGateH_Kernel<G><<<blocks, threads>>>(
ApplyControlledGateH_Kernel<G><<<CreateGrid(blocks), threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get());
}

Expand All @@ -426,13 +426,13 @@ class SimulatorCUDA final {

unsigned k = 5 + G + cqs.size();
unsigned n = num_qubits > k ? num_qubits - k : 0;
unsigned size = unsigned{1} << n;
uint64_t size = uint64_t{1} << n;
unsigned threads = 32;
unsigned blocks = size;
uint64_t blocks = size;

IndicesL<G> d_i(d_ws);

ApplyControlledGateLH_Kernel<G><<<blocks, threads>>>(
ApplyControlledGateLH_Kernel<G><<<CreateGrid(blocks), threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get());
}
Expand All @@ -452,13 +452,13 @@ class SimulatorCUDA final {

unsigned k = 5 + G + cqs.size();
unsigned n = num_qubits > k ? num_qubits - k : 0;
unsigned size = unsigned{1} << n;
uint64_t size = uint64_t{1} << n;
unsigned threads = 32;
unsigned blocks = size;
uint64_t blocks = size;

IndicesLC<G> d_i(d_ws);

ApplyControlledGateL_Kernel<G><<<blocks, threads>>>(
ApplyControlledGateL_Kernel<G><<<CreateGrid(blocks), threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis,
d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs,
1 << (5 - d.remaining_low_cqs), state.get());
Expand All @@ -479,11 +479,11 @@ class SimulatorCUDA final {

unsigned k = 5 + G;
unsigned n = num_qubits > k ? num_qubits - k : 0;
unsigned size = unsigned{1} << n;
uint64_t size = uint64_t{1} << n;

unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U);
unsigned threads = 64U;
unsigned blocks = std::max(1U, (size / 2) >> s);
uint64_t blocks = std::max(uint64_t{1}, (size / 2) >> s);
unsigned num_iterations_per_block = 1 << s;

constexpr unsigned m = 16;
Expand All @@ -493,7 +493,7 @@ class SimulatorCUDA final {

IndicesH<G> d_i(d_ws);

ExpectationValueH_Kernel<G><<<blocks, threads>>>(
ExpectationValueH_Kernel<G><<<CreateGrid(blocks), threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block,
state.get(), Plus<double>(), d_res1);

Expand All @@ -517,11 +517,11 @@ class SimulatorCUDA final {

unsigned k = 5 + num_effective_qs;
unsigned n = num_qubits > k ? num_qubits - k : 0;
unsigned size = unsigned{1} << n;
uint64_t size = uint64_t{1} << n;

unsigned s = std::min(n >= 13 ? n - 13 : 0, 5U);
unsigned threads = 32;
unsigned blocks = size >> s;
uint64_t blocks = size >> s;
unsigned num_iterations_per_block = 1 << s;

constexpr unsigned m = 16;
Expand All @@ -531,7 +531,7 @@ class SimulatorCUDA final {

IndicesL<G> d_i(d_ws);

ExpectationValueL_Kernel<G><<<blocks, threads>>>(
ExpectationValueL_Kernel<G><<<CreateGrid(blocks), threads>>>(
(fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
num_iterations_per_block, state.get(), Plus<double>(), d_res1);

Expand All @@ -542,18 +542,18 @@ class SimulatorCUDA final {

template <unsigned m>
std::complex<double> ExpectationValueReduceFinal(
unsigned blocks, double mul,
uint64_t blocks, double mul,
const Complex* d_res1, Complex* d_res2) const {
Complex res2[m];

if (blocks <= 16) {
ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex),
cudaMemcpyDeviceToHost));
} else {
unsigned threads2 = std::min(1024U, blocks);
unsigned blocks2 = std::min(m, blocks / threads2);
unsigned threads2 = std::min(uint64_t{1024}, blocks);
uint64_t blocks2 = std::min(uint64_t{m}, blocks / threads2);

unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2));
unsigned dblocks = std::max(uint64_t{1}, blocks / (blocks2 * threads2));
unsigned bytes = threads2 * sizeof(Complex);

Reduce2Kernel<Complex><<<blocks2, threads2, bytes>>>(
Expand All @@ -568,7 +568,7 @@ class SimulatorCUDA final {
double re = 0;
double im = 0;

for (unsigned i = 0; i < blocks; ++i) {
for (uint64_t i = 0; i < blocks; ++i) {
re += res2[i].re;
im += res2[i].im;
}
Expand Down
40 changes: 27 additions & 13 deletions lib/simulator_cuda_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@
#ifdef __NVCC__
#include <cuda.h>
#include <cuda_runtime.h>

#include "util_cuda.h"
#elif __HIP__
#include <hip/hip_runtime.h>
#include "cuda2hip.h"
#endif

#include "util_cuda.h"

namespace qsim {

template <unsigned G, typename fp_type, typename idx_type>
Expand All @@ -33,6 +33,8 @@ __global__ void ApplyGateH_Kernel(
const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) {
// blockDim.x must be equal to 64.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand Down Expand Up @@ -61,7 +63,7 @@ __global__ void ApplyGateH_Kernel(

__syncthreads();

idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0;
idx_type ii = i & mss[0];
for (unsigned j = 1; j <= G; ++j) {
i *= 2;
Expand Down Expand Up @@ -115,6 +117,8 @@ __global__ void ApplyGateL_Kernel(
fp_type* __restrict__ rstate) {
// blockDim.x must be equal to 32.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand All @@ -137,7 +141,7 @@ __global__ void ApplyGateL_Kernel(
}
}

idx_type i = 32 * idx_type{blockIdx.x};
idx_type i = 32 * idx_type{blockId};
idx_type ii = i & mss[0];
for (unsigned j = 1; j <= G; ++j) {
i *= 2;
Expand Down Expand Up @@ -204,6 +208,8 @@ __global__ void ApplyControlledGateH_Kernel(
fp_type* __restrict__ rstate) {
// blockDim.x must be equal to 64.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand Down Expand Up @@ -232,7 +238,7 @@ __global__ void ApplyControlledGateH_Kernel(

__syncthreads();

idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
idx_type i = (64 * idx_type{blockId} + threadIdx.x) & 0xffffffffffe0;
idx_type ii = i & mss[0];
for (unsigned j = 1; j < num_mss; ++j) {
i *= 2;
Expand Down Expand Up @@ -288,6 +294,8 @@ __global__ void ApplyControlledGateLH_Kernel(
unsigned esize, fp_type* __restrict__ rstate) {
// blockDim.x must be equal to 32.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand All @@ -300,7 +308,7 @@ __global__ void ApplyControlledGateLH_Kernel(
__shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
__shared__ fp_type v[2 * gsize * rows];

idx_type i = 32 * idx_type{blockIdx.x};
idx_type i = 32 * idx_type{blockId};
idx_type ii = i & mss[0];
for (unsigned j = 1; j < num_mss; ++j) {
i *= 2;
Expand Down Expand Up @@ -381,6 +389,8 @@ __global__ void ApplyControlledGateL_Kernel(
fp_type* __restrict__ rstate) {
// blockDim.x must be equal to 32.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand All @@ -393,7 +403,7 @@ __global__ void ApplyControlledGateL_Kernel(
__shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
__shared__ fp_type v[2 * gsize * rows];

idx_type i = 32 * idx_type{blockIdx.x};
idx_type i = 32 * idx_type{blockId};
idx_type ii = i & mss[0];
for (unsigned j = 1; j < num_mss; ++j) {
i *= 2;
Expand Down Expand Up @@ -477,6 +487,8 @@ __global__ void ExpectationValueH_Kernel(
const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
// blockDim.x must be equal to 64.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand Down Expand Up @@ -508,7 +520,7 @@ __global__ void ExpectationValueH_Kernel(
double im = 0;

for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) {
idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter;
idx_type b = num_iterations_per_block * idx_type{blockId} + iter;

idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0;
idx_type ii = i & mss[0];
Expand Down Expand Up @@ -573,8 +585,8 @@ __global__ void ExpectationValueH_Kernel(
__syncthreads();

if (threadIdx.x == 0) {
result[blockIdx.x].re = partial2[0].re + partial2[1].re;
result[blockIdx.x].im = partial2[0].im + partial2[1].im;
result[blockId].re = partial2[0].re + partial2[1].re;
result[blockId].im = partial2[0].im + partial2[1].im;
}
}

Expand All @@ -587,6 +599,8 @@ __global__ void ExpectationValueL_Kernel(
const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
// blockDim.x must be equal to 32.

uint64_t blockId = GetBlockId();

static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");

constexpr unsigned gsize = 1 << G;
Expand All @@ -612,7 +626,7 @@ __global__ void ExpectationValueL_Kernel(
double im = 0;

for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) {
idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter);
idx_type i = 32 * (num_iterations_per_block * idx_type{blockId} + iter);
idx_type ii = i & mss[0];
for (unsigned j = 1; j <= G; ++j) {
i *= 2;
Expand Down Expand Up @@ -673,8 +687,8 @@ __global__ void ExpectationValueL_Kernel(
auto val = WarpReduce(partial[threadIdx.x], op);

if (threadIdx.x == 0) {
result[blockIdx.x].re = val.re;
result[blockIdx.x].im = val.im;
result[blockId].re = val.re;
result[blockId].im = val.im;
}
}

Expand Down
Loading
Loading