// REVIEW NOTES (commentary only; the patch text below is unchanged).
//
// What this is: a git diff touching three files (CMakeLists.txt, main.cu and
// run.sh) whose line breaks have been flattened into spaces. Several
// angle-bracket tokens are also missing from the text as captured (bare
// `#include`, bare `template`, `std::vector>`, `fill_sin<<>>`,
// `$<$:--extended-lambda>`), so it will not apply or compile as shown;
// recover the original patch before using it.
//
// CMakeLists.txt: un-comments `set(CMAKE_CUDA_ARCHITECTURES 52)` and adds a
// `target_compile_options` line carrying `--extended-lambda`.
//
// main.cu:
//   * fill_sin becomes a template taking (int n, Fun func); each thread starts
//     at blockIdx.x * blockDim.x + threadIdx.x and advances by
//     gridDim.x * blockDim.x while i < n, calling func(i).
//   * filter_positive becomes a template taking (T1 arr, T2 res, T3 cnt) and
//     reserves an output slot with atomicAdd(&cnt[0], 1) in place of the
//     removed non-atomic `*counter += 1`; it now stores arr[i] rather than n.
//   * main switches the three buffers to thrust::universal_vector, passes a
//     __device__ lambda writing arr[i] = sinf(i) to fill_sin, and calls
//     checkCudaErrors(cudaDeviceSynchronize()) before the host reads counter[0].
//
// NOTE(review): the new guard in filter_positive is `if (i > N) return;`, so a
// thread with i == N still reaches `arr[i]`. If N is the element count (its
// declaration is not visible here; presumably a template parameter), that is
// one past the end and the guard should be `i >= N`. With the launch shown,
// (n + 1023) / 1024 blocks of 1024 threads, this only bites when the count is
// not a multiple of 1024. Confirm against the real file.
// NOTE(review): neither kernel launch is followed by a cudaGetLastError()
// check in the lines shown, so a bad launch configuration would go unreported
// until the final synchronize. Confirm.
// NOTE(review): `i` in fill_sin is size_t while `n` and `step` are int, so the
// loop condition is a mixed signed/unsigned comparison; harmless for the
// positive n used here.
// NOTE(review): the only surviving line break falls right after `// ` inside
// main(), which makes the first checkCudaErrors(cudaDeviceSynchronize()) look
// live. The later `// checkCudaErrors(...)` line suggests it was commented out
// in the original. Confirm.
//
// run.sh: the run step changes from `build/main` to `build/Debug/main.exe`
// (old line kept as a comment). NOTE(review): that path looks specific to one
// build layout; confirm it is meant to be committed.
diff --git a/CMakeLists.txt b/CMakeLists.txt index 07d8dc9..e8a6524 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,9 +6,10 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) endif() # 如果需要指定显卡版本号的话: -# set(CMAKE_CUDA_ARCHITECTURES 52) +set(CMAKE_CUDA_ARCHITECTURES 52) project(hellocmake LANGUAGES CXX CUDA) add_executable(main main.cu) target_include_directories(main PUBLIC include) +target_compile_options(main PUBLIC $<$:--extended-lambda>) diff --git a/main.cu b/main.cu index 018bc33..fb752a5 100644 --- a/main.cu +++ b/main.cu @@ -5,39 +5,47 @@ #include #include // #include // 如果想用 thrust 也是没问题的 +#include +#include // 这是基于“边角料法”的,请把他改成基于“网格跨步循环”的:10 分 -__global__ void fill_sin(int *arr, int n) { - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) return; - arr[i] = sinf(i); +template +__global__ void fill_sin(int n, Fun func) { + int step = gridDim.x * blockDim.x; + for(size_t i=blockIdx.x * blockDim.x + threadIdx.x; i < n; i+=step) + func(i); } -__global__ void filter_positive(int *counter, int *res, int const *arr, int n) { +template + __global__ void filter_positive(T1 arr, T2 res, T3 cnt){ int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) return; - if (arr[i] >= 0) { - // 这里有什么问题?请改正:10 分 - int loc = *counter; - *counter += 1; - res[loc] = n; + if (i > N) return; + if(arr[i] >= 0){ + int loc = atomicAdd(&cnt[0], 1); + res[loc] = arr[i]; } } int main() { constexpr int n = 1<<24; - std::vector> arr(n); - std::vector> res(n); - std::vector> counter(1); - - // fill_sin 改成“网格跨步循环”以后,这里三重尖括号里的参数如何调整?10 分 - fill_sin<<>>(arr.data(), n); - - // 这里的“边角料法”对于不是 1024 整数倍的 n 会出错,为什么?请修复:10 分 - filter_positive<<>>(counter.data(), res.data(), arr.data(), n); - - // 这里 CPU 访问数据前漏了一步什么操作?请补上:10 分 - + thrust::universal_vector arr(n); + thrust::universal_vector res(n); + thrust::universal_vector counter(1); + + // TICK(FILL_SIN); + fill_sin<<>>(n, + [arr = arr.data()] __device__ (size_t i){ + arr[i] = sinf(i); + }); + // 
checkCudaErrors(cudaDeviceSynchronize()); + // TOCK(FILL_SIN); + + // TICK(filter_positive); + filter_positive<<<(n + 1023) / 1024, 1024>>>(arr.data(), res.data(), counter.data()); + // checkCudaErrors(cudaDeviceSynchronize()); + // TOCK(filter_positive); + + checkCudaErrors(cudaDeviceSynchronize()); if (counter[0] <= n / 50) { printf("Result too short! %d <= %d\n", counter[0], n / 50); return -1; diff --git a/run.sh b/run.sh index 99e6ef6..b08acf3 100755 --- a/run.sh +++ b/run.sh @@ -2,4 +2,5 @@ set -e cmake -B build cmake --build build -build/main +# build/main +build/Debug/main.exe