parallel101 · Riga27527 · Aug 17, 2022 · Aug 17, 2022
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -6,9 +6,10 @@ if (NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release)
 endif()
 # 如果需要指定显卡版本号的话：
-# set(CMAKE_CUDA_ARCHITECTURES 52)
+set(CMAKE_CUDA_ARCHITECTURES 52)  # pin the target GPU architecture (52 = compute capability 5.2); set before project() enables CUDA
 
 project(hellocmake LANGUAGES CXX CUDA)
 
 add_executable(main main.cu)
 target_include_directories(main PUBLIC include)
+target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)  # nvcc flag needed for the __device__ lambda main.cu passes to a kernel
diff --git a/main.cu b/main.cu
@@ -5,39 +5,47 @@
 #include <cmath>
 #include <vector>
 // #include <thrust/device_vector.h>  // 如果想用 thrust 也是没问题的
+#include <thrust/universal_vector.h>
+#include <ticktock.h>
 
 // 这是基于“边角料法”的，请把他改成基于“网格跨步循环”的：10 分
-__global__ void fill_sin(int *arr, int n) {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < n) return;
-    arr[i] = sinf(i);
+template <class Fun>  // Fun: device-callable object invoked as func(i)
+__global__ void fill_sin(int n, Fun func) {  // calls func(i) once for every index i < n, spread over the whole grid
+    int step = gridDim.x * blockDim.x;  // total number of threads launched = loop stride
+    for(size_t i=blockIdx.x * blockDim.x + threadIdx.x; i < n; i+=step)  // grid-stride loop; NOTE(review): i is size_t but n is int, so a negative n would compare as a huge unsigned bound
+        func(i);
 }
 
-__global__ void filter_positive(int *counter, int *res, int const *arr, int n) {
+template <int N, class T1, class T2, class T3>  // N: number of valid elements in arr; T1/T2/T3: indexable, device-accessible pointer types
+ __global__ void filter_positive(T1 arr, T2 res, T3 cnt){  // copies every arr[i] >= 0 into res; cnt[0] is the running output count (assumed to start at 0)
     int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < n) return;
-    if (arr[i] >= 0) {
-        // 这里有什么问题？请改正：10 分
-        int loc = *counter;
-        *counter += 1;
-        res[loc] = n;
+    if (i >= N) return;  // valid indices are 0..N-1, so the thread with i == N must bail out as well
+    if(arr[i] >= 0){
+        int loc = atomicAdd(&cnt[0], 1);  // atomically reserve a unique output slot; returns the previous count
+        res[loc] = arr[i];  // slot order depends on thread scheduling, so the output order is not deterministic
     }
 }
 
 int main() {
     constexpr int n = 1<<24;
-    std::vector<int, CudaAllocator<int>> arr(n);
-    std::vector<int, CudaAllocator<int>> res(n);
-    std::vector<int, CudaAllocator<int>> counter(1);
-
-    // fill_sin 改成“网格跨步循环”以后，这里三重尖括号里的参数如何调整？10 分
-    fill_sin<<<n / 1024, 1024>>>(arr.data(), n);
-
-    // 这里的“边角料法”对于不是 1024 整数倍的 n 会出错，为什么？请修复：10 分
-    filter_positive<<<n / 1024, 1024>>>(counter.data(), res.data(), arr.data(), n);
-
-    // 这里 CPU 访问数据前漏了一步什么操作？请补上：10 分
-
+    thrust::universal_vector<float> arr(n);
+    thrust::universal_vector<float> res(n);
+    thrust::universal_vector<int> counter(1);
+
+    // TICK(FILL_SIN);
+    fill_sin<<<n / 1024, 32>>>(n, 
+    [arr = arr.data()] __device__ (size_t i){
+        arr[i] = sinf(i);
+    });
+    // checkCudaErrors(cudaDeviceSynchronize());
+    // TOCK(FILL_SIN);
+
+    // TICK(filter_positive);
+    filter_positive<n><<<(n + 1023) / 1024, 1024>>>(arr.data(), res.data(), counter.data());
+    // checkCudaErrors(cudaDeviceSynchronize());
+    // TOCK(filter_positive);
+
+    checkCudaErrors(cudaDeviceSynchronize());
     if (counter[0] <= n / 50) {
         printf("Result too short! %d <= %d\n", counter[0], n / 50);
         return -1;

diff --git a/run.sh b/run.sh
@@ -2,4 +2,5 @@
 set -e
 cmake -B build
 cmake --build build
-build/main
+# Single-config generators emit build/main; multi-config ones (e.g. Visual Studio) emit build/Debug/main.exe.
+if [ -x build/main ]; then build/main; else build/Debug/main.exe; fi