Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
# Default GPU architecture (compute capability 5.2). This has to be decided
# before project() enables the CUDA language. It is only a default: a value
# supplied by the user (e.g. -DCMAKE_CUDA_ARCHITECTURES=86) is left untouched.
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    set(CMAKE_CUDA_ARCHITECTURES 52)
endif()

# Enable both the C++ and the CUDA toolchains for this project.
project(hellocmake LANGUAGES CXX CUDA)

# Single executable built from the CUDA translation unit main.cu.
add_executable(main main.cu)
# Make headers under ./include reachable with #include <...>.
target_include_directories(main PUBLIC include)
# Pass --extended-lambda to the CUDA compiler only (the generator expression
# keeps the flag away from plain C++ sources). nvcc needs this flag to accept
# __device__ lambdas written in host code.
target_compile_options(main PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
54 changes: 31 additions & 23 deletions main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,47 @@
#include <cmath>
#include <vector>
// #include <thrust/device_vector.h> // 如果想用 thrust 也是没问题的
#include <thrust/universal_vector.h>
#include <ticktock.h>

// Homework task (10 pts): converted from the tail-guard ("one thread per
// element") form to a grid-stride loop.
//
// Calls func(i) exactly once for every index i in [0, n). Despite the name,
// the kernel itself computes nothing; the per-element work is whatever the
// callable does.
//
// Launch layout: 1-D grid, 1-D blocks, any sizes. Each thread starts at its
// global index and advances by the total number of launched threads, so the
// result does not depend on the launch configuration.
//
// n    - number of indices to visit; values <= 0 visit nothing.
// func - device-callable object invoked as func(size_t).
template <class Fun>
__global__ void fill_sin(int n, Fun func) {
    // Reject non-positive counts up front: comparing a size_t index against a
    // negative int would convert the int to a huge unsigned bound.
    if (n <= 0) return;

    // Do all index arithmetic in size_t so that neither the starting index
    // nor the stride can wrap in 32-bit arithmetic.
    size_t const count = static_cast<size_t>(n);
    size_t const stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         i < count; i += stride) {
        func(i);
    }
}

// Copies every element of arr[0..N) that is >= 0 into res and counts them.
//
// Launch layout: 1-D grid of 1-D blocks with at least N threads in total;
// thread i handles element i, surplus threads exit immediately.
//
// N   - number of input elements (compile-time constant).
// arr - indexable input sequence with at least N elements.
// res - indexable output sequence; needs room for up to N elements.
// cnt - indexable sequence whose element 0 is an int counter. Output slots
//       are handed out starting from its value at launch, so the caller is
//       expected to have it at 0. After the kernel it has grown by the number
//       of elements written.
//
// Slots are assigned in the order the atomic increments happen, so the order
// of elements in res is not the input order.
template <int N, class T1, class T2, class T3>
__global__ void filter_positive(T1 arr, T2 res, T3 cnt) {
    int const i = blockIdx.x * blockDim.x + threadIdx.x;
    // Valid indices are [0, N). The guard must reject i == N as well,
    // otherwise a rounded-up grid reads one element past the end of arr.
    if (i >= N) return;
    if (arr[i] >= 0) {
        // A plain read-then-increment of the counter races between threads.
        // atomicAdd returns the previous value, which gives this thread a
        // slot no other thread can receive.
        int const loc = atomicAdd(&cnt[0], 1);
        res[loc] = arr[i];
    }
}

int main() {
constexpr int n = 1<<24;
std::vector<int, CudaAllocator<int>> arr(n);
std::vector<int, CudaAllocator<int>> res(n);
std::vector<int, CudaAllocator<int>> counter(1);

// fill_sin 改成“网格跨步循环”以后,这里三重尖括号里的参数如何调整?10 分
fill_sin<<<n / 1024, 1024>>>(arr.data(), n);

// 这里的“边角料法”对于不是 1024 整数倍的 n 会出错,为什么?请修复:10 分
filter_positive<<<n / 1024, 1024>>>(counter.data(), res.data(), arr.data(), n);

// 这里 CPU 访问数据前漏了一步什么操作?请补上:10 分

thrust::universal_vector<float> arr(n);
thrust::universal_vector<float> res(n);
thrust::universal_vector<int> counter(1);

// TICK(FILL_SIN);
fill_sin<<<n / 1024, 32>>>(n,
[arr = arr.data()] __device__ (size_t i){
arr[i] = sinf(i);
});
// checkCudaErrors(cudaDeviceSynchronize());
// TOCK(FILL_SIN);

// TICK(filter_positive);
filter_positive<n><<<(n + 1023) / 1024, 1024>>>(arr.data(), res.data(), counter.data());
// checkCudaErrors(cudaDeviceSynchronize());
// TOCK(filter_positive);

checkCudaErrors(cudaDeviceSynchronize());
if (counter[0] <= n / 50) {
printf("Result too short! %d <= %d\n", counter[0], n / 50);
return -1;
Expand Down
3 changes: 2 additions & 1 deletion run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
# Stop at the first failing command so a broken build never runs a stale binary.
set -e
cmake -B build
cmake --build build
# Single-config generators (Makefiles, Ninja) place the executable directly in
# build/; multi-config generators such as Visual Studio place it in
# build/<Config>/ and build Debug by default. Run whichever one exists.
if [ -x build/main ]; then
    build/main
else
    build/Debug/main.exe
fi