diff --git a/ANSWER.md b/ANSWER.md
index 83349d8..92da641 100644
--- a/ANSWER.md
+++ b/ANSWER.md
@@ -1,23 +1,110 @@
+# Experiment environment
+* 16 cores / 32 threads
+```
+L1d: 512 KiB (16 instances)
+L1i: 512 KiB (16 instances)
+L2: 8 MiB (16 instances)
+L3: 64 MiB (2 instances)
+```
+
 # Before optimization
 ```
-Paste the run results from before the optimization here.
-matrix_randomize: 100s
+t=0: n=1120
+matrix_randomize: 0.00971097s
+matrix_randomize: 0.00703878s
+matrix_transpose: 0.00927667s
+matrix_multiply: 0.117476s
+matrix_multiply: 0.117298s
+matrix_RtAR: 0.244081s
+matrix_trace: 0.00835175s
+1.75932e+08
+test_func: 0.274363s
+t=1: n=928
+matrix_randomize: 0.006546s
+matrix_randomize: 0.00750128s
+matrix_transpose: 0.010777s
+matrix_multiply: 0.0455778s
+matrix_multiply: 0.0537599s
+matrix_RtAR: 0.110256s
+matrix_trace: 0.00894227s
+1.00156e+08
+test_func: 0.141392s
+t=2: n=1024
+matrix_randomize: 0.00713252s
+matrix_randomize: 0.00754496s
+matrix_transpose: 0.00931327s
+matrix_multiply: 0.194533s
+matrix_multiply: 0.187665s
+matrix_RtAR: 0.391531s
+matrix_trace: 0.00910213s
+1.34324e+08
+test_func: 0.419415s
+t=3: n=1056
+matrix_randomize: 0.0118236s
+matrix_randomize: 0.00950726s
+matrix_transpose: 0.00668107s
+matrix_multiply: 0.0764283s
+matrix_multiply: 0.0805108s
+matrix_RtAR: 0.163727s
+matrix_trace: 0.00896033s
+1.47405e+08
+test_func: 0.197871s
+overall: 1.0342s
 ```
 
 # After optimization
 ```
-Paste the run results from after the optimization here.
-matrix_randomize: 0.01s
+t=0: n=1120
+matrix_randomize: 0.000273793s
+matrix_randomize: 0.000278233s
+matrix_transpose: 0.00315305s
+matrix_multiply: 0.0296057s
+matrix_multiply: 0.0279403s
+matrix_RtAR: 0.060762s
+matrix_trace: 0.00917836s
+1.75932e+08
+test_func: 0.0749736s
+t=1: n=928
+matrix_randomize: 0.000221557s
+matrix_randomize: 0.000185628s
+matrix_transpose: 0.000508399s
+matrix_multiply: 0.0319896s
+matrix_multiply: 0.0188709s
+matrix_RtAR: 0.0516253s
+matrix_trace: 0.00931379s
+1.00156e+08
+test_func: 0.0648803s
+t=2: n=1024
+matrix_randomize: 0.000262414s
+matrix_randomize: 0.000265054s
+matrix_transpose: 0.000564065s
+matrix_multiply: 0.0198131s
+matrix_multiply: 0.0238569s
+matrix_RtAR: 0.0443844s
+matrix_trace: 0.0134706s
+1.34324e+08
+test_func: 0.062279s
+t=3: n=1056
+matrix_randomize: 0.000309561s
+matrix_randomize: 0.000290673s
+matrix_transpose: 0.000471451s
+matrix_multiply: 0.0261711s
+matrix_multiply: 0.0282188s
+matrix_RtAR: 0.0549805s
+matrix_trace: 0.00934559s
+1.47405e+08
+test_func: 0.0692714s
+overall: 0.273484s
 ```
 
 # Speedup
-matrix_randomize: 10000x
-matrix_transpose: 10000x
-matrix_multiply: 10000x
-matrix_RtAR: 10000x
+matrix_randomize: 50x
+matrix_transpose: 20x
+matrix_multiply: 6x
+matrix_RtAR: 4x
 
 > If you recorded several optimization approaches, you can compare them in a table
 
@@ -26,20 +113,16 @@ matrix_RtAR: 10000x
 
 How did you optimize each of the functions below? What was your approach, and which technique from the lectures did you use?
 > matrix_randomize
-
-Please answer.
+Swap the X/Y loop order so the YX-ordered array is traversed sequentially; this alone raises the cache hit rate.
 
 > matrix_transpose
-
-Please answer.
+tbb::parallel_for plus loop tiling (blocking).
 
 > matrix_multiply
-
-Please answer.
+Register blocking plus unrolling of the inner loop.
 
 > matrix_RtAR
-
-Please answer.
+Make the intermediate matrices static thread_local so their storage is reused across calls.
 
 # My innovations
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d76276..3cd661c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,8 +11,8 @@ add_executable(main main.cpp)
 find_package(OpenMP REQUIRED)
 target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)
 
-#find_package(TBB REQUIRED)
-#target_link_libraries(main PUBLIC TBB::tbb)
+find_package(TBB REQUIRED)
+target_link_libraries(main PUBLIC TBB::tbb)
 
 if (MSVC)
     target_compile_options(main PUBLIC /fp:fast /arch:AVX)
diff --git a/main.cpp b/main.cpp
index d5af053..1c21ae3 100644
--- a/main.cpp
+++ b/main.cpp
@@ -8,22 +8,29 @@
 // You can use either OpenMP or TBB for the parallelism
 
 #include
-//#include <x86intrin.h>  // all the _mm intrinsics come from this header
+#include <x86intrin.h>  // all the _mm intrinsics come from this header
 //#include <immintrin.h>  // if the one above does not work, try this one
 #include "ndarray.h"
 #include "wangsrng.h"
 #include "ticktock.h"
+#include <tbb/parallel_for.h>
+#include <tbb/blocked_range2d.h>
+#include
+#include
+
 // Matrix is a YX-ordered 2D float array: mat(x, y) = mat.data()[y * mat.shape(0) + x]
 using Matrix = ndarray<2, float>;
 // Note: aligned to 64 bytes by default; if you need 4096-byte alignment, use ndarray<2, float, AlignedAllocator<4096, float>>
-
+// using Matrix = ndarray<2, float, 0, 0, AlignedAllocator>;
 
 static void matrix_randomize(Matrix &out) {
     TICK(matrix_randomize);
     size_t nx = out.shape(0);
     size_t ny = out.shape(1);
     // Why is this loop inefficient? How can it be optimized? (10 points)
+    // Matrix is YX-ordered, but this loop walks the 2D array in XY order, so memory is not accessed sequentially and the cache is poorly used
+/*
 #pragma omp parallel for collapse(2)
     for (int x = 0; x < nx; x++) {
         for (int y = 0; y < ny; y++) {
@@ -31,9 +38,19 @@ static void matrix_randomize(Matrix &out) {
             out(x, y) = val;
         }
     }
+*/
+
+    for (int y = 0; y < ny; y++) {
+        for (int x = 0; x < nx; x++) {
+            float val = wangsrng(x, y).next_float();
+            out(x, y) = val;
+        }
+    }
     TOCK(matrix_randomize);
 }
+
+
 static void matrix_transpose(Matrix &out, Matrix const &in) {
     TICK(matrix_transpose);
     size_t nx = in.shape(0);
@@ -41,6 +58,10 @@ static void matrix_transpose(Matrix &out, Matrix const &in) {
     out.reshape(ny, nx);
     // Why is this loop inefficient? How can it be optimized? (15 points)
+    // Matrix is YX-ordered but this loop visits it in XY order: the writes to out(y, x) are sequential, yet the reads from in(x, y) still jump,
+    // striding over a whole row each step; spatial locality is broken and the cache stops hitting once it cannot hold that many rows, so use loop tiling
+    // and choose block_size so that block_size^2 stays below the cache capacity -- then every access inside a block can hit
+/*
 #pragma omp parallel for collapse(2)
     for (int x = 0; x < nx; x++) {
         for (int y = 0; y < ny; y++) {
             out(y, x) = in(x, y);
         }
     }
     TOCK(matrix_transpose);
 }
+*/
+
+    constexpr int block_size = 64;
+    tbb::parallel_for(tbb::blocked_range2d<size_t>(0, ny, block_size, 0, nx, block_size),
+    [&] (const tbb::blocked_range2d<size_t> &r) {
+        for (size_t y = r.cols().begin(); y