diff --git a/ANSWER.md b/ANSWER.md index 83349d8..d6fd2ac 100644 --- a/ANSWER.md +++ b/ANSWER.md @@ -1,23 +1,241 @@ -# 改进前 +# 环境 +CPU(s): 48 +On-line CPU(s) list: 0-47 +Thread(s) per core: 2 +L1d cache: 32K +L1i cache: 32K +L2 cache: 256K +L3 cache: 30720K + +# randomize +## 改进前 ``` -这里贴改进前的运行结果。 -matrix_randomize: 100s +原始数据 size_t n = 32 * (rng.next_uint64() % 16 + 24) +t=0: n=1120 +matrix_randomize: 0.00722261s +matrix_randomize: 0.00122988s + +t=1: n=928 +matrix_randomize: 0.000710999s +matrix_randomize: 0.000410166s + +t=2: n=1024 +matrix_randomize: 0.000646529s +matrix_randomize: 0.000539234s + +t=3: n=1056 +matrix_randomize: 0.00059311s +matrix_randomize: 0.000655608s + ``` +(0.00722261 + 0.00122988 + 0.000710999 + 0.000410166 + 0.000646529 + 0.000539234 + 0.00059311 + 0.000655608 ) / 8 = 0.001501017 -# 改进后 ``` -这里贴改进后的运行结果。 -matrix_randomize: 0.01s +1G数据 size_t n = 32 * (rng.next_uint64() % 16 + 24) * 20; +t=0: n=22400 +matrix_randomize: 1.37512s +matrix_randomize: 1.2348s +test_func: 4.62349s +t=1: n=18560 +matrix_randomize: 0.762454s +matrix_randomize: 0.709083s +test_func: 2.82452s +t=2: n=20480 +matrix_randomize: 1.22947s +matrix_randomize: 1.39852s +test_func: 4.28212s +t=3: n=21120 +matrix_randomize: 1.04415s +matrix_randomize: 1.06759s +test_func: 3.82594s +overall: 16.2411s ``` +(1.37512+1.2348+0.762454+0.709083+1.22947+1.39852+1.04415+1.06759)/8 = 1.1026483749999998 + +## 改进后 + +``` +原始数据 size_t n = 32 * (rng.next_uint64() % 16 + 24) +t=0: n=1120 +matrix_randomize: 0.00737349s +matrix_randomize: 0.000541591s + +t=1: n=928 +matrix_randomize: 0.000393746s +matrix_randomize: 0.000375455s + +t=2: n=1024 +matrix_randomize: 0.000367141s +matrix_randomize: 0.000707518s + +t=3: n=1056 +matrix_randomize: 0.000370469s +matrix_randomize: 0.000512963s +``` +(0.00737349+0.000541591+0.000393746+0.000375455+0.000367141+0.000707518+0.000370469+0.000512963) / 8 = 0.001330296625 + + +``` +1G 数据:size_t n = 32 * (rng.next_uint64() % 16 + 24) * 20; + + +t=0: n=22400 
+matrix_randomize: 0.0764535s +matrix_randomize: 0.183382s +test_func: 2.33063s +t=1: n=18560 +matrix_randomize: 0.125141s +matrix_randomize: 0.0510746s +test_func: 1.58189s +t=2: n=20480 +matrix_randomize: 0.0598267s +matrix_randomize: 0.156077s +test_func: 1.93101s +t=3: n=21120 +matrix_randomize: 0.165283s +matrix_randomize: 0.0625712s +test_func: 2.09014s +overall: 8.67846s +``` +(0.0764535+ 0.183382+ 0.125141+0.0510746+0.0598267+ 0.156077+ 0.165283+0.0625712 ) / 8 = 0.10997612500000001 + +## 加速比 + +matrix_randomize: 10.026x ; 对矩阵大小进行了修改,n = 32 * (rng.next_uint64() % 16 + 24) * 20; 因为原来的n太小,三级cache就能把矩阵都放进去了,基本没有加速。 + -# 加速比 -matrix_randomize: 10000x -matrix_transpose: 10000x -matrix_multiply: 10000x -matrix_RtAR: 10000x + + + +# transpose +## 改进前 +``` +t=0: n=44800 +matrix_randomize: 0.261456s +matrix_randomize: 0.262549s +matrix_transpose: 8.39193s +matrix_transpose: 8.84256s +matrix_transpose: 4.72458s +matrix_transpose: 6.28754s +matrix_transpose: 5.01172s +``` + +## 改进后 +``` +t=0: n=44800 +matrix_randomize: 0.264891s +matrix_randomize: 0.587853s +matrix_transpose: 4.6243s +matrix_transpose: 1.51406s +matrix_transpose: 1.48309s +matrix_transpose: 2.88639s +matrix_transpose: 2.11405s +``` + + +## 加速比 +transpose: 2.3707x + + + +# multiply +## 改进前 +``` +t=0: n=1120 +matrix_randomize: 0.00686121s +matrix_randomize: 0.0020274s +matrix_transpose: 0.00631455s +matrix_multiply: 0.510823s +matrix_multiply: 0.479044s +matrix_RtAR: 0.996241s +matrix_trace: 1.8018e-05s +1.75932e+08 +test_func: 1.01514s +t=1: n=928 +matrix_randomize: 0.000342635s +matrix_randomize: 0.00167575s +matrix_transpose: 0.00546872s +matrix_multiply: 0.244366s +matrix_multiply: 0.244578s +matrix_RtAR: 0.494883s +matrix_trace: 6.2349e-05s +1.00156e+08 +test_func: 0.502805s +t=2: n=1024 +matrix_randomize: 0.00041904s +matrix_randomize: 0.000321632s +matrix_transpose: 0.0027552s +matrix_multiply: 0.351989s +matrix_multiply: 0.352222s +matrix_RtAR: 0.707403s +matrix_trace: 0.000112475s +1.34324e+08 
+test_func: 0.714231s +t=3: n=1056 +matrix_randomize: 0.000361293s +matrix_randomize: 0.000411746s +matrix_transpose: 0.00290072s +matrix_multiply: 0.39902s +matrix_multiply: 0.396405s +matrix_RtAR: 0.798926s +matrix_trace: 0.00153154s +1.47405e+08 +test_func: 0.806829s +overall: 3.04113s +``` +(0.510823+0.479044+0.244366+0.244578+0.351989+0.352222+0.39902+0.396405)/8=0.372305875 +## 改进后 +``` +t=0: n=1120 +matrix_randomize: 0.00837036s +matrix_randomize: 0.000462134s +matrix_transpose: 0.0056561s +matrix_multiply: 0.0813393s +matrix_multiply: 0.0748275s +matrix_RtAR: 0.161888s +matrix_trace: 1.9308e-05s +1.76466e+08 +test_func: 0.180869s +t=1: n=928 +matrix_randomize: 0.000277639s +matrix_randomize: 0.00205225s +matrix_transpose: 0.00221065s +matrix_multiply: 0.0436374s +matrix_multiply: 0.0386557s +matrix_RtAR: 0.084553s +matrix_trace: 0.00322023s +1.00585e+08 +test_func: 0.094229s +t=2: n=1024 +matrix_randomize: 0.00182822s +matrix_randomize: 0.000406697s +matrix_transpose: 0.0028676s +matrix_multiply: 0.0494244s +matrix_multiply: 0.0495718s +matrix_RtAR: 0.102309s +matrix_trace: 6.6259e-05s +1.34691e+08 +test_func: 0.110543s +t=3: n=1056 +matrix_randomize: 0.000332161s +matrix_randomize: 0.000432429s +matrix_transpose: 0.00303381s +matrix_multiply: 0.0559908s +matrix_multiply: 0.0543113s +matrix_RtAR: 0.113629s +matrix_trace: 6.0144e-05s +1.47779e+08 +test_func: 0.120749s +overall: 0.508416s +``` +(0.0813393+0.0748275+0.0436374+0.0386557+0.0494244+0.0495718+0.0559908+0.0543113)/8=0.055969775 + +## 加速比 +multiply: 6.65x + > 如果记录了多种优化方法,可以做表格比较 @@ -27,19 +245,20 @@ matrix_RtAR: 10000x > matrix_randomize -请回答。 +使用 `_mm_stream_ps` 直写(绕过 cache 直接写入内存),并把内层循环改为沿 x 方向连续访问。 > matrix_transpose -请回答。 +使用 tbb 的 simple_partitioner 配合 blocked_range2d 做分块转置,其内置的 morton 序遍历能够充分利用 cache。 > matrix_multiply -请回答。 +使用分块的思想:只对 x 方向进行分块就足够了。 > matrix_RtAR -请回答。 +把 Rt 和 RtA 改为 static 变量,预先分配好空间,避免每次调用时重新分配。 + # 我的创新点 diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d76276..8e6a07b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,9 @@ cmake_minimum_required(VERSION 
3.12) project(hellocmake LANGUAGES CXX) set(CMAKE_CXX_STANDARD 17) + #if (NOT CMAKE_BUILD_TYPE) +#set(CMAKE_BUILD_TYPE Debug) set(CMAKE_BUILD_TYPE Release) #endif() @@ -11,8 +13,8 @@ add_executable(main main.cpp) find_package(OpenMP REQUIRED) target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX) -#find_package(TBB REQUIRED) -#target_link_libraries(main PUBLIC TBB::tbb) +find_package(TBB REQUIRED) +target_link_libraries(main PUBLIC TBB::tbb) if (MSVC) target_compile_options(main PUBLIC /fp:fast /arch:AVX) diff --git a/main.cpp b/main.cpp index d5af053..b413e7e 100644 --- a/main.cpp +++ b/main.cpp @@ -7,15 +7,23 @@ // 作业中有很多个问句,请通过注释回答问题,并改进其代码,以使其更快 // 并行可以用 OpenMP 也可以用 TBB +#include #include //#include // _mm 系列指令都来自这个头文件 -//#include // 如果上面那个不行,试试这个 +#include +#include +#include +#include +#include // 如果上面那个不行,试试这个 +#include +#include #include "ndarray.h" #include "wangsrng.h" #include "ticktock.h" // Matrix 是 YX 序的二维浮点数组:mat(x, y) = mat.data()[y * mat.shape(0) + x] -using Matrix = ndarray<2, float>; +// using Matrix = ndarray<2, float>; +using Matrix = ndarray<2, float, 0, 0, AlignedAllocator >; // 注意:默认对齐到 64 字节,如需 4096 字节,请用 ndarray<2, float, AlignedAllocator<4096, float>> static void matrix_randomize(Matrix &out) { @@ -24,13 +32,41 @@ static void matrix_randomize(Matrix &out) { size_t ny = out.shape(1); // 这个循环为什么不够高效?如何优化? 
10 分 -#pragma omp parallel for collapse(2) - for (int x = 0; x < nx; x++) { - for (int y = 0; y < ny; y++) { - float val = wangsrng(x, y).next_float(); - out(x, y) = val; + // ans: 矩阵的x轴是紧密排列的,但是循环的内循环是y,访问数据时会跳跃,不利于cache; + // 写入数据可以采用直写的策略。 + +// #pragma omp parallel for collapse(2) +// for (int x = 0; x < nx; x++) { +// for (int y = 0; y < ny; y++) { +// float val = wangsrng(x, y).next_float(); +// out(x, y) = val; +// } +// } + + #pragma omp parallel for collapse(2) + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x+=4) { + // float t1[16]; + // #pragma omp simd + // for(int offset = 0; offset < 16; offset++){ + // t1[offset] = wangsrng(x+offset, y).next_float(); + // } + // for(int offset=0; offset < 16; offset+=4){ + // _mm_stream_si32((int *)&out(x+offset,y), *(int *)&t1[offset]); + // } + + + + __m128 tmp = {wangsrng(x, y).next_float(), wangsrng(x+1, y).next_float(), wangsrng(x+2, y).next_float(), wangsrng(x+3, y).next_float()}; + _mm_stream_ps(&out(x,y), tmp); + + // float val = wangsrng(x, y).next_float(); + // _mm_stream_si32((int *)&out(x,y), *(int *)&val); + // out(x, y) = val; } } + + TOCK(matrix_randomize); } @@ -41,12 +77,29 @@ static void matrix_transpose(Matrix &out, Matrix const &in) { out.reshape(ny, nx); // 这个循环为什么不够高效?如何优化? 
15 分 -#pragma omp parallel for collapse(2) - for (int x = 0; x < nx; x++) { - for (int y = 0; y < ny; y++) { - out(y, x) = in(x, y); - } - } + // ans: 因为out矩阵是紧密访问,但是in矩阵是跳跃访问,cache中放不下。应改为分块转置。 + + +// #pragma omp parallel for collapse(2) +// for (int x = 0; x < nx; x++) { +// for (int y = 0; y < ny; y++) { +// out(y, x) = in(x, y); +// } +// } + + constexpr int block_size = 64; + tbb::parallel_for(tbb::blocked_range2d(0,ny, block_size, 0, nx, block_size), + [&](const tbb::blocked_range2d &r){ + for(size_t y=r.cols().begin(); y{1024, 1024}), RtA(std::array{1024, 1024}); + matrix_transpose(Rt, R); matrix_multiply(RtA, Rt, A); matrix_multiply(RtAR, RtA, R); @@ -106,6 +175,7 @@ static void test_func(size_t n) { Matrix RtAR; matrix_RtAR(RtAR, R, A); + std::cout << matrix_trace(RtAR) << std::endl; TOCK(test_func); @@ -116,6 +186,7 @@ int main() { TICK(overall); for (int t = 0; t < 4; t++) { size_t n = 32 * (rng.next_uint64() % 16 + 24); + // size_t n = 1<<13; std::cout << "t=" << t << ": n=" << n << std::endl; test_func(n); }