diff --git a/ANSWER.md b/ANSWER.md
index 83349d8..d6fd2ac 100644
--- a/ANSWER.md
+++ b/ANSWER.md
@@ -1,23 +1,241 @@
-# 改进前
+# 环境
+CPU(s):              48
+On-line CPU(s) list: 0-47
+Thread(s) per core:  2
+L1d cache:           32K
+L1i cache:           32K
+L2 cache:            256K
+L3 cache:            30720K
 
+
+# randomize
+## 改进前
 ```
-这里贴改进前的运行结果。
-matrix_randomize: 100s
+原始数据 size_t n = 32 * (rng.next_uint64() % 16 + 24)
+t=0: n=1120
+matrix_randomize: 0.00722261s
+matrix_randomize: 0.00122988s
+
+t=1: n=928
+matrix_randomize: 0.000710999s
+matrix_randomize: 0.000410166s
+
+t=2: n=1024
+matrix_randomize: 0.000646529s
+matrix_randomize: 0.000539234s
+
+t=3: n=1056
+matrix_randomize: 0.00059311s
+matrix_randomize: 0.000655608s
+
 ```
+(0.00722261 + 0.00122988 + 0.000710999 + 0.000410166 + 0.000646529 + 0.000539234 + 0.00059311 + 0.000655608 ) / 8 = 0.001501017
 
-# 改进后
 
 ```
-这里贴改进后的运行结果。
-matrix_randomize: 0.01s
+1G数据 size_t n = 32 * (rng.next_uint64() % 16 + 24) * 20;
+t=0: n=22400
+matrix_randomize: 1.37512s
+matrix_randomize: 1.2348s
+test_func: 4.62349s
+t=1: n=18560
+matrix_randomize: 0.762454s
+matrix_randomize: 0.709083s
+test_func: 2.82452s
+t=2: n=20480
+matrix_randomize: 1.22947s
+matrix_randomize: 1.39852s
+test_func: 4.28212s
+t=3: n=21120
+matrix_randomize: 1.04415s
+matrix_randomize: 1.06759s
+test_func: 3.82594s
+overall: 16.2411s
 ```
+(1.37512+1.2348+0.762454+0.709083+1.22947+1.39852+1.04415+1.06759)/8 = 1.1026483749999998
+
+## 改进后
+
+```
+原始数据 size_t n = 32 * (rng.next_uint64() % 16 + 24)
+t=0: n=1120
+matrix_randomize: 0.00737349s
+matrix_randomize: 0.000541591s
+
+t=1: n=928
+matrix_randomize: 0.000393746s
+matrix_randomize: 0.000375455s
+
+t=2: n=1024
+matrix_randomize: 0.000367141s
+matrix_randomize: 0.000707518s
+
+t=3: n=1056
+matrix_randomize: 0.000370469s
+matrix_randomize: 0.000512963s
+```
+(0.00737349+0.000541591+0.000393746+0.000375455+0.000367141+0.000707518+0.000370469+0.000512963) / 8 = 0.001330296625
+
+
+```
+1G 数据：size_t n = 32 * (rng.next_uint64() % 16 + 24) * 20;
+
+
+t=0: n=22400
+matrix_randomize: 0.0764535s
+matrix_randomize: 0.183382s
+test_func: 2.33063s
+t=1: n=18560
+matrix_randomize: 0.125141s
+matrix_randomize: 0.0510746s
+test_func: 1.58189s
+t=2: n=20480
+matrix_randomize: 0.0598267s
+matrix_randomize: 0.156077s
+test_func: 1.93101s
+t=3: n=21120
+matrix_randomize: 0.165283s
+matrix_randomize: 0.0625712s
+test_func: 2.09014s
+overall: 8.67846s
+```
+(0.0764535+ 0.183382+ 0.125141+0.0510746+0.0598267+ 0.156077+ 0.165283+0.0625712 ) / 8 = 0.10997612500000001
+
+## 加速比
+
+matrix_randomize: 10.026x ; 对矩阵大小进行了修改，n = 32 * (rng.next_uint64() % 16 + 24) * 20; 因为原来的n太小，三级cache就能把矩阵都放进去了，基本没有加速。
+
 
-# 加速比
 
-matrix_randomize: 10000x
-matrix_transpose: 10000x
-matrix_multiply: 10000x
-matrix_RtAR: 10000x
+
+
+
+# transpose
+## 改进前
+```
+t=0: n=44800
+matrix_randomize: 0.261456s
+matrix_randomize: 0.262549s
+matrix_transpose: 8.39193s
+matrix_transpose: 8.84256s
+matrix_transpose: 4.72458s
+matrix_transpose: 6.28754s
+matrix_transpose: 5.01172s
+```
+
+## 改进后
+```
+t=0: n=44800
+matrix_randomize: 0.264891s
+matrix_randomize: 0.587853s
+matrix_transpose: 4.6243s
+matrix_transpose: 1.51406s
+matrix_transpose: 1.48309s
+matrix_transpose: 2.88639s
+matrix_transpose: 2.11405s
+```
+
+
+## 加速比
+transpose: 2.3707x（按最后一次测量计算：5.01172 / 2.11405）
+
+
+
+# multiply
+## 改进前
+```
+t=0: n=1120
+matrix_randomize: 0.00686121s
+matrix_randomize: 0.0020274s
+matrix_transpose: 0.00631455s
+matrix_multiply: 0.510823s
+matrix_multiply: 0.479044s
+matrix_RtAR: 0.996241s
+matrix_trace: 1.8018e-05s
+1.75932e+08
+test_func: 1.01514s
+t=1: n=928
+matrix_randomize: 0.000342635s
+matrix_randomize: 0.00167575s
+matrix_transpose: 0.00546872s
+matrix_multiply: 0.244366s
+matrix_multiply: 0.244578s
+matrix_RtAR: 0.494883s
+matrix_trace: 6.2349e-05s
+1.00156e+08
+test_func: 0.502805s
+t=2: n=1024
+matrix_randomize: 0.00041904s
+matrix_randomize: 0.000321632s
+matrix_transpose: 0.0027552s
+matrix_multiply: 0.351989s
+matrix_multiply: 0.352222s
+matrix_RtAR: 0.707403s
+matrix_trace: 0.000112475s
+1.34324e+08
+test_func: 0.714231s
+t=3: n=1056
+matrix_randomize: 0.000361293s
+matrix_randomize: 0.000411746s
+matrix_transpose: 0.00290072s
+matrix_multiply: 0.39902s
+matrix_multiply: 0.396405s
+matrix_RtAR: 0.798926s
+matrix_trace: 0.00153154s
+1.47405e+08
+test_func: 0.806829s
+overall: 3.04113s
+```
+(0.510823+0.479044+0.244366+0.244578+0.351989+0.352222+0.39902+0.396405)/8=0.372305875
+## 改进后
+```
+t=0: n=1120
+matrix_randomize: 0.00837036s
+matrix_randomize: 0.000462134s
+matrix_transpose: 0.0056561s
+matrix_multiply: 0.0813393s
+matrix_multiply: 0.0748275s
+matrix_RtAR: 0.161888s
+matrix_trace: 1.9308e-05s
+1.76466e+08
+test_func: 0.180869s
+t=1: n=928
+matrix_randomize: 0.000277639s
+matrix_randomize: 0.00205225s
+matrix_transpose: 0.00221065s
+matrix_multiply: 0.0436374s
+matrix_multiply: 0.0386557s
+matrix_RtAR: 0.084553s
+matrix_trace: 0.00322023s
+1.00585e+08
+test_func: 0.094229s
+t=2: n=1024
+matrix_randomize: 0.00182822s
+matrix_randomize: 0.000406697s
+matrix_transpose: 0.0028676s
+matrix_multiply: 0.0494244s
+matrix_multiply: 0.0495718s
+matrix_RtAR: 0.102309s
+matrix_trace: 6.6259e-05s
+1.34691e+08
+test_func: 0.110543s
+t=3: n=1056
+matrix_randomize: 0.000332161s
+matrix_randomize: 0.000432429s
+matrix_transpose: 0.00303381s
+matrix_multiply: 0.0559908s
+matrix_multiply: 0.0543113s
+matrix_RtAR: 0.113629s
+matrix_trace: 6.0144e-05s
+1.47779e+08
+test_func: 0.120749s
+overall: 0.508416s
+```
+(0.0813393+0.0748275+0.0436374+0.0386557+0.0494244+0.0495718+0.0559908+0.0543113)/8=0.055969775
+
+## 加速比
+multiply: 6.65x
+
 
 > 如果记录了多种优化方法，可以做表格比较
 
@@ -27,19 +245,20 @@ matrix_RtAR: 10000x
 
 > matrix_randomize
 
-请回答。
+使用_mm_stream_ps直写。
 
 > matrix_transpose
 
-请回答。
+使用tbb的simple_partitioner，其内置的morton能够充分利用cache。
 
 > matrix_multiply
 
-请回答。
+使用分块的思想。只对x进行了分块足矣。
 
 > matrix_RtAR
 
-请回答。
+Rt 和 RtA 改为static变量，预先分配好空间。
+
 
 # 我的创新点
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d76276..8e6a07b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,9 @@ cmake_minimum_required(VERSION 3.12)
 project(hellocmake LANGUAGES CXX)
 
 set(CMAKE_CXX_STANDARD 17)
+
 #if (NOT CMAKE_BUILD_TYPE)
+#set(CMAKE_BUILD_TYPE Debug)
 set(CMAKE_BUILD_TYPE Release)
 #endif()
 
@@ -11,8 +13,8 @@ add_executable(main main.cpp)
 find_package(OpenMP REQUIRED)
 target_link_libraries(main PUBLIC OpenMP::OpenMP_CXX)
 
-#find_package(TBB REQUIRED)
-#target_link_libraries(main PUBLIC TBB::tbb)
+find_package(TBB REQUIRED)
+target_link_libraries(main PUBLIC TBB::tbb)
 
 if (MSVC)
     target_compile_options(main PUBLIC /fp:fast /arch:AVX)
diff --git a/main.cpp b/main.cpp
index d5af053..b413e7e 100644
--- a/main.cpp
+++ b/main.cpp
@@ -7,15 +7,23 @@
 // 作业中有很多个问句，请通过注释回答问题，并改进其代码，以使其更快
 // 并行可以用 OpenMP 也可以用 TBB
 
+#include <cstddef>
 #include <iostream>
 //#include <x86intrin.h>  // _mm 系列指令都来自这个头文件
-//#include <xmmintrin.h>  // 如果上面那个不行，试试这个
+#include <oneapi/tbb/blocked_range.h>
+#include <oneapi/tbb/blocked_range2d.h>
+#include <oneapi/tbb/parallel_for.h>
+#include <oneapi/tbb/partitioner.h>
+#include <xmmintrin.h>  // 如果上面那个不行，试试这个
+#include <tbb/parallel_for.h>
+#include <tbb/blocked_range2d.h>
 #include "ndarray.h"
 #include "wangsrng.h"
 #include "ticktock.h"
 
 // Matrix 是 YX 序的二维浮点数组：mat(x, y) = mat.data()[y * mat.shape(0) + x]
-using Matrix = ndarray<2, float>;
+// using Matrix = ndarray<2, float>;
+using Matrix = ndarray<2, float, 0, 0, AlignedAllocator<float, 4096> >;
 // 注意：默认对齐到 64 字节，如需 4096 字节，请用 ndarray<2, float, AlignedAllocator<4096, float>>
 
 static void matrix_randomize(Matrix &out) {
@@ -24,13 +32,41 @@ static void matrix_randomize(Matrix &out) {
     size_t ny = out.shape(1);
 
     // 这个循环为什么不够高效？如何优化？ 10 分
-#pragma omp parallel for collapse(2)
-    for (int x = 0; x < nx; x++) {
-        for (int y = 0; y < ny; y++) {
-            float val = wangsrng(x, y).next_float();
-            out(x, y) = val;
+    // ans: the matrix is contiguous along x, but the inner loop ran over y, so accesses were strided and cache-unfriendly;
+    // the writes can additionally use a direct-write (streaming store) strategy.
+
+// #pragma omp parallel for collapse(2)
+//     for (int x = 0; x < nx; x++) {
+//         for (int y = 0; y < ny; y++) {
+//             float val = wangsrng(x, y).next_float();
+//             out(x, y) = val;
+//         }
+//     }
+
+    #pragma omp parallel for collapse(2)
+    for (int y = 0; y < ny; y++) {  // y is the outer loop so the inner loop walks x, the contiguous axis (see the Matrix layout comment)
+        for (int x = 0; x < nx; x+=4) {  // four floats per iteration; assumes nx is a multiple of 4
+            // float t1[16];
+            // #pragma omp simd
+            // for(int offset = 0; offset < 16; offset++){
+            //     t1[offset] = wangsrng(x+offset, y).next_float();
+            // }
+            // for(int offset=0; offset < 16; offset+=4){
+            //     _mm_stream_si32((int *)&out(x+offset,y), *(int *)&t1[offset]);
+            // }
+
+
+            
+            __m128 tmp = {wangsrng(x, y).next_float(), wangsrng(x+1, y).next_float(), wangsrng(x+2, y).next_float(), wangsrng(x+3, y).next_float()};
+            _mm_stream_ps(&out(x,y), tmp);  // non-temporal store of 4 floats; the address must be 16-byte aligned
+
+            // float val = wangsrng(x, y).next_float();
+            // _mm_stream_si32((int *)&out(x,y), *(int *)&val);
+            // out(x, y) = val;
         }
     }
+
+
     TOCK(matrix_randomize);
 }
 
@@ -41,12 +77,29 @@ static void matrix_transpose(Matrix &out, Matrix const &in) {
     out.reshape(ny, nx);
 
     // 这个循环为什么不够高效？如何优化？ 15 分
-#pragma omp parallel for collapse(2)
-    for (int x = 0; x < nx; x++) {
-        for (int y = 0; y < ny; y++) {
-            out(y, x) = in(x, y);
-        }
-    }
+    // ans: out is accessed contiguously but in is accessed with a stride, so the working set does not fit in cache; use a blocked (tiled) transpose instead.
+
+
+// #pragma omp parallel for collapse(2)
+//     for (int x = 0; x < nx; x++) {
+//         for (int y = 0; y < ny; y++) {
+//             out(y, x) = in(x, y);
+//         }
+//     }
+
+    constexpr int block_size = 64;  // grain size used for both dimensions of the 2D range
+    tbb::parallel_for(tbb::blocked_range2d<size_t>(0,ny, block_size, 0, nx, block_size),  // rows span [0, ny), cols span [0, nx)
+        [&](const tbb::blocked_range2d<size_t> &r){
+            for(size_t y=r.cols().begin(); y<r.cols().end(); y++){  // note: here y walks the cols range, i.e. [0, nx)
+                for(size_t x=r.rows().begin(); x<r.rows().end(); x++){  // and x walks the rows range, i.e. [0, ny)
+                    out(x,y) = in(y,x);  // x indexes out's first axis (size ny after reshape), y its second (size nx)
+                }
+            }
+        },
+        tbb::simple_partitioner{}  // keeps splitting the range until each tile is no larger than the grain size
+        );
+
+
     TOCK(matrix_transpose);
 }
 
@@ -62,15 +115,29 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
     out.reshape(nx, ny);
 
     // 这个循环为什么不够高效？如何优化？ 15 分
-#pragma omp parallel for collapse(2)
-    for (int y = 0; y < ny; y++) {
-        for (int x = 0; x < nx; x++) {
-            out(x, y) = 0;  // 有没有必要手动初始化？ 5 分
-            for (int t = 0; t < nt; t++) {
-                out(x, y) += lhs(x, t) * rhs(t, y);
+    // ans: lhs is accessed with a stride, rhs contiguously, and out stays fixed, which prevents vectorization.
+
+// #pragma omp parallel for collapse(2)
+//     for (int y = 0; y < ny; y++) {
+//         for (int x = 0; x < nx; x++) {
+//             out(x, y) = 0;  // is manual initialization necessary? 5 points
+//             for (int t = 0; t < nt; t++) {
+//                 out(x, y) += lhs(x, t) * rhs(t, y);
+//             }
+//         }
+//     }
+
+    #pragma omp parallel for collapse(2)
+    for(int j=0; j<ny; j++){  // NOTE(review): relies on out being all zeros after reshape() - confirm in ndarray.h
+        for(int i=0; i<nx; i+=32){  // block of 32 consecutive x values; assumes nx is a multiple of 32
+            for(int t=0; t<nt; t++){
+                for(int i_block=i; i_block<i+32; i_block++){
+                    out(i_block,j) += lhs(i_block, t) *  rhs(t, j);  // fix: accumulate into each element of the block, not only out(i,j)
+                }
             }
         }
     }
+
     TOCK(matrix_multiply);
 }
 
@@ -78,7 +145,9 @@ static void matrix_multiply(Matrix &out, Matrix const &lhs, Matrix const &rhs) {
 static void matrix_RtAR(Matrix &RtAR, Matrix const &R, Matrix const &A) {
     TICK(matrix_RtAR);
     // 这两个是临时变量，有什么可以优化的？ 5 分
-    Matrix Rt, RtA;
+    // ans: make them static variables with space allocated up front. NOTE(review): function-local statics are shared by every call, so concurrent calls would race on them.
+    static Matrix Rt(std::array<std::size_t, 2>{1024, 1024}), RtA(std::array<std::size_t, 2>{1024, 1024});
+    
     matrix_transpose(Rt, R);
     matrix_multiply(RtA, Rt, A);
     matrix_multiply(RtAR, RtA, R);
@@ -106,6 +175,7 @@ static void test_func(size_t n) {
 
     Matrix RtAR;
     matrix_RtAR(RtAR, R, A);
+    
 
     std::cout << matrix_trace(RtAR) << std::endl;
     TOCK(test_func);
@@ -116,6 +186,7 @@ int main() {
     TICK(overall);
     for (int t = 0; t < 4; t++) {
         size_t n = 32 * (rng.next_uint64() % 16 + 24);
+        // size_t n = 1<<13;
         std::cout << "t=" << t << ": n=" << n << std::endl;
         test_func(n);
     }