diff --git a/.gitignore b/.gitignore index 9ed92a0..488f741 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ build GNUmakefile +bench.txt \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 29b152c..6c85c06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,3 +7,4 @@ if (NOT CMAKE_BUILD_TYPE) endif() add_executable(main main.cpp) +target_compile_options(main PUBLIC -ffast-math -march=native) diff --git a/main.cpp b/main.cpp index cf6369b..923249b 100644 --- a/main.cpp +++ b/main.cpp @@ -1,73 +1,121 @@ +#include +#include #include #include +#include #include -#include -#include -float frand() { - return (float)rand() / RAND_MAX * 2 - 1; +#define VECTOR_SIZE 48 + +/* static float frand() { + float INV_RAND_MAX = 1.0 / RAND_MAX; + return (float)rand() * (INV_RAND_MAX * 2) - 1; +} */ +static std::tuple frand7() { + float INV_RAND_MAX = 1.0 / RAND_MAX; + float a = (float)rand() * (INV_RAND_MAX * 2) - 1; + float b = (float)rand() * (INV_RAND_MAX * 2) - 1; + float c = (float)rand() * (INV_RAND_MAX * 2) - 1; + float d = (float)rand() * (INV_RAND_MAX * 2) - 1; + float e = (float)rand() * (INV_RAND_MAX * 2) - 1; + float f = (float)rand() * (INV_RAND_MAX * 2) - 1; + float g = (float)rand() * (INV_RAND_MAX * 2) - 1; + return std::make_tuple(a, b, c, d, e, f, g); } -struct Star { - float px, py, pz; - float vx, vy, vz; - float mass; +struct alignas(32) Star { + // float px, py, pz; + // float vx, vy, vz; + // float mass; + float px[VECTOR_SIZE]; + float py[VECTOR_SIZE]; + float pz[VECTOR_SIZE]; + float vx[VECTOR_SIZE]; + float vy[VECTOR_SIZE]; + float vz[VECTOR_SIZE]; + float mass[VECTOR_SIZE]; }; -std::vector stars; +// std::vector stars; +Star stars; -void init() { - for (int i = 0; i < 48; i++) { - stars.push_back({ - frand(), frand(), frand(), - frand(), frand(), frand(), - frand() + 1, - }); +static void init() { +#pragma GCC unroll 8 + for (size_t i = 0; i < VECTOR_SIZE; i++) { + // stars.push_back({ + // frand(), frand(), frand(), + // frand(), frand(), frand(), + // frand() + 1, + // }); + // stars.px.emplace_back(frand()); + // stars.py.emplace_back(frand()); + // stars.pz.emplace_back(frand()); + // stars.vx.emplace_back(frand()); + // stars.vy.emplace_back(frand()); + // stars.vz.emplace_back(frand()); + // stars.mass.emplace_back(frand() + 1); + auto [a, b, c, d, e, f, g] = frand7(); + stars.px[i] = a; + stars.py[i] = b; + stars.pz[i] = c; + stars.vx[i] = d; + stars.vy[i] = e; + stars.vz[i] = f; + stars.mass[i] = g + 1; } } -float G = 0.001; -float eps = 0.001; -float dt = 0.01; +constexpr float G = 0.001; +constexpr float eps = 0.001; +constexpr float dt = 0.01; +constexpr float eps_times_eps = eps * eps; // eps * eps -void step() { - for (auto &star: stars) { - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; - float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - d2 *= sqrt(d2); - star.vx += dx * other.mass * G * dt / d2; - star.vy += dy * other.mass * G * dt / d2; - star.vz += dz * other.mass * G * dt / d2; +static void step() { +#pragma GCC ivdep + for (int i = 0; i < VECTOR_SIZE; ++i) { + for (int j = 0; j < VECTOR_SIZE; ++j) { + if (i == j) + continue; + float dx = stars.px[j] - stars.px[i]; + float dy = stars.py[j] - stars.py[i]; + float dz = stars.pz[j] - stars.pz[i]; + float d2 = dx * dx + dy * dy + dz * dz + eps_times_eps; + d2 *= std::sqrt(d2); + float inv_d2 = 1.0 / d2; + float tmp = stars.mass[j] * (G * dt) * inv_d2; + stars.vx[i] += dx * tmp; + stars.vy[i] += dy * tmp; + stars.vz[i] += dz * tmp; } } - for (auto &star: stars) { - star.px += star.vx * dt; - star.py += star.vy * dt; - star.pz += star.vz * dt; +#pragma GCC unroll 16 + for (int i = 0; i < VECTOR_SIZE; ++i) { + stars.px[i] += stars.vx[i] * dt; + stars.py[i] += stars.vy[i] * dt; + stars.pz[i] += stars.vz[i] * dt; } } -float calc() { +static float calc() { float energy = 0; - for (auto &star: stars) { - float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz; - energy += star.mass * v2 / 2; - for (auto &other: stars) { - float dx = other.px - star.px; - float dy = other.py - star.py; - float dz = other.pz - star.pz; - float d2 = dx * dx + dy * dy + dz * dz + eps * eps; - energy -= other.mass * star.mass * G / sqrt(d2) / 2; +#pragma GCC ivdep + for (int i = 0; i < VECTOR_SIZE; ++i) { + float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] * stars.vz[i]; + energy += stars.mass[i] * v2 / 2; + for (int j = 0; j < VECTOR_SIZE; ++j) { + float dx = stars.px[j] - stars.px[i]; + float dy = stars.py[j] - stars.py[i]; + float dz = stars.pz[j] - stars.pz[i]; + float d2 = dx * dx + dy * dy + dz * dz + eps_times_eps; + float inv_sqrt_d2 = 1 / std::sqrt(d2); + energy -= stars.mass[j] * stars.mass[i] * (G * 0.5f) * inv_sqrt_d2; } } return energy; } template -long benchmark(Func const &func) { +static long benchmark(Func const& func) { auto t0 = std::chrono::steady_clock::now(); func(); auto t1 = std::chrono::steady_clock::now(); @@ -79,7 +127,7 @@ int main() { init(); printf("Initial energy: %f\n", calc()); auto dt = benchmark([&] { - for (int i = 0; i < 100000; i++) + for (size_t i = 0; i < 100000; i++) step(); }); printf("Final energy: %f\n", calc());