diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..4d2541f
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,6 @@
+{
+    "files.associations": {
+        "chrono": "cpp",
+        "cmath": "cpp"
+    }
+}
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 29b152c..0647d87 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,8 +2,16 @@ cmake_minimum_required(VERSION 3.12)
 project(hellocmake LANGUAGES CXX)
 
 set(CMAKE_CXX_STANDARD 17)
+# Build type defaults to Release via the guard below; pass -DCMAKE_BUILD_TYPE=Debug to override.
+set(CMAKE_CXX_FLAGS_DEBUG "-g")
+set(CMAKE_CXX_FLAGS_RELEASE "-O3")
+
+# target_compile_options(testbench PUBLIC -ffast-math -march=native)
+# find_package(OpenMP REQUIRED)
+# target_link_libraries(testbench PUBLIC OpenMP::OpenMP_CXX)
+
 if (NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release)
 endif()
 
 add_executable(main main.cpp)
diff --git a/README.md b/README.md
index 18caa7d..1ea699a 100644
--- a/README.md
+++ b/README.md
@@ -29,3 +29,47 @@
 - 不允许做算法复杂度优化
 - 可以针对编译器和平台优化,这次不要求跨平台
 - 可以用 xmmintrin.h,如果你觉得编译器靠不住的话
+
+初始数据:
+
+Initial energy: -8.571526
+Final energy: -8.511777
+Time elapsed: 6646 ms
+
+编译指令加入O3优化:
+
+Initial energy: -8.571526
+Final energy: -8.511777
+Time elapsed: 1737 ms
+
+将结构体OOP改成DOP
+
+Initial energy: -8.571526
+Final energy: -8.511777
+Time elapsed: 1734 ms
+
+加入编译指令
+
+```
+#pragma GCC ivdep
+#pragma GCC unroll 4
+```
+
+Initial energy: -8.571302
+Final energy: -8.511518
+Time elapsed: 1587 ms
+
+加上暴力火车头:
+
+Initial energy: -8.571527
+Final energy: -8.511723
+Time elapsed: 1175 ms
+
+
+加入编译指令:
+
+`-ffast-math -march=native`
+
+Initial energy: -8.571527
+Final energy: -8.511747
+Time elapsed: 210 ms
diff --git a/initial.cpp b/initial.cpp
new file mode 100644
index 0000000..3298423
--- /dev/null
+++ b/initial.cpp
@@ -0,0 +1,88 @@
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+#include <chrono>
+#include <cmath>
+
+float frand() {
+    return (float)rand() / RAND_MAX * 2 - 1;
+}
+
+struct Star {
+    float px, py, pz;
+    float vx, vy, vz;
+    float mass;
+};
+
+std::vector<Star> stars;
+
+void init() {
+    for (int i = 0; i < 48; i++) {
+        stars.push_back({
+            frand(), frand(), frand(),
+            frand(), frand(), frand(),
+            frand() + 1,
+        });
+    }
+}
+
+float G = 0.001;
+float eps = 0.001;
+float dt = 0.01;
+
+void step() {
+    for (auto &star: stars) {
+        for (auto &other: stars) {
+            float dx = other.px - star.px;
+            float dy = other.py - star.py;
+            float dz = other.pz - star.pz;
+            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
+            d2 *= sqrt(d2);
+            star.vx += dx * other.mass * G * dt / d2;
+            star.vy += dy * other.mass * G * dt / d2;
+            star.vz += dz * other.mass * G * dt / d2;
+        }
+    }
+    for (auto &star: stars) {
+        star.px += star.vx * dt;
+        star.py += star.vy * dt;
+        star.pz += star.vz * dt;
+    }
+}
+
+float calc() {
+    float energy = 0;
+    for (auto &star: stars) {
+        float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz;
+        energy += star.mass * v2 / 2;
+        for (auto &other: stars) {
+            float dx = other.px - star.px;
+            float dy = other.py - star.py;
+            float dz = other.pz - star.pz;
+            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
+            energy -= other.mass * star.mass * G / sqrt(d2) / 2;
+        }
+    }
+    return energy;
+}
+
+template <class Func>
+long benchmark(Func const &func) {
+    auto t0 = std::chrono::steady_clock::now();
+    func();
+    auto t1 = std::chrono::steady_clock::now();
+    auto dt = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0);
+    return dt.count();
+}
+
+int main() {
+    init();
+    printf("Initial energy: %f\n", calc());
+    auto dt = benchmark([&] {
+        for (int i = 0; i < 100000; i++)
+            step();
+    });
+    printf("Final energy: %f\n", calc());
+    printf("Time elapsed: %ld ms\n", dt);
+    return 0;
+}
\ No newline at end of file
diff --git a/initial.exe b/initial.exe
new file mode 100644
index 0000000..b726876
Binary files /dev/null and b/initial.exe differ
diff --git a/main.cpp b/main.cpp
index cf6369b..fcc9d3e 100644
--- a/main.cpp
+++ b/main.cpp
@@ -3,64 +3,115 @@
 #include <vector>
 #include <chrono>
 #include <cmath>
+#include <cstdint> // NOTE(review): header name was lost in extraction; <cstdint> is an assumption - confirm
+#include <mmintrin.h> //mmx
+#include <xmmintrin.h> //sse
+#include <emmintrin.h> //sse2
+#include <pmmintrin.h> //sse3
+
+#pragma GCC target("avx")
+#pragma GCC optimize(3)
+#pragma GCC optimize("Ofast")
+#pragma GCC optimize("inline")
+#pragma GCC optimize("-fgcse")
+#pragma GCC optimize("-fgcse-lm")
+#pragma GCC optimize("-fipa-sra")
+#pragma GCC optimize("-ftree-pre")
+#pragma GCC optimize("-ftree-vrp")
+#pragma GCC optimize("-fpeephole2")
+#pragma GCC optimize("-ffast-math")
+#pragma GCC optimize("-fsched-spec")
+#pragma GCC optimize("unroll-loops")
+#pragma GCC optimize("-falign-jumps")
+#pragma GCC optimize("-falign-loops")
+#pragma GCC optimize("-falign-labels")
+#pragma GCC optimize("-fdevirtualize")
+#pragma GCC optimize("-fcaller-saves")
+#pragma GCC optimize("-fcrossjumping")
+#pragma GCC optimize("-fthread-jumps")
+#pragma GCC optimize("-funroll-loops")
+#pragma GCC optimize("-freorder-blocks")
+#pragma GCC optimize("-fschedule-insns")
+#pragma GCC optimize("inline-functions")
+#pragma GCC optimize("-ftree-tail-merge")
+#pragma GCC optimize("-fschedule-insns2")
+#pragma GCC optimize("-fstrict-aliasing")
+#pragma GCC optimize("-falign-functions")
+#pragma GCC optimize("-fcse-follow-jumps")
+#pragma GCC optimize("-fsched-interblock")
+#pragma GCC optimize("-fpartial-inlining")
+#pragma GCC optimize("no-stack-protector")
+#pragma GCC optimize("-freorder-functions")
+#pragma GCC optimize("-findirect-inlining")
+#pragma GCC optimize("-fhoist-adjacent-loads")
+#pragma GCC optimize("-frerun-cse-after-loop")
+#pragma GCC optimize("inline-small-functions")
+#pragma GCC optimize("-finline-small-functions")
+#pragma GCC optimize("-ftree-switch-conversion")
+#pragma GCC optimize("-foptimize-sibling-calls")
+#pragma GCC optimize("-fexpensive-optimizations")
+#pragma GCC optimize("inline-functions-called-once")
+#pragma GCC optimize("-fdelete-null-pointer-checks")
+
 
 float frand() {
     return (float)rand() / RAND_MAX * 2 - 1;
 }
 
-struct Star {
-    float px, py, pz;
-    float vx, vy, vz;
-    float mass;
-};
-
-std::vector<Star> stars;
+alignas(16) float px[48],py[48],pz[48]; // positions, one array per axis (standard alignas instead of __declspec)
+alignas(16) float vx[48],vy[48],vz[48]; // velocities, one array per axis
+alignas(16) float mass[48];             // per-star mass, initialised to frand() + 1
 
 void init() {
-    for (int i = 0; i < 48; i++) {
-        stars.push_back({
-            frand(), frand(), frand(),
-            frand(), frand(), frand(),
-            frand() + 1,
-        });
+    for (size_t i = 0; i < 48; i++) {
+        px[i] = frand(); py[i] = frand(); pz[i] = frand();
+        vx[i] = frand(); vy[i] = frand(); vz[i] = frand();
+        mass[i] = frand() + 1;
     }
 }
 
-float G = 0.001;
-float eps = 0.001;
-float dt = 0.01;
+constexpr float G = 0.001;   // coupling constant shared by step() and calc()
+constexpr float eps = 0.001; // softening term; eps * eps is added to every squared distance
+constexpr float dt = 0.01;   // time increment applied by one step() call
 
 void step() {
-    for (auto &star: stars) {
-        for (auto &other: stars) {
-            float dx = other.px - star.px;
-            float dy = other.py - star.py;
-            float dz = other.pz - star.pz;
-            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
-            d2 *= sqrt(d2);
-            star.vx += dx * other.mass * G * dt / d2;
-            star.vy += dy * other.mass * G * dt / d2;
-            star.vz += dz * other.mass * G * dt / d2;
+    // Update every velocity from all 48 x 48 pair terms, then move every position by v * dt.
+    for (size_t i = 0; i < 48; ++i) {
+        for (size_t j = 0; j < 48; ++j) {
+            // eps * eps keeps the i == j term finite (dx = dy = dz = 0 there).
+            const float dx = px[j] - px[i];
+            const float dy = py[j] - py[i];
+            const float dz = pz[j] - pz[i];
+            const float r2 = dx * dx + dy * dy + dz * dz + (eps * eps);
+            const float r3 = r2 * std::sqrt(r2); // std::sqrt picks the float overload, no double round-trip
+            const float scale = mass[j] * G * dt / r3;
+            vx[i] += dx * scale;
+            vy[i] += dy * scale;
+            vz[i] += dz * scale;
         }
     }
-    for (auto &star: stars) {
-        star.px += star.vx * dt;
-        star.py += star.vy * dt;
-        star.pz += star.vz * dt;
+    for (size_t i = 0; i < 48; ++i) {
+        // Positions advance only after all velocities have been updated above.
+        px[i] += vx[i] * dt;
+        py[i] += vy[i] * dt;
+        pz[i] += vz[i] * dt;
     }
 }
 
 float calc() {
+    // Returns the sum of mass * v^2 / 2 per star minus half of every (i, j) pair term.
     float energy = 0;
-    for (auto &star: stars) {
-        float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz;
-        energy += star.mass * v2 / 2;
-        for (auto &other: stars) {
-            float dx = other.px - star.px;
-            float dy = other.py - star.py;
-            float dz = other.pz - star.pz;
-            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
-            energy -= other.mass * star.mass * G / sqrt(d2) / 2;
+    for (size_t i = 0; i < 48; ++i) {
+        // 0.5f keeps the arithmetic in float; a bare 0.5 would promote to double.
+        const float v2 = vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i];
+        energy += mass[i] * v2 * 0.5f;
+        for (size_t j = 0; j < 48; ++j) {
+            // Every pair is visited in both orders, so each term is halved; G * 0.5f tracks G.
+            const float dx = px[j] - px[i];
+            const float dy = py[j] - py[i];
+            const float dz = pz[j] - pz[i];
+            const float r2 = dx * dx + dy * dy + dz * dz + (eps * eps);
+            energy -= mass[j] * mass[i] * (G * 0.5f) / std::sqrt(r2);
         }
     }
     return energy;
diff --git a/main.exe b/main.exe
new file mode 100644
index 0000000..c7da8a1
Binary files /dev/null and b/main.exe differ
diff --git a/rbq.bat b/rbq.bat
new file mode 100644
index 0000000..0be8cc2
--- /dev/null
+++ b/rbq.bat
@@ -0,0 +1,6 @@
+g++ -o main.exe main.cpp -ffast-math -march=native -msse4.1 -O2 -fopenmp -O3 -std=c++17
+main.exe
+main.exe
+main.exe
+main.exe
+main.exe
\ No newline at end of file