-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.cpp
114 lines (89 loc) · 3.01 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#include "avx2.h"
#include "avx2_const_me.h"
#include "avx512.h"
#include "avx512_no_unroll.h"
#include "avx512_modern.h"
#include "ggml.h"
#include <benchmark/benchmark.h>
#include <algorithm>
#include <cmath>
#include <string>
#include <sstream>
#include <vector>
// Fixed problem dimensions and scratch size shared by every benchmark in this file.
// HidSize is the first dimension argument passed to LoadFromFile for both
// "src0.bin" and "src1.bin" (presumably the dimension the two operands share —
// confirm against LoadFromFile).
constexpr size_t HidSize = 4096;
// InterSize is the second dimension argument for "src0.bin" and the first for "dst.bin".
constexpr size_t InterSize = 11008;
// BatchSize is the second dimension argument for both "src1.bin" and "dst.bin".
constexpr size_t BatchSize = 8;
// Number of chars the scratch buffer (Tensors::workspace) is resized to in LoadTensors.
constexpr size_t WorkspaceSize = 65536;
// Bundle of the inputs, output, reference output and scratch space used by one
// benchmark run.
//
// NOTE: member order matters. LoadTensors builds this aggregate with designated
// initializers, which must name members in declaration order.
struct Tensors {
// First operand handed to the MatMul* kernels; loaded from "src0.bin".
ggml_tensor src0;
// Second operand handed to the MatMul* kernels; loaded from "src1.bin".
ggml_tensor src1;
// Output tensor the kernels write into; SanityCheck compares it to ref_dst.
ggml_tensor dst;
// Expected output loaded from "dst.bin"; serves as the ground truth.
ggml_tensor ref_dst;
// Scratch buffer whose data() pointer is passed as the kernels' last argument.
std::vector<char> workspace;
};
/// Builds the benchmark fixture: reads both operands and the reference result
/// from disk, prepares a zero-filled output tensor with the same metadata as the
/// reference, and sizes the scratch workspace.
///
/// @return A fully populated Tensors bundle, ready to be passed to a MatMul* kernel.
Tensors LoadTensors() {
    Tensors fixture{
        .src0 = LoadFromFile("src0.bin", HidSize, InterSize, true),
        .src1 = LoadFromFile("src1.bin", HidSize, BatchSize, false),
        .ref_dst = LoadFromFile("dst.bin", InterSize, BatchSize, false),
    };

    // Start the output as a copy of the reference tensor, then redirect its
    // storage to a fresh buffer so the kernels never overwrite the expected values.
    fixture.dst = fixture.ref_dst;

    // This buffer is deliberately never freed: the leak is intentional and
    // harmless for a short-lived benchmark process. The trailing `{}`
    // zero-initializes it.
    const auto output_bytes = fixture.ref_dst.ByteSize();
    fixture.dst.data = new char[output_bytes]{};

    fixture.workspace.resize(WorkspaceSize);
    return fixture;
}
/// Verifies that the computed output matches the reference output element by element.
///
/// Both `tensors.dst.data` and `tensors.ref_dst.data` are read as arrays of
/// `float`. An element is accepted when the two values agree on NaN-ness and
/// their absolute difference does not exceed 1e-6.
///
/// @param tensors Fixture whose `dst` has been filled by a kernel under test.
/// @throws std::runtime_error on the first mismatching element; the message
///         includes its index and both values.
void SanityCheck(const Tensors& tensors) {
    constexpr double kTolerance = 1e-6;

    const auto* computed = reinterpret_cast<const float*>(tensors.dst.data);
    const auto* reference = reinterpret_cast<const float*>(tensors.ref_dst.data);

    for (size_t idx = 0; idx < tensors.dst.NumElems(); ++idx) {
        const float got = computed[idx];
        const float want = reference[idx];

        // When both values are NaN the difference is NaN as well, so the
        // tolerance comparison is false and the pair is accepted.
        const bool nan_mismatch = std::isnan(got) != std::isnan(want);
        const bool out_of_tolerance = std::abs(got - want) > kTolerance;
        if (!nan_mismatch && !out_of_tolerance) {
            continue;
        }

        std::stringstream message;
        message << "Sanity check failed at index #" << idx << ": " << got << " != " << want;
        throw std::runtime_error(message.str());
    }
}
/// Benchmark body for MatMulAvx2.
///
/// The fixture is loaded before the measured loop; after the loop the kernel's
/// output is validated against the reference via SanityCheck.
static void Avx2Vanilla(benchmark::State& state) {
    Tensors fixture = LoadTensors();
    char* const scratch = fixture.workspace.data();

    for (auto _ : state) {
        MatMulAvx2(&fixture.src0, &fixture.src1, &fixture.dst, scratch);
    }

    SanityCheck(fixture);
}
/// Benchmark body for MatMulAvx2ConstMe.
///
/// The fixture is loaded before the measured loop; after the loop the kernel's
/// output is validated against the reference via SanityCheck.
static void Avx2ConstMe(benchmark::State& state) {
    Tensors fixture = LoadTensors();
    char* const scratch = fixture.workspace.data();

    for (auto _ : state) {
        MatMulAvx2ConstMe(&fixture.src0, &fixture.src1, &fixture.dst, scratch);
    }

    SanityCheck(fixture);
}
/// Benchmark body for MatMulAvx512.
///
/// The fixture is loaded before the measured loop; after the loop the kernel's
/// output is validated against the reference via SanityCheck.
static void Avx512Vanilla(benchmark::State& state) {
    Tensors fixture = LoadTensors();
    char* const scratch = fixture.workspace.data();

    for (auto _ : state) {
        MatMulAvx512(&fixture.src0, &fixture.src1, &fixture.dst, scratch);
    }

    SanityCheck(fixture);
}
/// Benchmark body for MatMulAvx512NoUnroll.
///
/// The fixture is loaded before the measured loop; after the loop the kernel's
/// output is validated against the reference via SanityCheck.
static void Avx512NoUnroll(benchmark::State& state) {
    Tensors fixture = LoadTensors();
    char* const scratch = fixture.workspace.data();

    for (auto _ : state) {
        MatMulAvx512NoUnroll(&fixture.src0, &fixture.src1, &fixture.dst, scratch);
    }

    SanityCheck(fixture);
}
/// Benchmark body for MatMulAvx512Modern.
///
/// The fixture is loaded before the measured loop; after the loop the kernel's
/// output is validated against the reference via SanityCheck.
static void Avx512Modern(benchmark::State& state) {
    Tensors fixture = LoadTensors();
    char* const scratch = fixture.workspace.data();

    for (auto _ : state) {
        MatMulAvx512Modern(&fixture.src0, &fixture.src1, &fixture.dst, scratch);
    }

    SanityCheck(fixture);
}
// Register every kernel benchmark with Google Benchmark. Name() overrides the
// reported name so results read as "<instruction set>/<variant>".
BENCHMARK(Avx2Vanilla)->Name("AVX2/vanilla");
BENCHMARK(Avx2ConstMe)->Name("AVX2/const-me");
BENCHMARK(Avx512Vanilla)->Name("AVX-512/vanilla");
BENCHMARK(Avx512NoUnroll)->Name("AVX-512/no-unroll");
BENCHMARK(Avx512Modern)->Name("AVX-512/modern");
// Expands to the program's main(), which runs the registered benchmarks.
BENCHMARK_MAIN();