-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlearn_constant_mem.cu
231 lines (204 loc) · 8.09 KB
/
learn_constant_mem.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
/*
* This program is about rendering some colorful spheres using raytracing, while comparing whether the use of
* __constant__ memory will speed up the process.
*/
#include <cmath>
#include <cstdio>
#include "random"
#include "cuda.h"
#include "includes/check_error.cuh"
#include "includes/cpu_bitmap.h"
#define INF 2e10f
#define DIM 1024
#define SPHERE_NUM 20
float RND(float range) {
static std::random_device rd;
static std::mt19937 mt(rd());
std::uniform_real_distribution<float> dist(0.0f, range);
return dist(mt);
}
struct Sphere {
float r,g,b;
float radius;
float x,y,z;
// (ox, oy) is pixel on the screen, light ray will be casted from (ox, oy, 0), direction is (0, 0, 1)
__device__ float hit(float ox, float oy, float *edge_coefficient) const {
float dx = ox - x;
float dy = oy - y;
if (dx * dx + dy * dy < radius * radius) {
float dz = sqrtf(radius * radius - dx * dx - dy * dy);
// dz / radius will be the cosine of the angle between the normal vector and the light ray
// which indicates the light intensity
*edge_coefficient = dz / sqrtf(radius * radius);
return dz + z;
}
return -INF;
}
};
__global__ void ray_tracing_no_constant_mem(unsigned char *bitmap, Sphere *d_spheres) {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int offset = x + y * gridDim.x * blockDim.x;
auto ox = (float)x - (float)DIM/2;
auto oy = (float)y - (float)DIM/2;
float r = 0, g = 0, b = 0;
float maxz = -INF;
for (int i = 0; i != SPHERE_NUM; ++i) {
float n;
float t = d_spheres[i].hit(ox, oy, &n);
if (t > maxz) {
float fsacle = n;
r = d_spheres[i].r * fsacle;
g = d_spheres[i].g * fsacle;
b = d_spheres[i].b * fsacle;
}
}
bitmap[offset*4 + 0] = (int)(r * 255);
bitmap[offset*4 + 1] = (int)(g * 255);
bitmap[offset*4 + 2] = (int)(b * 255);
bitmap[offset*4 + 3] = 255;
}
float no_constant_mem() {
cudaEvent_t start, stop;
CHECK_ERROR(cudaEventCreate(&start));
CHECK_ERROR(cudaEventCreate(&stop));
CHECK_ERROR(cudaEventRecord(start, 0));
CPUBitmap bitmap{DIM, DIM};
unsigned char* d_bitmap; Sphere *d_spheres;
auto *h_spheres = new Sphere[SPHERE_NUM];
for (int i = 0; i != SPHERE_NUM; ++i) {
h_spheres[i].r = RND(1.0f);
h_spheres[i].g = RND(1.0f);
h_spheres[i].b = RND(1.0f);
h_spheres[i].x = RND(DIM) - DIM/2;
h_spheres[i].y = RND(DIM) - DIM/2;
h_spheres[i].z = RND(DIM) - DIM/2;
h_spheres[i].radius = RND(500.0f) + 80.0f;
}
CHECK_ERROR(cudaMalloc((void**)&d_bitmap, bitmap.image_size()));
CHECK_ERROR(cudaMalloc((void**)&d_spheres, sizeof(Sphere) * SPHERE_NUM));
CHECK_ERROR(cudaMemcpy(d_spheres, h_spheres, sizeof(Sphere) * SPHERE_NUM,
cudaMemcpyHostToDevice));
delete[] h_spheres;
dim3 grid(DIM/16, DIM/16);
dim3 block(16, 16);
// kernel
ray_tracing_no_constant_mem<<<grid, block>>>(d_bitmap, d_spheres);
CHECK_ERROR(cudaMemcpy(bitmap.get_ptr(), d_bitmap, bitmap.image_size(),
cudaMemcpyDeviceToHost));
// h_bitmap.display_and_exit();
bitmap.save_as_bmp("./no_constant_mem.bmp");
cudaFree(d_bitmap);
cudaFree(d_spheres);
CHECK_ERROR(cudaEventRecord(stop, 0));
CHECK_ERROR(cudaEventSynchronize(stop));
float elapsedTime;
CHECK_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
printf("Time to generate without constant mem: %3.1f ms\n", elapsedTime);
CHECK_ERROR(cudaEventDestroy(start));
CHECK_ERROR(cudaEventDestroy(stop));
return elapsedTime;
}
__constant__ Sphere d_spheres[SPHERE_NUM];
__global__ void ray_tracing(unsigned char *bitmap) {
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int offset = x + y * gridDim.x * blockDim.x;
auto ox = (float)x - (float)DIM/2;
auto oy = (float)y - (float)DIM/2;
float r = 0, g = 0, b = 0;
float maxz = -INF;
for (auto & d_sphere : d_spheres) {
float n;
float t = d_sphere.hit(ox, oy, &n);
if (t > maxz) {
float fsacle = n;
r = d_sphere.r * fsacle;
g = d_sphere.g * fsacle;
b = d_sphere.b * fsacle;
}
}
bitmap[offset*4 + 0] = (int)(r * 255);
bitmap[offset*4 + 1] = (int)(g * 255);
bitmap[offset*4 + 2] = (int)(b * 255);
bitmap[offset*4 + 3] = 255;
}
float constant_mem() {
cudaEvent_t start, stop;
CHECK_ERROR(cudaEventCreate(&start));
CHECK_ERROR(cudaEventCreate(&stop));
CHECK_ERROR(cudaEventRecord(start, 0));
CPUBitmap bitmap{DIM, DIM};
unsigned char* d_bitmap;
auto *h_spheres = new Sphere[SPHERE_NUM];
for (int i = 0; i != SPHERE_NUM; ++i) {
h_spheres[i].r = RND(1.0f);
h_spheres[i].g = RND(1.0f);
h_spheres[i].b = RND(1.0f);
h_spheres[i].x = RND(DIM) - DIM/2;
h_spheres[i].y = RND(DIM) - DIM/2;
h_spheres[i].z = RND(DIM) - DIM/2;
h_spheres[i].radius = RND(500.0f) + 80.0f;
}
CHECK_ERROR(cudaMalloc((void**)&d_bitmap, bitmap.image_size()));
// no need to allocate memory for __constant__ d_spheres
// CHECK_ERROR(cudaMalloc((void**)&d_spheres, sizeof(Sphere) * SPHERE_NUM));
// copy data to __constant__ d_spheres, note the new syntax
CHECK_ERROR(cudaMemcpyToSymbol(d_spheres, h_spheres, sizeof(Sphere) * SPHERE_NUM,
0, cudaMemcpyHostToDevice));
delete[] h_spheres;
dim3 grid(DIM/16, DIM/16);
dim3 block(16, 16);
// kernel
ray_tracing<<<grid, block>>>(d_bitmap);
CHECK_ERROR(cudaMemcpy(bitmap.get_ptr(), d_bitmap, bitmap.image_size(),
cudaMemcpyDeviceToHost));
// h_bitmap.display_and_exit();
bitmap.save_as_bmp("constant_mem.bmp");
cudaFree(d_bitmap);
// cudaFree(d_spheres);
CHECK_ERROR(cudaEventRecord(stop, 0));
CHECK_ERROR(cudaEventSynchronize(stop));
float elapsedTime;
CHECK_ERROR(cudaEventElapsedTime(&elapsedTime, start, stop));
printf("Time to generate with constant mem: %3.1f ms\n", elapsedTime);
CHECK_ERROR(cudaEventDestroy(start));
CHECK_ERROR(cudaEventDestroy(stop));
return elapsedTime;
}
/*
* Conclusion:
* Why we use constant memory? because although constant memory is off-chip, when we use constant memory, the constant
* will be broadcasted to half warp(16 threads) in the same SM, meaning half of the threads in the same SM will access
* the same constant memory with almost no latency. While that already saved about 94% of the time, the GPU will also
* actively cache the constant memory in the L1 cache, which will further reduce the latency.
*
*
* But when the DIM and SPHERE_NUM is very large, the constant memory will be too small to hold all the data,
* so we have to use global memory, but the broadcasting will still be used! That will be a serial process, which means
* the threads in the same half warp will access the global memory one by one, but if we just ignore the constant memory,
* the threads will access the global memory in parallel. So we can see performance drop when DIM and SPHERE_NUM is
* very large.
*
*/
int main() {
int run_times = 1;
float no_constant_mem_time = 0.0f;
float constant_mem_time = 0.0f;
for (int i = 0; i != run_times; ++i) {
no_constant_mem_time += no_constant_mem();
constant_mem_time += constant_mem();
}
/*
* Typical result(DIM=1024):
* Average time to generate without constant mem: 6.1 ms
* Average time to generate with constant mem: 5.5 ms
*
* Typical result(DIM=10240):
* Average time to generate without constant mem: 1675.9 ms
* Average time to generate with constant mem: 1966.0 ms
*/
printf("Average time to generate without constant mem: %3.1f ms\n", no_constant_mem_time / (float)run_times);
printf("Average time to generate with constant mem: %3.1f ms\n", constant_mem_time / (float)run_times);
return 0;
}