-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlearn_share_memory.cu
149 lines (135 loc) · 5.42 KB
/
learn_share_memory.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
/*
* This program is about computing dot_product in the way using __shared__ memory to sum up the results in a single
* block, and this can speed up the summing process by roughly 10x times
* Also, this program renders some green balls using __shared__ memory
*/
#include <cmath>
#include <cstdio>
#include "includes/check_error.cuh"
#include "cpu_bitmap.h"
#define M_PI 3.14159265358979323846
const unsigned int N = 1024;
// these two value is only used for dot_product
const unsigned int threads_per_block = 256;
const unsigned int blocks_per_grid = ((N - 1) / threads_per_block) + 1;
__global__ void dot_product(const float *a,const float *b, float *c) {
__shared__ float cache[threads_per_block];
unsigned int thread_id = threadIdx.x;
unsigned int tid = thread_id + blockIdx.x * blockDim.x;
float temp = 0.0f;
while (tid < N) {
temp += a[tid] * b[tid];
// move forward by a computation area
tid += gridDim.x * blockDim.x;
}
cache[thread_id] = temp;
// this is to ensure all threads has finished writing in cache
__syncthreads();
unsigned int reduction_step = blockDim.x / 2;
while (reduction_step != 0) {
if (thread_id < reduction_step) {
cache[thread_id] += cache[thread_id + reduction_step];
}
// this is to prevent the addition by reduction is not sync finished
__syncthreads();
reduction_step /= 2;
}
// just let the first thread to write to the final result(which is the sum of the block)
if (thread_id == 0) {
c[blockIdx.x] = cache[0];
}
}
__global__ void compute_bitmap(unsigned char *bitmap) {
__shared__ float cache[16][16];
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int offset = y * gridDim.x * blockDim.x + x;
const float period = 128.0f;
cache[threadIdx.y][threadIdx.x] = 255 * (sinf((float)((float)x * 2.0f * M_PI / period)) + 1.0f) *
(sinf((float)((float)y * 2.0f * M_PI / period)) + 1.0f) / 4.0f;
// if we don't add this line, the result will be wrong
__syncthreads();
bitmap[offset * 4 + 0] = 0;
bitmap[offset * 4 + 1] = (unsigned char)cache[threadIdx.y][threadIdx.x];
bitmap[offset * 4 + 2] = 0;
bitmap[offset * 4 + 3] = 255;
}
bool check_correct(float *cal, float *ref) {
for (int i = 0; i != blocks_per_grid; ++i) {
if (fabs(cal[i] - ref[i]) > 1e-5 * ((float)N / blocks_per_grid)) {
printf("Error: cal[%d] = %f, ref[%d] = %f\n", i, cal[i], i, ref[i]);
printf("The error is %f\n", fabs(cal[i] - ref[i]));
printf("The estimate error per thread is %f\n",
fabs(cal[i] - ref[i]) / ((float)N / blocks_per_grid));
return false;
}
}
return true;
}
int correct_dot_product() {
float *h_a, *h_b, *h_partial_c, *ref_partial_c;
float *d_a, *d_b, *d_partial_c;
h_a = new float[N];
h_b = new float[N];
h_partial_c = new float[blocks_per_grid];
ref_partial_c = new float[blocks_per_grid];
for (int i = 0; i != N; ++i) {
h_a[i] = (float)(i % 3) * 4.5f;
h_b[i] = (float)(i % 2) * 0.6f;
}
for (int j = 0; j != blocks_per_grid; ++j) {
// calculate the reference result
float temp = 0.0f;
for (int i = 0; i != threads_per_block; ++i) {
temp += h_a[i + j * threads_per_block] * h_b[i + j * threads_per_block];
}
ref_partial_c[j] = temp;
}
CHECK_ERROR(cudaMalloc((void**)&d_a, sizeof(float) * N));
CHECK_ERROR(cudaMalloc((void**)&d_b, sizeof(float) * N));
CHECK_ERROR(cudaMalloc((void**)&d_partial_c, sizeof(float) * blocks_per_grid));
CHECK_ERROR(cudaMemcpy(d_a, h_a, sizeof(float) * N, cudaMemcpyHostToDevice));
CHECK_ERROR(cudaMemcpy(d_b, h_b, sizeof(float) * N, cudaMemcpyHostToDevice));
dot_product<<<blocks_per_grid, threads_per_block>>>(d_a, d_b, d_partial_c);
CHECK_ERROR(cudaGetLastError());
CHECK_ERROR(cudaDeviceSynchronize());
CHECK_ERROR(cudaMemcpy(h_partial_c, d_partial_c, sizeof(float) * blocks_per_grid, cudaMemcpyDeviceToHost));
bool correct = check_correct(h_partial_c, ref_partial_c);
if (!correct) {
printf("The result is not correct!\n");
printf("To be mentioned, the precision of gpu calculating float is not as good as cpu\n");
printf("If a[i] or b[i] is too big(based on experiment, usually >10 is too big)"
"the result will be wrong\n");
}
float sum = 0.0f;
for (int i = 0; i != blocks_per_grid; ++i) {
sum += h_partial_c[i];
}
printf("The dot product of the two vectors is %f\n", sum);
CHECK_ERROR(cudaFree(d_a));
CHECK_ERROR(cudaFree(d_b));
CHECK_ERROR(cudaFree(d_partial_c));
delete[] h_a;
delete[] h_b;
delete[] h_partial_c;
delete[] ref_partial_c;
return 0;
}
int sin_bitmap() {
CPUBitmap h_bitmap(N, N);
unsigned char *d_bitmap;
CHECK_ERROR(cudaMalloc((void**)&d_bitmap, sizeof(unsigned char) * N * N * 4));
dim3 grid(N / 16, N / 16);
dim3 block(16, 16);
compute_bitmap<<<grid, block>>>(d_bitmap);
CHECK_ERROR(cudaGetLastError());
CHECK_ERROR(cudaDeviceSynchronize());
CHECK_ERROR(cudaMemcpy(h_bitmap.get_ptr(), d_bitmap, sizeof(unsigned char) * N * N * 4, cudaMemcpyDeviceToHost));
h_bitmap.display_and_exit();
return 0;
}
int main() {
// sin_bitmap();
correct_dot_product();
return 0;
}