#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cassert>
#include <cmath>
#include <iostream>
#include <sstream>
#include <string>
#include "kernel.h"
#include "json.hpp"
#include <fstream>
#include <numeric>
#include <algorithm>
#include "matrix_print.hpp"
__global__ void feed_forward_and_backprop(float *z_matrix, float* activation_matrix, float* weight_matrix, float* biases, int* shape, int shape_length, float* outputs, float* error, float* error_weights,
int* activation_offset_table, int* bias_offset_table, int* z_offset_table, int* wm_offset_table)
{
// Feed forward of an entire neural network, followed by backpropagation, in one single kernel
// We go through each layer step by step and calculate the neuron activations
// After each layer we synchronize the threads
// This version handles multiple inputs at once
// blockDim.x is the same as number_of_inputs
// blockDim.y is the maximum number of threads needed for any layer; since BP4 uses one thread per weight, it is the maximum number of weights per layer
// blockDim.y isn't read in the code but it's required to launch enough threads
// threadIdx.x is the current input nr
// threadIdx.y is the current neuron in the (l+1)-th layer during the forward pass (also called the outgoing neuron)
// Backpropagation reinterprets the threads slightly, especially BP4 (see below)
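// Worked example (shape = [7,9,3], 2 inputs): blockDim = (2, 63, 1);
// the forward pass and BP2/3 only use threadIdx.y < 9 (the widest layer),
// while BP4 uses threadIdx.y < 7*9 = 63 (one thread per weight of the widest weight matrix)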
int output_neurons = shape[shape_length - 1];
int *error_offset_table = z_offset_table; // error has the same layout as z (one value per non-input neuron per input), so reuse its offsets
int input_offset = 0;
int input_offset_next = 0;
for (int layer = 0; layer < shape_length - 1; layer++)
{
if (threadIdx.y < shape[layer+1]) // for every outgoing neuron one thread
{
input_offset = shape[layer] * threadIdx.x;
input_offset_next = shape[layer+1] * threadIdx.x;
// w*x (weights are stored incoming-major with stride shape[layer+1], matching the weight indexing in BP2 and BP4 below)
for (int incoming_neuron = 0; incoming_neuron < shape[layer]; incoming_neuron++)
{
z_matrix[z_offset_table[layer] + input_offset_next + threadIdx.y] += weight_matrix[wm_offset_table[layer] + incoming_neuron * shape[layer+1] + threadIdx.y] * activation_matrix[activation_offset_table[layer] + input_offset + incoming_neuron];
}
// w*x + b
z_matrix[z_offset_table[layer] + input_offset_next + threadIdx.y] += biases[bias_offset_table[layer] + threadIdx.y];
// sig(w*x + b)
activation_matrix[activation_offset_table[layer+1] + input_offset_next + threadIdx.y] = 1.0 / (1.0 + exp(-z_matrix[z_offset_table[layer] + input_offset_next + threadIdx.y]));
}
__syncthreads();
}
if (threadIdx.y < shape[shape_length - 1])
{
// BP1: the output array doesn't need an offset as it only holds the final layer
// the error and activation matrices need the offset for the last layer because they contain information about all the layers
// the error matrix has no values for the input layer (an input error doesn't make sense), so its offset table is shifted back by one layer
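// BP1 with a sigmoid output: delta_L = a_L - y
// note: this is the cross-entropy form; a quadratic cost would also multiply by sigma'(z_L) here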
error[error_offset_table[shape_length-2] + threadIdx.x * output_neurons + threadIdx.y] = activation_matrix[activation_offset_table[shape_length-1] + threadIdx.x * output_neurons + threadIdx.y] - outputs[threadIdx.x * output_neurons + threadIdx.y];
}
for (int layer = shape_length - 2; layer > 0; layer--)
{
// BP2 + BP3: delta(l) = (W(l+1)^T * delta(l+1)) elementwise* sigma'(z(l)); the bias gradient is just delta (BP3)
// threadIdx.y is the neuron in the current layer, and we need to sum up error1*w11 + error2*w21 + error3*w31 ...
// and we need to do this for all inputs threadIdx.x
if (threadIdx.y < shape[layer])
{
for (int higher_layer_neuron_nr = 0; higher_layer_neuron_nr < shape[layer + 1]; higher_layer_neuron_nr++)
{
error[error_offset_table[layer-1] + threadIdx.y + threadIdx.x * shape[layer]] += error[error_offset_table[layer] + higher_layer_neuron_nr + threadIdx.x * shape[layer+1]] *
weight_matrix[wm_offset_table[layer] + threadIdx.y * shape[layer+1] + higher_layer_neuron_nr];
}
// sigma'(z) = sigma(z) * (1 - sigma(z)), computed from the stored activation ("Strich" is German for prime)
float sig_strich = activation_matrix[activation_offset_table[layer] + threadIdx.y + threadIdx.x * shape[layer]] * (1 - activation_matrix[activation_offset_table[layer] + threadIdx.y + threadIdx.x * shape[layer]]);
error[error_offset_table[layer-1] + threadIdx.y + threadIdx.x * shape[layer]] *= sig_strich;
}
__syncthreads();
}
// BP4
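// BP4: dC/dw = a_k(layer) * delta_j(layer+1), summed over all inputs
// threadIdx.y encodes one weight: incoming neuron k = threadIdx.y / shape[layer+1], outgoing neuron j = threadIdx.y % shape[layer+1]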
for (int layer = 0; layer < shape_length - 1; layer++)
{
int nr_of_threads = shape[layer] * shape[layer + 1];
// for each weight one thread
if (threadIdx.y < nr_of_threads)
{
// This is not yet efficient for many inputs: all inputs accumulate into the same memory address, so the sum runs serially per weight
// A parallel reduction would be more efficient, but maybe later
for (int input_nr = 0; input_nr < blockDim.x; input_nr++)
{
error_weights[wm_offset_table[layer] + threadIdx.y] +=
activation_matrix[activation_offset_table[layer] + input_nr*shape[layer] + threadIdx.y / shape[layer + 1]] *
error[error_offset_table[layer] + input_nr*shape[layer+1] + threadIdx.y % shape[layer + 1]];
}
}
__syncthreads();
}
}
void calculate_offsets(int* activation_offset_table, int *bias_offset_table, int *z_offset_table, int *weight_matrix_offset_table, int* shape, int shape_length, int nr_of_inputs)
{
// The goal of this is to get the correct offset for a certain layer via offset_table[layer]
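// Worked example: shape = [7,9,3] with 2 inputs gives
//   activation_offset_table    = [0, 14, 32]  (input layer 7*2 floats, then +9*2)
//   bias_offset_table          = [0, 9]
//   z_offset_table             = [0, 18]      (bias offsets scaled by the 2 inputs)
//   weight_matrix_offset_table = [0, 63]      (7*9 weights between the first two layers)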
activation_offset_table[0] = 0;
bias_offset_table[0] = 0;
z_offset_table[0] = 0;
weight_matrix_offset_table[0] = 0;
for (int i = 1; i < shape_length; i++)
{
activation_offset_table[i] = activation_offset_table[i - 1] + shape[i - 1] * nr_of_inputs;
}
for (int i = 1; i < shape_length - 1; i++)
{
bias_offset_table[i] = bias_offset_table[i - 1] + shape[i];
z_offset_table[i] = bias_offset_table[i] * nr_of_inputs; // z holds one value per non-input neuron per input, so the bias offsets scale by nr_of_inputs
weight_matrix_offset_table[i] = weight_matrix_offset_table[i - 1] + shape[i] * shape[i - 1];
}
}
int main()
{
using json = nlohmann::json;
std::ifstream i("C:\\Users\\trung\\Projects\\NeuNetCUDA\\network.json");
json network_init_json = json::parse(i);
int NUM_NEURONS_TOTAL = std::accumulate(network_init_json["shape"].begin(), network_init_json["shape"].end(), 0);
int NUM_LAYERS_TOTAL = (int)network_init_json["shape"].size();
int *SHAPE = (int*)malloc(network_init_json["shape"].size() * sizeof(int));
int *SHAPE_WEIGHTS = (int*)malloc((network_init_json["shape"].size() - 1) * sizeof(int));
std::copy(network_init_json["shape"].begin(), network_init_json["shape"].end(), SHAPE);
if (network_init_json["input"].size() % SHAPE[0] != 0)
{
std::cout << "Shape of input doesn't match should be a multiple of " << SHAPE[0] << std::endl;
return 0;
}
int number_of_inputs = (int)network_init_json["input"].size() / SHAPE[0];
// Initialize weight sizes from json
int weight_sizes = 0;
for (int i = 0; i < network_init_json["shape"].size() - 1; i++)
{
weight_sizes += (int)network_init_json["shape"][i] * (int)network_init_json["shape"][i + 1];
SHAPE_WEIGHTS[i] = (int)network_init_json["shape"][i] * (int)network_init_json["shape"][i + 1];
}
// Initialize bias sizes from json (every neuron except the input layer has a bias)
const int bias_sizes = NUM_NEURONS_TOTAL - SHAPE[0];
// Calculate all the needed sizes
const size_t bytes_biases = bias_sizes * sizeof(float);
const size_t bytes_z = bias_sizes * sizeof(float) * number_of_inputs;
const size_t bytes_error = bias_sizes * sizeof(float) * number_of_inputs;
const size_t bytes_error_weights = weight_sizes * sizeof(float);
const size_t bytes_weights = weight_sizes * sizeof(float);
const size_t bytes_activations = (NUM_NEURONS_TOTAL) * sizeof(float) * number_of_inputs;
const size_t bytes_shape = NUM_LAYERS_TOTAL * sizeof(int);
const size_t bytes_output = (int)network_init_json["output"].size() * sizeof(float);
const size_t bytes_activation_offset = NUM_LAYERS_TOTAL * sizeof(int);
const size_t bytes_biass_offset = (NUM_LAYERS_TOTAL - 1) * sizeof(int);
const size_t bytes_z_offset = bytes_biass_offset;
const size_t bytes_wm_offset = bytes_biass_offset;
// Lookup tables for the offsets
int *activation_offset_table = (int*)malloc(bytes_activation_offset);
int *bias_offset_table = (int*)malloc(bytes_biass_offset);
int *z_offset_table = (int*)malloc(bytes_z_offset);
int *weight_matrix_offset_table = (int*)malloc(bytes_wm_offset);
calculate_offsets(activation_offset_table, bias_offset_table, z_offset_table, weight_matrix_offset_table, SHAPE, NUM_LAYERS_TOTAL, number_of_inputs);
/*---------------- Fill weights ----------------------*/
float *host_weights = (float*) malloc(bytes_weights);
std::copy(network_init_json["weights"].begin(), network_init_json["weights"].end(), host_weights);
print_weight_matrix_numpy_style(host_weights, SHAPE, NUM_LAYERS_TOTAL);
/*---------------- Fill biases ----------------------*/
float *host_biases = (float*)malloc(bytes_biases);
std::copy(network_init_json["biases"].begin(), network_init_json["biases"].end(), host_biases);
print_bias_matrix_numpy_style(host_biases, SHAPE, NUM_LAYERS_TOTAL);
/*---------------- Fill activations and outputs ----------------------*/
// Copy Input into activation matrix
float *host_activations = (float*)malloc(bytes_activations);
std::copy(network_init_json["input"].begin(), network_init_json["input"].end(), host_activations);
// Copy Output into output matrix
float *host_outputs = (float*)malloc(bytes_output);
std::copy(network_init_json["output"].begin(), network_init_json["output"].end(), host_outputs);
// Initialize z Matrix
float *host_z = (float*)malloc(bytes_z);
memset(host_z, 0, bytes_z);
float *host_error = (float*)malloc(bytes_error);
memset(host_error, 0, bytes_error);
float *host_error_weights = (float*)malloc(bytes_error_weights);
memset(host_error_weights, 0, bytes_error_weights);
// Allocate GPU device memory
float *d_biases, *d_weights, *d_activations, *d_z, *d_error, *d_error_weights, *d_output;
int *d_shape, *d_activation_offset_table, *d_bias_offset_table, *d_z_offset_table, *d_weight_matrix_offset_table;
cudaMalloc(&d_error, bytes_error);
cudaMalloc(&d_output, bytes_output);
cudaMalloc(&d_biases, bytes_biases);
cudaMalloc(&d_weights, bytes_weights);
cudaMalloc(&d_error_weights, bytes_error_weights);
cudaMalloc(&d_activations, bytes_activations);
cudaMalloc(&d_z, bytes_z);
cudaMalloc(&d_shape, bytes_shape);
cudaMalloc(&d_activation_offset_table, bytes_activation_offset);
cudaMalloc(&d_bias_offset_table, bytes_biass_offset);
cudaMalloc(&d_z_offset_table, bytes_z_offset);
cudaMalloc(&d_weight_matrix_offset_table, bytes_wm_offset);
// Copy data to GPU
cudaMemcpy(d_error_weights, host_error_weights, bytes_error_weights, cudaMemcpyHostToDevice);
cudaMemcpy(d_output, host_outputs, bytes_output, cudaMemcpyHostToDevice);
cudaMemcpy(d_error, host_error, bytes_error, cudaMemcpyHostToDevice);
cudaMemcpy(d_biases, host_biases, bytes_biases, cudaMemcpyHostToDevice);
cudaMemcpy(d_weights, host_weights, bytes_weights, cudaMemcpyHostToDevice);
cudaMemcpy(d_activations, host_activations, bytes_activations, cudaMemcpyHostToDevice);
cudaMemcpy(d_z, host_z, bytes_z, cudaMemcpyHostToDevice);
cudaMemcpy(d_shape, SHAPE, bytes_shape, cudaMemcpyHostToDevice);
cudaMemcpy(d_activation_offset_table, activation_offset_table, bytes_activation_offset, cudaMemcpyHostToDevice);
cudaMemcpy(d_bias_offset_table, bias_offset_table, bytes_biass_offset, cudaMemcpyHostToDevice);
cudaMemcpy(d_z_offset_table, z_offset_table, bytes_z_offset, cudaMemcpyHostToDevice);
cudaMemcpy(d_weight_matrix_offset_table, weight_matrix_offset_table, bytes_wm_offset, cudaMemcpyHostToDevice);
// Prepare launch parameters: thread block size and grid size
int max_threads_gtx1070ti = 1024;
int x_dim = number_of_inputs;
// previous version, changed for BP4: int y_dim = *std::max_element(SHAPE, SHAPE + network_init_json["shape"].size() - 1);
int y_dim = *std::max_element(SHAPE_WEIGHTS, SHAPE_WEIGHTS + network_init_json["shape"].size() - 1); // SHAPE_WEIGHTS has shape.size() - 1 entries
dim3 dimBlock(x_dim, y_dim, 1);
int num_blocks = (x_dim * y_dim + max_threads_gtx1070ti - 1) / max_threads_gtx1070ti; // ceiling division
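// __syncthreads() only synchronizes threads within one block, so the whole network must fit into a single block for now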
if (num_blocks > 1)
{
std::cout << "Can't handle this yet";
return 0;
}
dim3 dimGrid(num_blocks, 1, 1);
// Launch the CUDA feed-forward + backprop kernel for multiple inputs
feed_forward_and_backprop<<<dimGrid, dimBlock>>>(d_z, d_activations, d_weights, d_biases, d_shape, NUM_LAYERS_TOTAL, d_output, d_error, d_error_weights, d_activation_offset_table, d_bias_offset_table, d_z_offset_table, d_weight_matrix_offset_table);
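// Optional sanity check: the cudaMemcpy return values below are ignored, so surface kernel launch/runtime errors here first
cudaError_t err = cudaGetLastError();
if (err == cudaSuccess) err = cudaDeviceSynchronize();
if (err != cudaSuccess)
{
std::cout << "CUDA error: " << cudaGetErrorString(err) << std::endl;
return 1;
}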
// Copy data from GPU to RAM
cudaMemcpy(host_activations, d_activations, bytes_activations, cudaMemcpyDeviceToHost);
cudaMemcpy(host_weights, d_weights, bytes_weights, cudaMemcpyDeviceToHost);
cudaMemcpy(host_biases, d_biases, bytes_biases, cudaMemcpyDeviceToHost);
cudaMemcpy(host_z, d_z, bytes_z, cudaMemcpyDeviceToHost);
cudaMemcpy(host_error, d_error, bytes_error, cudaMemcpyDeviceToHost);
cudaMemcpy(host_error_weights, d_error_weights, bytes_error_weights, cudaMemcpyDeviceToHost);
// Free our device memory
cudaFree(d_biases);
cudaFree(d_weights);
cudaFree(d_activations);
cudaFree(d_shape);
cudaFree(d_z);
cudaFree(d_error);
cudaFree(d_error_weights);
cudaFree(d_output);
cudaFree(d_activation_offset_table);
cudaFree(d_bias_offset_table);
cudaFree(d_z_offset_table);
cudaFree(d_weight_matrix_offset_table);
print_activations(host_activations, number_of_inputs, SHAPE, NUM_LAYERS_TOTAL);
print_error(host_error, number_of_inputs, SHAPE, NUM_LAYERS_TOTAL);
print_weight_matrix_numpy_style(host_error_weights, SHAPE, NUM_LAYERS_TOTAL);
getchar();
return 0;
}