Skip to content

Commit

Permalink
DarkNet on CUDA version 2 Fixes
Browse files (browse the repository at this point in the history)
  • Loading branch information
Piotr Sowa committed Jun 6, 2021
1 parent b33b3be commit a355ed5
Show file tree
Hide file tree
Showing 8 changed files with 72 additions and 81 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ ARFLAGS=rcs
OPTS=-Ofast
LDFLAGS= -lm -pthread
COMMON= -Iinclude/ -Isrc/
CFLAGS=-Wall -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors -fPIC
CFLAGS=-Wno-unknown-pragmas -Wno-unused-variable -Wno-unused-result -Wno-deprecated-declarations -Wno-unused-function -Wfatal-errors -fPIC

ifeq ($(OPENMP), 1)
CFLAGS+= -fopenmp
Expand Down
2 changes: 1 addition & 1 deletion cfg/yolov1.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ batch_normalize=1
filters=64
size=7
stride=2
pad=1
pad=3
activation=leaky

[maxpool]
Expand Down
1 change: 0 additions & 1 deletion examples/classifier.c
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,6 @@ void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *fi
}
if(r.data != im.data) free_image(r);

if (resize) free_image(r);
free_image(im);
if (filename) break;
}
Expand Down
9 changes: 6 additions & 3 deletions examples/detector.c
Original file line number Diff line number Diff line change
Expand Up @@ -536,14 +536,17 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out
fprintf(stderr, "Total Detection Time: %f Seconds\n", what_time_is_it_now() - start);
}

void validate_detector_recall(char *cfgfile, char *weightfile)
void validate_detector_recall(char *datacfg, char *cfgfile, char *weightfile)
{
network *net = load_network(cfgfile, weightfile, 0);
set_batch_network(net, 1);
fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
srand(time(0));

list *plist = get_paths("data/coco_val_5k.list");
// list *plist = get_paths("data/coco_val_5k.list");
list *options = read_data_cfg(datacfg);
char *test_images = option_find_str(options, "test", "data/test.list");
list *plist = get_paths(test_images);
char **paths = (char **)list_to_array(plist);

layer l = net->layers[net->n-1];
Expand Down Expand Up @@ -967,7 +970,7 @@ void run_detector(int argc, char **argv)
else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear);
else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile);
else if(0==strcmp(argv[2], "valid2")) validate_detector_flip(datacfg, cfg, weights, outfile);
else if(0==strcmp(argv[2], "recall")) validate_detector_recall(cfg, weights);
else if(0==strcmp(argv[2], "recall")) validate_detector_recall(datacfg, cfg, weights);
else if(0==strcmp(argv[2], "demo")) {
list *options = read_data_cfg(datacfg);
int classes = option_find_int(options, "classes", 20);
Expand Down
22 changes: 14 additions & 8 deletions src/batchnorm_layer.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@ layer make_batchnorm_layer(int batch, int w, int h, int c)
l.rolling_mean = calloc(c, sizeof(float));
l.rolling_variance = calloc(c, sizeof(float));

l.mean_delta = calloc(c, sizeof(float));
l.variance_delta = calloc(c, sizeof(float));

l.x = calloc(l.batch*l.outputs, sizeof(float));
l.x_norm = calloc(l.batch*l.outputs, sizeof(float));

l.forward = forward_batchnorm_layer;
l.backward = backward_batchnorm_layer;
#ifdef GPU
Expand All @@ -50,20 +56,20 @@ layer make_batchnorm_layer(int batch, int w, int h, int c)
l.mean_gpu = cuda_make_array(l.mean, c);
l.variance_gpu = cuda_make_array(l.variance, c);

l.rolling_mean_gpu = cuda_make_array(l.mean, c);
l.rolling_variance_gpu = cuda_make_array(l.variance, c);
l.rolling_mean_gpu = cuda_make_array(l.rolling_mean, c);
l.rolling_variance_gpu = cuda_make_array(l.rolling_variance, c);

l.mean_delta_gpu = cuda_make_array(l.mean, c);
l.variance_delta_gpu = cuda_make_array(l.variance, c);
l.mean_delta_gpu = cuda_make_array(l.mean_delta, c);
l.variance_delta_gpu = cuda_make_array(l.variance_delta, c);

l.x_gpu = cuda_make_array(l.x, l.batch*l.outputs);
l.x_norm_gpu = cuda_make_array(l.x_norm, l.batch*l.outputs);

l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs);
l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs);
#ifdef CUDNN
cudnnCreateTensorDescriptor(&l.normTensorDesc);
cudnnCreateTensorDescriptor(&l.dstTensorDesc);
cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);

cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
#endif
#endif
return l;
Expand Down
5 changes: 0 additions & 5 deletions src/image_opencv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,9 @@
#include <iostream>

#ifdef OPENCV
//#include <opencv2/core.hpp>
//#include <opencv2/core/cvstd.hpp>
#include <opencv2/opencv.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/highgui/highgui_c.h>
//#include <opencv2/videoio.hpp>
//#include "opencv2/videoio/videoio_c.h"
//#include <opencv2/imgcodecs.hpp>
#endif

#include "image.h"
Expand Down
1 change: 1 addition & 0 deletions src/layer.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ void free_layer(layer l)
#endif
return;
}
if(l.mask) free(l.mask);
if(l.cweights) free(l.cweights);
if(l.indexes) free(l.indexes);
if(l.input_layers) free(l.input_layers);
Expand Down
111 changes: 49 additions & 62 deletions src/network.c
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,8 @@ void top_predictions(network *net, int k, int *index)
float *network_predict(network *net, float *input)
{
network orig = *net;
net->input = input;
//net->input = input;
memcpy(net->input, input, net->inputs*net->batch*sizeof(float));
net->truth = 0;
net->train = 0;
net->delta = 0;
Expand Down Expand Up @@ -975,72 +976,58 @@ pthread_t train_network_in_thread(network *net, data d, float *err)
return thread;
}

/*
 * merge_weights: accumulation step (via axpy_cpu) called from sync_layer.
 *
 * NOTE(review): this span is a rendered diff hunk with no +/- markers, so two
 * signatures and two bodies appear back to back. Presumably the two-argument
 * signature and the first body (destination: fields of base) are the removed
 * lines, and the one-argument signature and the second body (destination:
 * fields of l) are the added lines. Confirm against the commit before reading
 * this as compilable C.
 *
 * Parameters:
 *   l    - layer whose bias_updates / weight_updates / scale_updates are
 *          passed as the source operand.
 *   base - (two-argument form only) layer whose biases / weights / scales are
 *          passed as the destination operand.
 *
 * Only CONVOLUTIONAL and CONNECTED layer types are handled here; any other
 * l.type falls through without doing anything. Note that pull_weights and
 * push_weights also test for DECONVOLUTIONAL, which this function does not.
 * The scale term is only touched when l.scales is non-null.
 *
 * axpy_cpu is defined elsewhere; presumably axpy_cpu(N, ALPHA, X, INCX, Y, INCY)
 * computes Y += ALPHA * X (BLAS axpy convention) -- TODO confirm.
 */
void merge_weights(layer l, layer base)
void merge_weights(layer l)
{
/* First body in the hunk: destination operands are base.biases, base.weights
 * and base.scales. */
if (l.type == CONVOLUTIONAL) {
axpy_cpu(l.n, 1, l.bias_updates, 1, base.biases, 1);
axpy_cpu(l.nweights, 1, l.weight_updates, 1, base.weights, 1);
if (l.scales) {
axpy_cpu(l.n, 1, l.scale_updates, 1, base.scales, 1);
}
} else if(l.type == CONNECTED) {
axpy_cpu(l.outputs, 1, l.bias_updates, 1, base.biases, 1);
axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, base.weights, 1);
}
/* Second body in the hunk: same calls and element counts, but the destination
 * operands are l's own biases, weights and scales.
 * NOTE(review): sync_layer zeroes and later rescales the layer taken from
 * nets[0]. With l as the destination, the updates pulled for nets[i], i > 0,
 * would presumably never be accumulated into that layer. This looks like a
 * regression -- verify. */
if (l.type == CONVOLUTIONAL) {
axpy_cpu(l.n, 1, l.bias_updates, 1, l.biases, 1);
axpy_cpu(l.nweights, 1, l.weight_updates, 1, l.weights, 1);
if (l.scales) {
axpy_cpu(l.n, 1, l.scale_updates, 1, l.scales, 1);
}
} else if(l.type == CONNECTED) {
axpy_cpu(l.outputs, 1, l.bias_updates, 1, l.biases, 1);
axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, l.weights, 1);
}
}

/*
 * scale_weights: passes a layer's biases, weights and (when present) scales
 * arrays to scal_cpu together with the factor s.
 *
 * Parameters:
 *   l - layer to operate on; only CONVOLUTIONAL and CONNECTED types are
 *       handled, any other l.type is left untouched.
 *   s - factor forwarded to scal_cpu. sync_layer calls this with 0 and with
 *       1./n.
 *
 * scal_cpu is defined elsewhere; presumably scal_cpu(N, ALPHA, X, INCX)
 * computes X *= ALPHA (BLAS scal convention) -- TODO confirm.
 *
 * NOTE(review): this span is a rendered diff hunk with no +/- markers. The
 * body appears twice with identical text, so the change was presumably
 * whitespace-only (re-indentation) and only one copy exists in the real file.
 */
void scale_weights(layer l, float s)
{
/* First copy of the body in the hunk. */
if (l.type == CONVOLUTIONAL) {
scal_cpu(l.n, s, l.biases, 1);
scal_cpu(l.nweights, s, l.weights, 1);
if (l.scales) {
scal_cpu(l.n, s, l.scales, 1);
}
} else if(l.type == CONNECTED) {
scal_cpu(l.outputs, s, l.biases, 1);
scal_cpu(l.outputs*l.inputs, s, l.weights, 1);
}
/* Second copy of the body in the hunk (textually identical to the first). */
if (l.type == CONVOLUTIONAL) {
scal_cpu(l.n, s, l.biases, 1);
scal_cpu(l.nweights, s, l.weights, 1);
if (l.scales) {
scal_cpu(l.n, s, l.scales, 1);
}
} else if(l.type == CONNECTED) {
scal_cpu(l.outputs, s, l.biases, 1);
scal_cpu(l.outputs*l.inputs, s, l.weights, 1);
}
}


/*
 * pull_weights: for one layer, hands the *_gpu parameter buffers and the
 * matching *_updates buffers to the backend's "pull" routine.
 *
 * Parameters:
 *   l - layer to operate on. CONVOLUTIONAL and DECONVOLUTIONAL layers use
 *       l.n / l.nweights as element counts; CONNECTED layers use l.outputs
 *       and l.outputs*l.inputs. Other layer types are ignored.
 *
 * Note the destination is bias_updates / weight_updates / scale_updates, not
 * biases / weights / scales; merge_weights then reads those *_updates buffers.
 * The scales buffer is only pulled when l.scales is non-null.
 *
 * Presumably the pull routines copy from the first argument (device buffer)
 * into the second (host buffer) -- TODO confirm against their definitions.
 *
 * NOTE(review): this span is a rendered diff hunk with no +/- markers. The
 * cuda_pull_array lines and the opencl_pull_array_map lines are presumably the
 * removed and added versions of the same statements, not code meant to run
 * together; confirm against the commit.
 */
void pull_weights(layer l)
{
if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
cuda_pull_array(l.biases_gpu, l.bias_updates, l.n);
cuda_pull_array(l.weights_gpu, l.weight_updates, l.nweights);
if(l.scales) cuda_pull_array(l.scales_gpu, l.scale_updates, l.n);
opencl_pull_array_map(l.biases_gpu, l.bias_updates, l.n);
opencl_pull_array_map(l.weights_gpu, l.weight_updates, l.nweights);
if(l.scales) opencl_pull_array_map(l.scales_gpu, l.scale_updates, l.n);
} else if(l.type == CONNECTED){
cuda_pull_array(l.biases_gpu, l.bias_updates, l.outputs);
cuda_pull_array(l.weights_gpu, l.weight_updates, l.outputs*l.inputs);
opencl_pull_array_map(l.biases_gpu, l.bias_updates, l.outputs);
opencl_pull_array_map(l.weights_gpu, l.weight_updates, l.outputs*l.inputs);
}
}

/*
 * push_weights / distribute_weights: hand a layer's *_gpu parameter buffers
 * and a second buffer to the backend's "push" routine.
 *
 * NOTE(review): this span is a rendered diff hunk with no +/- markers, and the
 * hunk alignment interleaves two functions. Presumably:
 *   - the cuda_push_array lines and the whole of distribute_weights are
 *     removed lines;
 *   - the opencl_push_array_map lines (including the two that appear at the
 *     end of distribute_weights' CONNECTED branch in this view) are the added
 *     body of push_weights.
 * Confirm against the commit before reading this as compilable C.
 *
 * Parameters:
 *   l    - layer whose *_gpu buffers are the first argument of every push.
 *   base - (distribute_weights only) layer whose biases / weights / scales
 *          are the second argument of every push.
 *
 * Presumably the push routines copy from the second argument (host buffer)
 * into the first (device buffer) -- TODO confirm against their definitions.
 */
void push_weights(layer l)
{
if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
cuda_push_array(l.biases_gpu, l.biases, l.n);
cuda_push_array(l.weights_gpu, l.weights, l.nweights);
if(l.scales) cuda_push_array(l.scales_gpu, l.scales, l.n);
/* NOTE(review): the opencl_push_array_map lines pass bias_updates /
 * weight_updates / scale_updates, whereas the cuda_push_array lines above
 * pass biases / weights / scales. The *_updates buffers are what pull_weights
 * fills, so this looks like it pushes back the values that were just pulled
 * rather than the merged ones -- verify. */
opencl_push_array_map(l.biases_gpu, l.bias_updates, l.n);
opencl_push_array_map(l.weights_gpu, l.weight_updates, l.nweights);
if(l.scales) opencl_push_array_map(l.scales_gpu, l.scale_updates, l.n);
} else if(l.type == CONNECTED){
cuda_push_array(l.biases_gpu, l.biases, l.outputs);
cuda_push_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
}
}

/* distribute_weights: pushes base's biases / weights / scales into l's *_gpu
 * buffers. The scales push is guarded by base.scales being non-null. The
 * sync_layer body that uses opencl_set_device calls push_weights instead. */
void distribute_weights(layer l, layer base)
{
if (l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL) {
cuda_push_array(l.biases_gpu, base.biases, l.n);
cuda_push_array(l.weights_gpu, base.weights, l.nweights);
if (base.scales) cuda_push_array(l.scales_gpu, base.scales, l.n);
} else if (l.type == CONNECTED) {
cuda_push_array(l.biases_gpu, base.biases, l.outputs);
cuda_push_array(l.weights_gpu, base.weights, l.outputs*l.inputs);
/* The next two lines presumably belong to push_weights' CONNECTED branch
 * (see the note at the top of this span); they also pass *_updates buffers. */
opencl_push_array_map(l.biases_gpu, l.bias_updates, l.outputs);
opencl_push_array_map(l.weights_gpu, l.weight_updates, l.outputs*l.inputs);
}
}


/*
void pull_updates(layer l)
Expand Down Expand Up @@ -1127,22 +1114,22 @@ void distribute_weights(layer l, layer base)

/*
 * sync_layer: synchronises layer j across the n networks in nets.
 *
 * Parameters:
 *   nets - array of n network pointers; nets[0]'s layer j is used as `base`.
 *   n    - number of networks; also the divisor in the 1./n rescale.
 *   j    - index of the layer to synchronise in every network.
 *
 * Sequence, as written:
 *   1. scale_weights(base, 0)
 *   2. for each network: select its device, pull_weights, merge_weights
 *   3. scale_weights(base, 1./n)
 *   4. for each network: select its device, then distribute_weights (first
 *      body) or push_weights (second body)
 * The apparent intent is to average the layer's parameters over all networks
 * and send the average back to each of them.
 *
 * `layer base` and `layer l` are struct copies of the network's layer entry;
 * the helpers operate through the pointer fields those copies carry.
 *
 * NOTE(review): this span is a rendered diff hunk with no +/- markers, so the
 * body appears twice. Presumably the first body (cuda_set_device,
 * merge_weights(l, base), distribute_weights) is the removed version and the
 * second (opencl_set_device, merge_weights(l), push_weights) is the added
 * one; confirm against the commit.
 */
void sync_layer(network **nets, int n, int j)
{
/* First body in the hunk. */
int i;
network *net = nets[0];
layer base = net->layers[j];
scale_weights(base, 0);
for (i = 0; i < n; ++i) {
cuda_set_device(nets[i]->gpu_index);
layer l = nets[i]->layers[j];
pull_weights(l);
merge_weights(l, base);
}
scale_weights(base, 1./n);
for (i = 0; i < n; ++i) {
cuda_set_device(nets[i]->gpu_index);
layer l = nets[i]->layers[j];
distribute_weights(l, base);
}
/* Second body in the hunk.
 * NOTE(review): here neither merge_weights(l) nor push_weights(l) receives
 * `base`, so the 1./n rescale of base does not obviously reach the other
 * networks -- verify that the averaging still takes effect. */
int i;
network *net = nets[0];
layer base = net->layers[j];
scale_weights(base, 0);
for (i = 0; i < n; ++i) {
opencl_set_device(nets[i]->gpu_index);
layer l = nets[i]->layers[j];
pull_weights(l);
merge_weights(l);
}
scale_weights(base, 1./n);
for (i = 0; i < n; ++i) {
opencl_set_device(nets[i]->gpu_index);
layer l = nets[i]->layers[j];
push_weights(l);
}
}

typedef struct{
Expand Down

0 comments on commit a355ed5

Please sign in to comment.