From a355ed527360e5f845cbde8d3f9e4a4dbead175c Mon Sep 17 00:00:00 2001
From: Piotr Sowa
Date: Sun, 6 Jun 2021 18:42:06 +0200
Subject: [PATCH] DarkNet on CUDA version 2 Fixes

---
 Makefile              |   2 +-
 cfg/yolov1.cfg        |   2 +-
 examples/classifier.c |   1 -
 examples/detector.c   |   9 ++--
 src/batchnorm_layer.c |  22 ++++++---
 src/image_opencv.cpp  |   5 --
 src/layer.c           |   1 +
 src/network.c         | 111 +++++++++++++++++++-----------------------
 8 files changed, 72 insertions(+), 81 deletions(-)

diff --git a/Makefile b/Makefile
index 499c257b28d..a03690a7ad5 100644
--- a/Makefile
+++ b/Makefile
@@ -27,7 +27,7 @@ ARFLAGS=rcs
 OPTS=-Ofast
 LDFLAGS= -lm -pthread
 COMMON= -Iinclude/ -Isrc/
-CFLAGS=-Wall -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors -fPIC
+CFLAGS=-Wno-unknown-pragmas -Wno-unused-variable -Wno-unused-result -Wno-deprecated-declarations -Wno-unused-function -Wfatal-errors -fPIC
 
 ifeq ($(OPENMP), 1)
 CFLAGS+= -fopenmp
diff --git a/cfg/yolov1.cfg b/cfg/yolov1.cfg
index 06cf6e67617..7350682356d 100644
--- a/cfg/yolov1.cfg
+++ b/cfg/yolov1.cfg
@@ -25,7 +25,7 @@ batch_normalize=1
 filters=64
 size=7
 stride=2
-pad=1
+pad=3
 activation=leaky
 
 [maxpool]
diff --git a/examples/classifier.c b/examples/classifier.c
index 9c22f7952c7..59ee3eb9d95 100644
--- a/examples/classifier.c
+++ b/examples/classifier.c
@@ -647,7 +647,6 @@ void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *fi
         }
 
         if(r.data != im.data) free_image(r);
-        if (resize) free_image(r);
         free_image(im);
         if (filename) break;
     }
diff --git a/examples/detector.c b/examples/detector.c
index 4094ca0c36d..e2e2d272aaa 100644
--- a/examples/detector.c
+++ b/examples/detector.c
@@ -536,14 +536,17 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out
     fprintf(stderr, "Total Detection Time: %f Seconds\n", what_time_is_it_now() - start);
 }
 
-void validate_detector_recall(char *cfgfile, char *weightfile)
+void validate_detector_recall(char *datacfg, char *cfgfile, char *weightfile)
 {
     network *net = load_network(cfgfile, weightfile, 0);
     set_batch_network(net, 1);
     fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
     srand(time(0));
 
-    list *plist = get_paths("data/coco_val_5k.list");
+    // list *plist = get_paths("data/coco_val_5k.list");
+    list *options = read_data_cfg(datacfg);
+    char *test_images = option_find_str(options, "test", "data/test.list");
+    list *plist = get_paths(test_images);
     char **paths = (char **)list_to_array(plist);
 
     layer l = net->layers[net->n-1];
@@ -967,7 +970,7 @@ void run_detector(int argc, char **argv)
     else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear);
     else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile);
     else if(0==strcmp(argv[2], "valid2")) validate_detector_flip(datacfg, cfg, weights, outfile);
-    else if(0==strcmp(argv[2], "recall")) validate_detector_recall(cfg, weights);
+    else if(0==strcmp(argv[2], "recall")) validate_detector_recall(datacfg, cfg, weights);
    else if(0==strcmp(argv[2], "demo")) {
         list *options = read_data_cfg(datacfg);
         int classes = option_find_int(options, "classes", 20);
diff --git a/src/batchnorm_layer.c b/src/batchnorm_layer.c
index ebff387cc4b..cbda4ec0e58 100644
--- a/src/batchnorm_layer.c
+++ b/src/batchnorm_layer.c
@@ -32,6 +32,12 @@ layer make_batchnorm_layer(int batch, int w, int h, int c)
     l.rolling_mean = calloc(c, sizeof(float));
     l.rolling_variance = calloc(c, sizeof(float));
 
+    l.mean_delta = calloc(c, sizeof(float));
+    l.variance_delta = calloc(c, sizeof(float));
+
+    l.x = calloc(l.batch*l.outputs, sizeof(float));
+    l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
+
     l.forward = forward_batchnorm_layer;
     l.backward = backward_batchnorm_layer;
 #ifdef GPU
@@ -50,20 +56,20 @@ layer make_batchnorm_layer(int batch, int w, int h, int c)
     l.mean_gpu = cuda_make_array(l.mean, c);
     l.variance_gpu = cuda_make_array(l.variance, c);
 
-    l.rolling_mean_gpu = cuda_make_array(l.mean, c);
-    l.rolling_variance_gpu = cuda_make_array(l.variance, c);
+    l.rolling_mean_gpu = cuda_make_array(l.rolling_mean, c);
+    l.rolling_variance_gpu = cuda_make_array(l.rolling_variance, c);
 
-    l.mean_delta_gpu = cuda_make_array(l.mean, c);
-    l.variance_delta_gpu = cuda_make_array(l.variance, c);
+    l.mean_delta_gpu = cuda_make_array(l.mean_delta, c);
+    l.variance_delta_gpu = cuda_make_array(l.variance_delta, c);
+
+    l.x_gpu = cuda_make_array(l.x, l.batch*l.outputs);
+    l.x_norm_gpu = cuda_make_array(l.x_norm, l.batch*l.outputs);
 
-    l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs);
-    l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs);
 #ifdef CUDNN
     cudnnCreateTensorDescriptor(&l.normTensorDesc);
     cudnnCreateTensorDescriptor(&l.dstTensorDesc);
     cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
-    cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
-
+    cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
 #endif
 #endif
     return l;
diff --git a/src/image_opencv.cpp b/src/image_opencv.cpp
index 3abb9f16175..b20fb23f4f6 100644
--- a/src/image_opencv.cpp
+++ b/src/image_opencv.cpp
@@ -6,14 +6,9 @@
 #include
 
 #ifdef OPENCV
-//#include
-//#include
 #include
 #include
 #include
-//#include
-//#include "opencv2/videoio/videoio_c.h"
-//#include
 #endif
 
 #include "image.h"
diff --git a/src/layer.c b/src/layer.c
index c27b4776421..cfc314bedd7 100644
--- a/src/layer.c
+++ b/src/layer.c
@@ -12,6 +12,7 @@ void free_layer(layer l)
 #endif
         return;
     }
+    if(l.mask) free(l.mask);
     if(l.cweights) free(l.cweights);
     if(l.indexes) free(l.indexes);
     if(l.input_layers) free(l.input_layers);
diff --git a/src/network.c b/src/network.c
index e7f759ed6e6..f6288389e90 100644
--- a/src/network.c
+++ b/src/network.c
@@ -540,7 +540,8 @@ void top_predictions(network *net, int k, int *index)
 float *network_predict(network *net, float *input)
 {
     network orig = *net;
-    net->input = input;
+    //net->input = input;
+    memcpy(net->input, input, net->inputs*net->batch*sizeof(float));
     net->truth = 0;
     net->train = 0;
     net->delta = 0;
@@ -975,72 +976,58 @@ pthread_t train_network_in_thread(network *net, data d, float *err)
     return thread;
 }
 
-void merge_weights(layer l, layer base)
+void merge_weights(layer l)
 {
-    if (l.type == CONVOLUTIONAL) {
-        axpy_cpu(l.n, 1, l.bias_updates, 1, base.biases, 1);
-        axpy_cpu(l.nweights, 1, l.weight_updates, 1, base.weights, 1);
-        if (l.scales) {
-            axpy_cpu(l.n, 1, l.scale_updates, 1, base.scales, 1);
-        }
-    } else if(l.type == CONNECTED) {
-        axpy_cpu(l.outputs, 1, l.bias_updates, 1, base.biases, 1);
-        axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, base.weights, 1);
-    }
+    if (l.type == CONVOLUTIONAL) {
+        axpy_cpu(l.n, 1, l.bias_updates, 1, l.biases, 1);
+        axpy_cpu(l.nweights, 1, l.weight_updates, 1, l.weights, 1);
+        if (l.scales) {
+            axpy_cpu(l.n, 1, l.scale_updates, 1, l.scales, 1);
+        }
+    } else if(l.type == CONNECTED) {
+        axpy_cpu(l.outputs, 1, l.bias_updates, 1, l.biases, 1);
+        axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, l.weights, 1);
+    }
 }
 
 void scale_weights(layer l, float s)
 {
-    if (l.type == CONVOLUTIONAL) {
-        scal_cpu(l.n, s, l.biases, 1);
-        scal_cpu(l.nweights, s, l.weights, 1);
-        if (l.scales) {
-            scal_cpu(l.n, s, l.scales, 1);
-        }
-    } else if(l.type == CONNECTED) {
-        scal_cpu(l.outputs, s, l.biases, 1);
-        scal_cpu(l.outputs*l.inputs, s, l.weights, 1);
-    }
+    if (l.type == CONVOLUTIONAL) {
+        scal_cpu(l.n, s, l.biases, 1);
+        scal_cpu(l.nweights, s, l.weights, 1);
+        if (l.scales) {
+            scal_cpu(l.n, s, l.scales, 1);
+        }
+    } else if(l.type == CONNECTED) {
+        scal_cpu(l.outputs, s, l.biases, 1);
+        scal_cpu(l.outputs*l.inputs, s, l.weights, 1);
+    }
 }
 
-
 void pull_weights(layer l)
 {
     if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
-        cuda_pull_array(l.biases_gpu, l.bias_updates, l.n);
-        cuda_pull_array(l.weights_gpu, l.weight_updates, l.nweights);
-        if(l.scales) cuda_pull_array(l.scales_gpu, l.scale_updates, l.n);
+        opencl_pull_array_map(l.biases_gpu, l.bias_updates, l.n);
+        opencl_pull_array_map(l.weights_gpu, l.weight_updates, l.nweights);
+        if(l.scales) opencl_pull_array_map(l.scales_gpu, l.scale_updates, l.n);
     } else if(l.type == CONNECTED){
-        cuda_pull_array(l.biases_gpu, l.bias_updates, l.outputs);
-        cuda_pull_array(l.weights_gpu, l.weight_updates, l.outputs*l.inputs);
+        opencl_pull_array_map(l.biases_gpu, l.bias_updates, l.outputs);
+        opencl_pull_array_map(l.weights_gpu, l.weight_updates, l.outputs*l.inputs);
     }
 }
 
 void push_weights(layer l)
 {
     if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
-        cuda_push_array(l.biases_gpu, l.biases, l.n);
-        cuda_push_array(l.weights_gpu, l.weights, l.nweights);
-        if(l.scales) cuda_push_array(l.scales_gpu, l.scales, l.n);
+        opencl_push_array_map(l.biases_gpu, l.bias_updates, l.n);
+        opencl_push_array_map(l.weights_gpu, l.weight_updates, l.nweights);
+        if(l.scales) opencl_push_array_map(l.scales_gpu, l.scale_updates, l.n);
     } else if(l.type == CONNECTED){
-        cuda_push_array(l.biases_gpu, l.biases, l.outputs);
-        cuda_push_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
-    }
-}
-
-void distribute_weights(layer l, layer base)
-{
-    if (l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL) {
-        cuda_push_array(l.biases_gpu, base.biases, l.n);
-        cuda_push_array(l.weights_gpu, base.weights, l.nweights);
-        if (base.scales) cuda_push_array(l.scales_gpu, base.scales, l.n);
-    } else if (l.type == CONNECTED) {
-        cuda_push_array(l.biases_gpu, base.biases, l.outputs);
-        cuda_push_array(l.weights_gpu, base.weights, l.outputs*l.inputs);
+        opencl_push_array_map(l.biases_gpu, l.bias_updates, l.outputs);
+        opencl_push_array_map(l.weights_gpu, l.weight_updates, l.outputs*l.inputs);
     }
 }
 
-
 /*
 
    void pull_updates(layer l)
@@ -1127,22 +1114,22 @@ void distribute_weights(layer l, layer base)
 
 void sync_layer(network **nets, int n, int j)
 {
-    int i;
-    network *net = nets[0];
-    layer base = net->layers[j];
-    scale_weights(base, 0);
-    for (i = 0; i < n; ++i) {
-        cuda_set_device(nets[i]->gpu_index);
-        layer l = nets[i]->layers[j];
-        pull_weights(l);
-        merge_weights(l, base);
-    }
-    scale_weights(base, 1./n);
-    for (i = 0; i < n; ++i) {
-        cuda_set_device(nets[i]->gpu_index);
-        layer l = nets[i]->layers[j];
-        distribute_weights(l, base);
-    }
+    int i;
+    network *net = nets[0];
+    layer base = net->layers[j];
+    scale_weights(base, 0);
+    for (i = 0; i < n; ++i) {
+        opencl_set_device(nets[i]->gpu_index);
+        layer l = nets[i]->layers[j];
+        pull_weights(l);
+        merge_weights(l);
+    }
+    scale_weights(base, 1./n);
+    for (i = 0; i < n; ++i) {
+        opencl_set_device(nets[i]->gpu_index);
+        layer l = nets[i]->layers[j];
+        push_weights(l);
+    }
 }
 
 typedef struct{