From a355ed527360e5f845cbde8d3f9e4a4dbead175c Mon Sep 17 00:00:00 2001
From: Piotr Sowa
Date: Sun, 6 Jun 2021 18:42:06 +0200
Subject: [PATCH] DarkNet on CUDA version 2 Fixes

---
 Makefile              |   2 +-
 cfg/yolov1.cfg        |   2 +-
 examples/classifier.c |   1 -
 examples/detector.c   |   9 ++--
 src/batchnorm_layer.c |  22 ++++++---
 src/image_opencv.cpp  |   5 --
 src/layer.c           |   1 +
 src/network.c         | 111 +++++++++++++++++++-----------------------
 8 files changed, 72 insertions(+), 81 deletions(-)

diff --git a/Makefile b/Makefile
index 499c257b28d..a03690a7ad5 100644
--- a/Makefile
+++ b/Makefile
@@ -27,7 +27,7 @@ ARFLAGS=rcs
 OPTS=-Ofast
 LDFLAGS= -lm -pthread
 COMMON= -Iinclude/ -Isrc/
-CFLAGS=-Wall -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors -fPIC
+CFLAGS=-Wno-unknown-pragmas -Wno-unused-variable -Wno-unused-result -Wno-deprecated-declarations -Wno-unused-function -Wfatal-errors -fPIC
 
 ifeq ($(OPENMP), 1)
 CFLAGS+= -fopenmp
diff --git a/cfg/yolov1.cfg b/cfg/yolov1.cfg
index 06cf6e67617..7350682356d 100644
--- a/cfg/yolov1.cfg
+++ b/cfg/yolov1.cfg
@@ -25,7 +25,7 @@ batch_normalize=1
 filters=64
 size=7
 stride=2
-pad=1
+pad=3
 activation=leaky
 
 [maxpool]
diff --git a/examples/classifier.c b/examples/classifier.c
index 9c22f7952c7..59ee3eb9d95 100644
--- a/examples/classifier.c
+++ b/examples/classifier.c
@@ -647,7 +647,6 @@ void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *fi
         }
 
         if(r.data != im.data) free_image(r);
-        if (resize) free_image(r);
         free_image(im);
         if (filename) break;
     }
diff --git a/examples/detector.c b/examples/detector.c
index 4094ca0c36d..e2e2d272aaa 100644
--- a/examples/detector.c
+++ b/examples/detector.c
@@ -536,14 +536,17 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out
     fprintf(stderr, "Total Detection Time: %f Seconds\n", what_time_is_it_now() - start);
 }
 
-void validate_detector_recall(char *cfgfile, char *weightfile)
+void validate_detector_recall(char *datacfg, char *cfgfile, char *weightfile)
 {
     network *net = load_network(cfgfile, weightfile, 0);
     set_batch_network(net, 1);
     fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
     srand(time(0));
 
-    list *plist = get_paths("data/coco_val_5k.list");
+    // list *plist = get_paths("data/coco_val_5k.list");
+    list *options = read_data_cfg(datacfg);
+    char *test_images = option_find_str(options, "test", "data/test.list");
+    list *plist = get_paths(test_images);
     char **paths = (char **)list_to_array(plist);
 
     layer l = net->layers[net->n-1];
@@ -967,7 +970,7 @@ void run_detector(int argc, char **argv)
     else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear);
     else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile);
     else if(0==strcmp(argv[2], "valid2")) validate_detector_flip(datacfg, cfg, weights, outfile);
-    else if(0==strcmp(argv[2], "recall")) validate_detector_recall(cfg, weights);
+    else if(0==strcmp(argv[2], "recall")) validate_detector_recall(datacfg, cfg, weights);
    else if(0==strcmp(argv[2], "demo")) {
         list *options = read_data_cfg(datacfg);
         int classes = option_find_int(options, "classes", 20);
diff --git a/src/batchnorm_layer.c b/src/batchnorm_layer.c
index ebff387cc4b..cbda4ec0e58 100644
--- a/src/batchnorm_layer.c
+++ b/src/batchnorm_layer.c
@@ -32,6 +32,12 @@ layer make_batchnorm_layer(int batch, int w, int h, int c)
     l.rolling_mean = calloc(c, sizeof(float));
     l.rolling_variance = calloc(c, sizeof(float));
 
+    l.mean_delta = calloc(c, sizeof(float));
+    l.variance_delta = calloc(c, sizeof(float));
+
+    l.x = calloc(l.batch*l.outputs, sizeof(float));
+    l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
+
     l.forward = forward_batchnorm_layer;
     l.backward = backward_batchnorm_layer;
 #ifdef GPU
@@ -50,20 +56,20 @@ layer make_batchnorm_layer(int batch, int w, int h, int c)
     l.mean_gpu = cuda_make_array(l.mean, c);
     l.variance_gpu = cuda_make_array(l.variance, c);
 
-    l.rolling_mean_gpu = cuda_make_array(l.mean, c);
-    l.rolling_variance_gpu = cuda_make_array(l.variance, c);
+    l.rolling_mean_gpu = cuda_make_array(l.rolling_mean, c);
+    l.rolling_variance_gpu = cuda_make_array(l.rolling_variance, c);
 
-    l.mean_delta_gpu = cuda_make_array(l.mean, c);
-    l.variance_delta_gpu = cuda_make_array(l.variance, c);
+    l.mean_delta_gpu = cuda_make_array(l.mean_delta, c);
+    l.variance_delta_gpu = cuda_make_array(l.variance_delta, c);
+
+    l.x_gpu = cuda_make_array(l.x, l.batch*l.outputs);
+    l.x_norm_gpu = cuda_make_array(l.x_norm, l.batch*l.outputs);
 
-    l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs);
-    l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs);
 #ifdef CUDNN
     cudnnCreateTensorDescriptor(&l.normTensorDesc);
     cudnnCreateTensorDescriptor(&l.dstTensorDesc);
     cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
-    cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
-
+    cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
 #endif
 #endif
     return l;
diff --git a/src/image_opencv.cpp b/src/image_opencv.cpp
index 3abb9f16175..b20fb23f4f6 100644
--- a/src/image_opencv.cpp
+++ b/src/image_opencv.cpp
@@ -6,14 +6,9 @@
 #include
 
 #ifdef OPENCV
-//#include
-//#include
 #include
 #include
 #include
-//#include
-//#include "opencv2/videoio/videoio_c.h"
-//#include
 #endif
 
 #include "image.h"
diff --git a/src/layer.c b/src/layer.c
index c27b4776421..cfc314bedd7 100644
--- a/src/layer.c
+++ b/src/layer.c
@@ -12,6 +12,7 @@ void free_layer(layer l)
 #endif
         return;
     }
+    if(l.mask) free(l.mask);
     if(l.cweights) free(l.cweights);
     if(l.indexes) free(l.indexes);
     if(l.input_layers) free(l.input_layers);
diff --git a/src/network.c b/src/network.c
index e7f759ed6e6..f6288389e90 100644
--- a/src/network.c
+++ b/src/network.c
@@ -540,7 +540,8 @@ void top_predictions(network *net, int k, int *index)
 float *network_predict(network *net, float *input)
 {
     network orig = *net;
-    net->input = input;
+    //net->input = input;
+    memcpy(net->input, input, net->inputs*net->batch*sizeof(float));
     net->truth = 0;
     net->train = 0;
     net->delta = 0;
@@ -975,72 +976,58 @@ pthread_t train_network_in_thread(network *net, data d, float *err)
     return thread;
 }
 
-void merge_weights(layer l, layer base)
+void merge_weights(layer l)
 {
-    if (l.type == CONVOLUTIONAL) {
-        axpy_cpu(l.n, 1, l.bias_updates, 1, base.biases, 1);
-        axpy_cpu(l.nweights, 1, l.weight_updates, 1, base.weights, 1);
-        if (l.scales) {
-            axpy_cpu(l.n, 1, l.scale_updates, 1, base.scales, 1);
-        }
-    } else if(l.type == CONNECTED) {
-        axpy_cpu(l.outputs, 1, l.bias_updates, 1, base.biases, 1);
-        axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, base.weights, 1);
-    }
+    if (l.type == CONVOLUTIONAL) {
+        axpy_cpu(l.n, 1, l.bias_updates, 1, l.biases, 1);
+        axpy_cpu(l.nweights, 1, l.weight_updates, 1, l.weights, 1);
+        if (l.scales) {
+            axpy_cpu(l.n, 1, l.scale_updates, 1, l.scales, 1);
+        }
+    } else if(l.type == CONNECTED) {
+        axpy_cpu(l.outputs, 1, l.bias_updates, 1, l.biases, 1);
+        axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, l.weights, 1);
+    }
 }
 
 void scale_weights(layer l, float s)
 {
-    if (l.type == CONVOLUTIONAL) {
-        scal_cpu(l.n, s, l.biases, 1);
-        scal_cpu(l.nweights, s, l.weights, 1);
-        if (l.scales) {
-            scal_cpu(l.n, s, l.scales, 1);
-        }
-    } else if(l.type == CONNECTED) {
-        scal_cpu(l.outputs, s, l.biases, 1);
-        scal_cpu(l.outputs*l.inputs, s, l.weights, 1);
-    }
+    if (l.type == CONVOLUTIONAL) {
+        scal_cpu(l.n, s, l.biases, 1);
+        scal_cpu(l.nweights, s, l.weights, 1);
+        if (l.scales) {
+            scal_cpu(l.n, s, l.scales, 1);
+        }
+    } else if(l.type == CONNECTED) {
+        scal_cpu(l.outputs, s, l.biases, 1);
+        scal_cpu(l.outputs*l.inputs, s, l.weights, 1);
+    }
 }
 
-
 void pull_weights(layer l)
 {
     if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
-        cuda_pull_array(l.biases_gpu, l.bias_updates, l.n);
-        cuda_pull_array(l.weights_gpu, l.weight_updates, l.nweights);
-        if(l.scales) cuda_pull_array(l.scales_gpu, l.scale_updates, l.n);
+        opencl_pull_array_map(l.biases_gpu, l.bias_updates, l.n);
+        opencl_pull_array_map(l.weights_gpu, l.weight_updates, l.nweights);
+        if(l.scales) opencl_pull_array_map(l.scales_gpu, l.scale_updates, l.n);
     } else if(l.type == CONNECTED){
-        cuda_pull_array(l.biases_gpu, l.bias_updates, l.outputs);
-        cuda_pull_array(l.weights_gpu, l.weight_updates, l.outputs*l.inputs);
+        opencl_pull_array_map(l.biases_gpu, l.bias_updates, l.outputs);
+        opencl_pull_array_map(l.weights_gpu, l.weight_updates, l.outputs*l.inputs);
     }
 }
 
 void push_weights(layer l)
 {
     if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
-        cuda_push_array(l.biases_gpu, l.biases, l.n);
-        cuda_push_array(l.weights_gpu, l.weights, l.nweights);
-        if(l.scales) cuda_push_array(l.scales_gpu, l.scales, l.n);
+        opencl_push_array_map(l.biases_gpu, l.bias_updates, l.n);
+        opencl_push_array_map(l.weights_gpu, l.weight_updates, l.nweights);
+        if(l.scales) opencl_push_array_map(l.scales_gpu, l.scale_updates, l.n);
     } else if(l.type == CONNECTED){
-        cuda_push_array(l.biases_gpu, l.biases, l.outputs);
-        cuda_push_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
-    }
-}
-
-void distribute_weights(layer l, layer base)
-{
-    if (l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL) {
-        cuda_push_array(l.biases_gpu, base.biases, l.n);
-        cuda_push_array(l.weights_gpu, base.weights, l.nweights);
-        if (base.scales) cuda_push_array(l.scales_gpu, base.scales, l.n);
-    } else if (l.type == CONNECTED) {
-        cuda_push_array(l.biases_gpu, base.biases, l.outputs);
-        cuda_push_array(l.weights_gpu, base.weights, l.outputs*l.inputs);
+        opencl_push_array_map(l.biases_gpu, l.bias_updates, l.outputs);
+        opencl_push_array_map(l.weights_gpu, l.weight_updates, l.outputs*l.inputs);
     }
 }
 
-
 /*
 
    void pull_updates(layer l)
@@ -1127,22 +1114,22 @@ void distribute_weights(layer l, layer base)
 
 void sync_layer(network **nets, int n, int j)
 {
-    int i;
-    network *net = nets[0];
-    layer base = net->layers[j];
-    scale_weights(base, 0);
-    for (i = 0; i < n; ++i) {
-        cuda_set_device(nets[i]->gpu_index);
-        layer l = nets[i]->layers[j];
-        pull_weights(l);
-        merge_weights(l, base);
-    }
-    scale_weights(base, 1./n);
-    for (i = 0; i < n; ++i) {
-        cuda_set_device(nets[i]->gpu_index);
-        layer l = nets[i]->layers[j];
-        distribute_weights(l, base);
-    }
+    int i;
+    network *net = nets[0];
+    layer base = net->layers[j];
+    scale_weights(base, 0);
+    for (i = 0; i < n; ++i) {
+        opencl_set_device(nets[i]->gpu_index);
+        layer l = nets[i]->layers[j];
+        pull_weights(l);
+        merge_weights(l);
+    }
+    scale_weights(base, 1./n);
+    for (i = 0; i < n; ++i) {
+        opencl_set_device(nets[i]->gpu_index);
+        layer l = nets[i]->layers[j];
+        push_weights(l);
+    }
 }
 
 typedef struct{