From 69601cd8b0860156a5fca0a61580bf10fe5aa367 Mon Sep 17 00:00:00 2001
From: acxz
Date: Mon, 10 Jun 2019 12:44:18 -0400
Subject: [PATCH 01/86] added cmake build instructions and reorganized some install instructions

---
 README.md | 48 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 52df11995a8..97b0bae824c 100644
--- a/README.md
+++ b/README.md
@@ -18,9 +18,12 @@ More details: http://pjreddie.com/darknet/yolo/
 0. [Improvements in this repository](#improvements-in-this-repository)
 1. [How to use](#how-to-use-on-the-command-line)
-2. [How to compile on Linux](#how-to-compile-on-linux)
+2. How to compile on Linux
+    * [Using cmake](#how-to-compile-on-linux-using-cmake)
+    * [Using make](#how-to-compile-on-linux-using-make)
 3. How to compile on Windows
    * [Using vcpkg](#how-to-compile-on-windows-using-vcpkg)
+   * [Using Cmake-GUI](#how-to-compile-on-windows-using-Cmake-GUI)
    * [Legacy way](#how-to-compile-on-windows-legacy-way)
 4. [How to train (Pascal VOC Data)](#how-to-train-pascal-voc-data)
 5. [How to train with multi-GPU:](#how-to-train-with-multi-gpu)
@@ -50,10 +53,6 @@ More details: http://pjreddie.com/darknet/yolo/
 * **GPU with CC >= 3.0**: https://en.wikipedia.org/wiki/CUDA#GPUs_supported
 * on Linux **GCC or Clang**, on Windows **MSVC 2015/2017/2019** https://visualstudio.microsoft.com/thank-you-downloading-visual-studio/?sku=Community
-Compiling on **Windows** by using `Cmake-GUI` as on this [**IMAGE**](https://user-images.githubusercontent.com/4096485/55107892-6becf380-50e3-11e9-9a0a-556a943c429a.png): Configure -> Optional platform for generator (Set: x64) -> Finish -> Generate -> Open Project -> x64 & Release -> Build -> Build solution
-
-Compiling on **Linux** by using command `make` (or alternative way by using command: `cmake . && make` )
-
 #### Pre-trained models
 There are weights-file for different cfg-files (smaller size -> faster speed & lower accuracy:
@@ -154,7 +153,23 @@ On Linux find executable file `./darknet` in the root directory, while on Window
 * Yolo v3 COCO-model: `darknet.exe detector demo data/coco.data yolov3.cfg yolov3.weights http://192.168.0.80:8080/video?dummy=param.mjpg -i 0`
-### How to compile on Linux
+### How to compile on Linux (using `cmake`)
+
+The `CMakeLists.txt` attempts to find installed optional dependencies such as
+CUDA, cuDNN, and ZED, and builds against them. It also creates a shared-object
+library file so that `darknet` can be used for code development.
+
+Inside the cloned repository:
+
+```
+mkdir build-release
+cd build-release
+cmake ..
+make
+make install
+```
+
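For reference, the same build written out with two common, purely optional conveniences: an explicit build type and a parallel compile. This is a minimal sketch using only generic CMake/make features (`CMAKE_BUILD_TYPE`, `make -j`), not options defined by this project's `CMakeLists.txt`:

```
mkdir build-release
cd build-release
cmake -DCMAKE_BUILD_TYPE=Release ..   # standard CMake variable: optimized build
make -j"$(nproc)"                     # compile in parallel on all CPU cores
sudo make install                     # optional: install system-wide
```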
+### How to compile on Linux (using `make`)

Just do `make` in the darknet directory. Before make, you can set such options in the `Makefile`: [link](https://github.com/AlexeyAB/darknet/blob/9c1b9a2cf6363546c152251be578a21f3c3caec6/Makefile#L1)
@@ -201,6 +216,19 @@ PS Code\vcpkg> .\vcpkg install pthreads opencv[ffmpeg] #replace with ope
 9. Open Powershell, go to the `darknet` folder and build with the command `.\build.ps1`. If you want to use Visual Studio, you will find two custom solutions created for you by CMake after the build, one in `build_win_debug` and the other in `build_win_release`, containing all the appropriate config flags for your system.
+### How to compile on Windows (using `Cmake-GUI`)
+
+Using `Cmake-GUI` as shown here on this [**IMAGE**](https://user-images.githubusercontent.com/4096485/55107892-6becf380-50e3-11e9-9a0a-556a943c429a.png):
+
+1. Configure
+2. Optional platform for generator (Set: x64)
+3. Finish
+4. Generate
+5. Open Project
+6. x64 & Release
+7. Build
+8. Build solution
+
 ### How to compile on Windows (legacy way)
 1. If you have **CUDA 10.0, cuDNN 7.4 and OpenCV 3.x** (with paths: `C:\opencv_3.0\opencv\build\include` & `C:\opencv_3.0\opencv\build\x64\vc14\lib`), then open `build\darknet\darknet.sln`, set **x64** and **Release** https://hsto.org/webt/uh/fk/-e/uhfk-eb0q-hwd9hsxhrikbokd6u.jpeg and do the: Build -> Build darknet. Also add Windows system variable `CUDNN` with path to CUDNN: https://user-images.githubusercontent.com/4096485/53249764-019ef880-36ca-11e9-8ffe-d9cf47e7e462.jpg
@@ -596,8 +624,12 @@ With example of: `train.txt`, `obj.names`, `obj.data`, `yolo-obj.cfg`, `air`1-6`
 ## How to use Yolo as DLL and SO libraries
-* on Linux - set `LIBSO=1` in the `Makefile` and do `make`
-* on Windows - compile `build\darknet\yolo_cpp_dll.sln` or `build\darknet\yolo_cpp_dll_no_gpu.sln` solution
+* on Linux
+    * build `darknet` using `cmake` or
+    * set `LIBSO=1` in the `Makefile` and do `make`
+* on Windows
+    * compile `build\darknet\yolo_cpp_dll.sln` solution or
+    * compile `build\darknet\yolo_cpp_dll_no_gpu.sln` solution
 There are 2 APIs:
 * C API: https://github.com/AlexeyAB/darknet/blob/master/include/darknet.h

From fead96a0022eb5b160f87324478d41c60f5166a9 Mon Sep 17 00:00:00 2001
From: acxz
Date: Mon, 10 Jun 2019 13:00:05 -0400
Subject: [PATCH 02/86] fixed link to cmake-gui section

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 97b0bae824c..521c2ab59f4 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ More details: http://pjreddie.com/darknet/yolo/
    * [Using make](#how-to-compile-on-linux-using-make)
 3. How to compile on Windows
    * [Using vcpkg](#how-to-compile-on-windows-using-vcpkg)
-   * [Using Cmake-GUI](#how-to-compile-on-windows-using-Cmake-GUI)
+   * [Using Cmake-GUI](#how-to-compile-on-windows-using-cmake-gui)
    * [Legacy way](#how-to-compile-on-windows-legacy-way)
 4. [How to train (Pascal VOC Data)](#how-to-train-pascal-voc-data)
 5. [How to train with multi-GPU:](#how-to-train-with-multi-gpu)

From de07ab6924913c868e8d4bc1e24a5167e8e20a66 Mon Sep 17 00:00:00 2001
From: acxz
Date: Wed, 12 Jun 2019 12:14:25 -0400
Subject: [PATCH 03/86] added more ways to create SO and DLL files

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 521c2ab59f4..2f85a17c15e 100644
--- a/README.md
+++ b/README.md
@@ -625,11 +625,13 @@ With example of: `train.txt`, `obj.names`, `obj.data`, `yolo-obj.cfg`, `air`1-6`
 ## How to use Yolo as DLL and SO libraries
 * on Linux
+    * using `build.sh` or
    * build `darknet` using `cmake` or
    * set `LIBSO=1` in the `Makefile` and do `make`
 * on Windows
+    * using `build.ps1` or
+    * build `darknet` using `cmake` or
-    * compile `build\darknet\yolo_cpp_dll.sln` solution or
-    * compile `build\darknet\yolo_cpp_dll_no_gpu.sln` solution
+    * compile `build\darknet\yolo_cpp_dll.sln` solution or `build\darknet\yolo_cpp_dll_no_gpu.sln` solution
 There are 2 APIs:
 * C API: https://github.com/AlexeyAB/darknet/blob/master/include/darknet.h

From 993af0fb5b5ec8fe3303d73cbbbcee2938b3051f Mon Sep 17 00:00:00 2001
From: shooorf
Date: Wed, 26 Jun 2019 20:47:09 +0300
Subject: [PATCH 04/86] Check if image does not require resizing

---
 src/image.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/image.c b/src/image.c
index 2f085801dbe..995fa849ddd 100644
--- a/src/image.c
+++ b/src/image.c
@@ -1275,6 +1275,8 @@ float bilinear_interpolate(image im, float x, float y, int c)
 image resize_image(image im, int w, int h)
 {
+    if (im.w == w && im.h == h) return copy_image(im);
+
     image resized = make_image(w, h, im.c);
     image part = make_image(w, im.h, im.c);
     int r, c, k;
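The two added lines short-circuit `resize_image()` when the target size already matches. Note that they return `copy_image(im)` rather than `im` itself, which keeps the ownership contract uniform: the result is always a freshly allocated image that the caller may free independently of the input. A minimal sketch of that calling pattern (illustrative caller code, not part of the patch; it assumes darknet's image API as declared in `src/image.h`):

```
#include <assert.h>
#include "image.h"  /* darknet image API: load_image, resize_image, free_image */

void resize_ownership_demo(void)
{
    image in  = load_image("dog.jpg", 0, 0, 3);  /* any test image */
    image out = resize_image(in, in.w, in.h);    /* hits the new fast path */
    assert(out.data != in.data);                 /* distinct buffer, not an alias */
    free_image(out);                             /* always safe to free both */
    free_image(in);
}
```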
[How to train with multi-GPU:](#how-to-train-with-multi-gpu)

From 8d80a65288df8aac8a7080e17bcdea6136bd186d Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Tue, 27 Aug 2019 14:04:39 +0300
Subject: [PATCH 05/86] Max pool layer can use stride=2 or stride_x=2 stride_y=4 (isn't tested well)

---
 include/darknet.h | 2 ++
 src/box.c | 2 ++
 src/maxpool_layer.c | 30 ++++++++++++++++++------------
 src/maxpool_layer.h | 2 +-
 src/maxpool_layer_kernels.cu | 31 ++++++++++++++++---------------
 src/parser.c | 14 ++++++++------
 6 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/include/darknet.h b/include/darknet.h
index 5d87a8323a4..5cfd274db52 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -208,6 +208,8 @@ struct layer {
     int size;
     int side;
     int stride;
+    int stride_x;
+    int stride_y;
     int dilation;
     int maxpool_depth;
     int out_channels;
diff --git a/src/box.c b/src/box.c
index 640f54a299e..1b5c4998a6b 100644
--- a/src/box.c
+++ b/src/box.c
@@ -207,6 +207,8 @@ dxrep dx_box_iou(box pred, box truth, IOU_LOSS iou_loss) {
         p_dr = ((U * dI_wrt_r) - (I * dU_wrt_r)) / (U * U);
     }
+    // GIoU = I/U - (C-U)/C
+    // C is the smallest convex hull that encloses both Detection and Truth
     if (iou_loss == GIOU) {
         if (C > 0) {
             // apply "C" term from gIOU
diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c
index 4d2ee49f57f..000efe90663 100644
--- a/src/maxpool_layer.c
+++ b/src/maxpool_layer.c
@@ -34,8 +34,8 @@ void cudnn_maxpool_setup(layer *l)
         l->size,
         l->pad/2, //0, //l.pad,
         l->pad/2, //0, //l.pad,
-        l->stride,
-        l->stride);
+        l->stride_x,
+        l->stride_y);
     cudnnCreateTensorDescriptor(&l->srcTensorDesc);
     cudnnCreateTensorDescriptor(&l->dstTensorDesc);
 }
-maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride, int padding, int maxpool_depth, int out_channels)
+maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels)
 {
     maxpool_layer l = { (LAYER_TYPE)0 };
     l.type = MAXPOOL;
@@ -62,14 +62,16 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s
         l.out_h = l.h;
     }
     else {
-        l.out_w = (w + padding - size) / stride + 1;
-        l.out_h = (h + padding - size) / stride + 1;
+        l.out_w = (w + padding - size) / stride_x + 1;
+        l.out_h = (h + padding - size) / stride_y + 1;
         l.out_c = c;
     }
     l.outputs = l.out_h * l.out_w * l.out_c;
     l.inputs = h*w*c;
     l.size = size;
-    l.stride = stride;
+    l.stride = stride_x;
+    l.stride_x = stride_x;
+    l.stride_y = stride_y;
     int output_size = l.out_h * l.out_w * l.out_c * batch;
     l.indexes = (int*)calloc(output_size, sizeof(int));
     l.output = (float*)calloc(output_size, sizeof(float));
@@ -87,7 +89,11 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s
 #endif // GPU
     l.bflops = (l.size*l.size*l.c * l.out_h*l.out_w) / 1000000000.;
-    fprintf(stderr, "max %d x %d/%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+    if(stride_x == stride_y)
+        fprintf(stderr, "max %d x %d/%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+    else
+        fprintf(stderr, "max %d x %d/%2dx%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, stride_y, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+
     return l;
 }
@@ -97,8 +103,8 @@ void resize_maxpool_layer(maxpool_layer *l, int w, int h)
     l->w = w;
     l->inputs = h*w*l->c;
-    l->out_w = (w + l->pad - l->size) / l->stride + 1;
-    l->out_h = (h + l->pad - l->size) / l->stride + 1;
+    l->out_w = (w + l->pad - l->size) / l->stride_x + 1;
+    l->out_h = (h + l->pad - l->size) / l->stride_y + 1;
     l->outputs = l->out_w * l->out_h * l->out_c;
     int output_size = l->outputs * l->batch;
@@ -151,7 +157,7 @@ void forward_maxpool_layer(const maxpool_layer l, network_state state)
     }
-    if (!state.train) {
+    if (!state.train && l.stride_x == l.stride_y) {
         forward_maxpool_layer_avx(state.input, l.output, l.indexes, l.size, l.w, l.h, l.out_w, l.out_h, l.c, l.pad, l.stride, l.batch);
         return;
     }
@@ -173,8 +179,8 @@
     int max_i = -1;
     for(n = 0; n < l.size; ++n){
         for(m = 0; m < l.size; ++m){
-            int cur_h = h_offset + i*l.stride + n;
-            int cur_w = w_offset + j*l.stride + m;
+            int cur_h = h_offset + i*l.stride_y + n;
+            int cur_w = w_offset + j*l.stride_x + m;
             int index = cur_w + l.w*(cur_h + l.h*(k + b*l.c));
             int valid = (cur_h >= 0 && cur_h < l.h && cur_w >= 0 && cur_w < l.w);
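For a quick sanity check of the new output-shape arithmetic (a standalone illustration using the parser's default `padding = size - 1`; the numbers are chosen only for this example):

```
#include <stdio.h>

int main(void)
{
    /* Mirror of the output-size formulas in make_maxpool_layer() above. */
    int w = 416, h = 416, size = 2;
    int padding = size - 1;                           /* parser default = 1 */
    int stride_x = 2, stride_y = 4;
    int out_w = (w + padding - size) / stride_x + 1;  /* 415/2 + 1 = 208 */
    int out_h = (h + padding - size) / stride_y + 1;  /* 415/4 + 1 = 104 */
    printf("out_w=%d out_h=%d\n", out_w, out_h);      /* out_w=208 out_h=104 */
    return 0;
}
```

So an asymmetric `stride_x=2 stride_y=4` pool halves the width but quarters the height, which is exactly the configuration the commit subject describes.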
diff --git a/src/maxpool_layer.h b/src/maxpool_layer.h
index 0c1f6148946..4994d45700d 100644
--- a/src/maxpool_layer.h
+++ b/src/maxpool_layer.h
@@ -12,7 +12,7 @@ typedef layer maxpool_layer;
 extern "C" {
 #endif
 image get_maxpool_image(maxpool_layer l);
-maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride, int padding, int maxpool_depth, int out_channels);
+maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels);
 void resize_maxpool_layer(maxpool_layer *l, int w, int h);
 void forward_maxpool_layer(const maxpool_layer l, network_state state);
 void backward_maxpool_layer(const maxpool_layer l, network_state state);
diff --git a/src/maxpool_layer_kernels.cu b/src/maxpool_layer_kernels.cu
index 82d631b358a..8e8511003e5 100644
--- 
a/src/maxpool_layer_kernels.cu +++ b/src/maxpool_layer_kernels.cu @@ -49,10 +49,10 @@ __global__ void backward_maxpool_depth_layer_kernel(int n, int w, int h, int c, } -__global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride, int size, int pad, float *input, float *output, int *indexes) +__global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *input, float *output, int *indexes) { - int h = (in_h + pad - size) / stride + 1; - int w = (in_w + pad - size) / stride + 1; + int h = (in_h + pad - size) / stride_y + 1; + int w = (in_w + pad - size) / stride_x + 1; int c = in_c; int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; @@ -75,8 +75,8 @@ __global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c int l, m; for(l = 0; l < size; ++l){ for(m = 0; m < size; ++m){ - int cur_h = h_offset + i*stride + l; - int cur_w = w_offset + j*stride + m; + int cur_h = h_offset + i*stride_y + l; + int cur_w = w_offset + j*stride_x + m; int index = cur_w + in_w*(cur_h + in_h*(k + b*in_c)); int valid = (cur_h >= 0 && cur_h < in_h && cur_w >= 0 && cur_w < in_w); @@ -89,12 +89,13 @@ __global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c indexes[out_index] = max_i; } -__global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride, int size, int pad, float *delta, float *prev_delta, int *indexes) +__global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *delta, float *prev_delta, int *indexes) { - int h = (in_h + pad - size) / stride + 1; - int w = (in_w + pad - size) / stride + 1; + int h = (in_h + pad - size) / stride_y + 1; + int w = (in_w + pad - size) / stride_x + 1; int c = in_c; - int area = (size-1)/stride; + int area_x = (size - 1) / stride_x; + int area_y = (size - 1) / stride_y; int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if(id >= n) return; @@ -113,10 +114,10 @@ __global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_ float d = 0; int l, m; - for(l = -area; l < area+1; ++l){ - for(m = -area; m < area+1; ++m){ - int out_w = (j-w_offset)/stride + m; - int out_h = (i-h_offset)/stride + l; + for(l = -area_y; l < area_y+1; ++l){ + for(m = -area_x; m < area_x+1; ++m){ + int out_w = (j-w_offset)/stride_x + m; + int out_h = (i-h_offset)/stride_y + l; int out_index = out_w + w*(out_h + h*(k + c*b)); int valid = (out_w >= 0 && out_w < w && out_h >= 0 && out_h < h); @@ -172,7 +173,7 @@ extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state sta size_t n = h*w*c*layer.batch; - forward_maxpool_layer_kernel<<>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu); + forward_maxpool_layer_kernel<<>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu); CHECK_CUDA(cudaPeekAtLastError()); } @@ -192,6 +193,6 @@ extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network_state st size_t n = layer.h*layer.w*layer.c*layer.batch; - backward_maxpool_layer_kernel<<>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, layer.delta_gpu, state.delta, layer.indexes_gpu); + backward_maxpool_layer_kernel<<>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, 
layer.delta_gpu, state.delta, layer.indexes_gpu); CHECK_CUDA(cudaPeekAtLastError()); } diff --git a/src/parser.c b/src/parser.c index 09e79d2df75..ac8f9613fae 100644 --- a/src/parser.c +++ b/src/parser.c @@ -535,6 +535,8 @@ layer parse_reorg_old(list *options, size_params params) maxpool_layer parse_maxpool(list *options, size_params params) { int stride = option_find_int(options, "stride",1); + int stride_x = option_find_int_quiet(options, "stride_x", stride); + int stride_y = option_find_int_quiet(options, "stride_y", stride); int size = option_find_int(options, "size",stride); int padding = option_find_int_quiet(options, "padding", size-1); int maxpool_depth = option_find_int_quiet(options, "maxpool_depth", 0); @@ -547,7 +549,7 @@ maxpool_layer parse_maxpool(list *options, size_params params) batch=params.batch; if(!(h && w && c)) error("Layer before maxpool layer must output image."); - maxpool_layer layer = make_maxpool_layer(batch, h, w, c, size, stride, padding, maxpool_depth, out_channels); + maxpool_layer layer = make_maxpool_layer(batch, h, w, c, size, stride_x, stride_y, padding, maxpool_depth, out_channels); return layer; } @@ -1332,12 +1334,12 @@ void load_convolutional_weights(layer l, FILE *fp) //return; } int num = l.nweights; - fread(l.biases, sizeof(float), l.n, fp); + if (fread(l.biases, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! \n"); //fread(l.weights, sizeof(float), num, fp); // as in connected layer if (l.batch_normalize && (!l.dontloadscales)){ - fread(l.scales, sizeof(float), l.n, fp); - fread(l.rolling_mean, sizeof(float), l.n, fp); - fread(l.rolling_variance, sizeof(float), l.n, fp); + if(fread(l.scales, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! \n"); + if(fread(l.rolling_mean, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! \n"); + if(fread(l.rolling_variance, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! \n"); if(0){ int i; for(i = 0; i < l.n; ++i){ @@ -1354,7 +1356,7 @@ void load_convolutional_weights(layer l, FILE *fp) fill_cpu(l.n, 0, l.rolling_variance, 1); } } - fread(l.weights, sizeof(float), num, fp); + if(fread(l.weights, sizeof(float), num, fp) < num) printf("\n Warning: Unexpected end of wights-file! \n"); //if(l.adam){ // fread(l.m, sizeof(float), num, fp); // fread(l.v, sizeof(float), num, fp); From 4acf924aaf7e61b07b0d8387347674d782e32cce Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Thu, 29 Aug 2019 18:30:33 +0300 Subject: [PATCH 06/86] minor fix --- src/parser.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/parser.c b/src/parser.c index ac8f9613fae..3883d041d4b 100644 --- a/src/parser.c +++ b/src/parser.c @@ -1334,12 +1334,12 @@ void load_convolutional_weights(layer l, FILE *fp) //return; } int num = l.nweights; - if (fread(l.biases, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! \n"); + if (fread(l.biases, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! l.biase - l.index = %d \n", l.index); //fread(l.weights, sizeof(float), num, fp); // as in connected layer if (l.batch_normalize && (!l.dontloadscales)){ - if(fread(l.scales, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! \n"); - if(fread(l.rolling_mean, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! 
\n"); - if(fread(l.rolling_variance, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! \n"); + if(fread(l.scales, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! l.scales - l.index = %d \n", l.index); + if(fread(l.rolling_mean, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! l.rolling_mean - l.index = %d \n", l.index); + if(fread(l.rolling_variance, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! l.rolling_variance - l.index = %d \n", l.index); if(0){ int i; for(i = 0; i < l.n; ++i){ @@ -1356,7 +1356,7 @@ void load_convolutional_weights(layer l, FILE *fp) fill_cpu(l.n, 0, l.rolling_variance, 1); } } - if(fread(l.weights, sizeof(float), num, fp) < num) printf("\n Warning: Unexpected end of wights-file! \n"); + if(fread(l.weights, sizeof(float), num, fp) < num) printf("\n Warning: Unexpected end of wights-file! l.weights - l.index = %d \n", l.index); //if(l.adam){ // fread(l.m, sizeof(float), num, fp); // fread(l.v, sizeof(float), num, fp); From 58906ef812323444181386c7dd4e6dda8a377a48 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Thu, 29 Aug 2019 23:34:04 +0300 Subject: [PATCH 07/86] minor fix: Unexpected end of wights-file! --- src/parser.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/src/parser.c b/src/parser.c index 3883d041d4b..6cc4790bf32 100644 --- a/src/parser.c +++ b/src/parser.c @@ -1327,6 +1327,12 @@ void load_convolutional_weights_binary(layer l, FILE *fp) #endif } +void check_read_size(size_t read_bytes, int required_bytes) +{ + if (read_bytes > 0 && read_bytes < required_bytes) return 0; + return 1; +} + void load_convolutional_weights(layer l, FILE *fp) { if(l.binary){ @@ -1334,12 +1340,17 @@ void load_convolutional_weights(layer l, FILE *fp) //return; } int num = l.nweights; - if (fread(l.biases, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! l.biase - l.index = %d \n", l.index); + int read_bytes; + read_bytes = fread(l.biases, sizeof(float), l.n, fp); + if (read_bytes > 0 && read_bytes < l.n) printf("\n Warning: Unexpected end of wights-file! l.biases - l.index = %d \n", l.index); //fread(l.weights, sizeof(float), num, fp); // as in connected layer if (l.batch_normalize && (!l.dontloadscales)){ - if(fread(l.scales, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! l.scales - l.index = %d \n", l.index); - if(fread(l.rolling_mean, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! l.rolling_mean - l.index = %d \n", l.index); - if(fread(l.rolling_variance, sizeof(float), l.n, fp) < l.n) printf("\n Warning: Unexpected end of wights-file! l.rolling_variance - l.index = %d \n", l.index); + read_bytes = fread(l.scales, sizeof(float), l.n, fp); + if (read_bytes > 0 && read_bytes < l.n) printf("\n Warning: Unexpected end of wights-file! l.scales - l.index = %d \n", l.index); + read_bytes = fread(l.rolling_mean, sizeof(float), l.n, fp); + if (read_bytes > 0 && read_bytes < l.n) printf("\n Warning: Unexpected end of wights-file! l.rolling_mean - l.index = %d \n", l.index); + read_bytes = fread(l.rolling_variance, sizeof(float), l.n, fp); + if (read_bytes > 0 && read_bytes < l.n) printf("\n Warning: Unexpected end of wights-file! 
l.rolling_variance - l.index = %d \n", l.index);
         if(0){
             int i;
             for(i = 0; i < l.n; ++i){
@@ -1356,7 +1367,8 @@ void load_convolutional_weights(layer l, FILE *fp)
             fill_cpu(l.n, 0, l.rolling_variance, 1);
         }
     }
-    if(fread(l.weights, sizeof(float), num, fp) < num) printf("\n Warning: Unexpected end of wights-file! l.weights - l.index = %d \n", l.index);
+    read_bytes = fread(l.weights, sizeof(float), num, fp);
+    if (read_bytes > 0 && read_bytes < num) printf("\n Warning: Unexpected end of wights-file! l.weights - l.index = %d \n", l.index);
     //if(l.adam){
     //    fread(l.m, sizeof(float), num, fp);
     //    fread(l.v, sizeof(float), num, fp);
@@ -1473,8 +1485,9 @@ void load_weights_upto(network *net, char *filename, int cutoff)
         }
 #endif
         }
+        if (feof(fp)) break;
     }
-    fprintf(stderr, "Done!\n");
+    fprintf(stderr, "Done! Loaded %d layers from weights-file \n", i);
     fclose(fp);
 }

From 660a9b225cb30ad06397b5c39e9612d9eff59ea9 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Fri, 30 Aug 2019 00:29:26 +0300
Subject: [PATCH 08/86] compile fix

---
 src/parser.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/parser.c b/src/parser.c
index 6cc4790bf32..48bd42bd3ce 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -1327,12 +1327,6 @@ void load_convolutional_weights_binary(layer l, FILE *fp)
 #endif
 }
-void check_read_size(size_t read_bytes, int required_bytes)
-{
-    if (read_bytes > 0 && read_bytes < required_bytes) return 0;
-    return 1;
-}
-
 void load_convolutional_weights(layer l, FILE *fp)
 {
     if(l.binary){

From 2a382e5a4bb55a6cca636f3d7025be2cfbdfc68a Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Fri, 30 Aug 2019 20:28:56 +0300
Subject: [PATCH 09/86] Fix training TridentNet

---
 src/layer.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/layer.c b/src/layer.c
index c0b48afc2b8..68d1b35b68a 100644
--- a/src/layer.c
+++ b/src/layer.c
@@ -12,6 +12,7 @@ void free_sublayer(layer *l)
 void free_layer(layer l)
 {
+    if (l.share_layer != NULL) return; // don't free shared layers
     if (l.type == CONV_LSTM) {
         if (l.peephole) {
             free_sublayer(l.vf);

From 102ab710a9ee13c8a160923301d15ae8fcff6188 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Sun, 1 Sep 2019 14:11:17 +0300
Subject: [PATCH 10/86] Fixed fuse_conv_batchnorm() for TridentNet

---
 src/convolutional_layer.c | 1 +
 src/network.c | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index 6ff5b8b3d4f..93ac79a0711 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -550,6 +550,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w,
     l.bflops = (2.0 * l.nweights * l.out_h*l.out_w) / 1000000000.;
     if (l.xnor && l.use_bin_output) fprintf(stderr, "convXB");
     else if (l.xnor) fprintf(stderr, "convX ");
+    else if(l.share_layer) fprintf(stderr, "convS ");
     else fprintf(stderr, "conv ");
     if(groups > 1) fprintf(stderr, "%5d/%4d ", n, groups);
diff --git a/src/network.c b/src/network.c
index 6e64a8ceb56..9bdab8adce1 100644
--- a/src/network.c
+++ b/src/network.c
@@ -1033,6 +1033,10 @@ void fuse_conv_batchnorm(network net)
         if (l->type == CONVOLUTIONAL) {
             //printf(" Merges Convolutional-%d and batch_norm \n", j);
+            if (l->share_layer != NULL) {
+                l->batch_normalize = 0;
+            }
+
             if (l->batch_normalize) {
                 int f;
                 for (f = 0; f < l->n; ++f)
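For context on what `fuse_conv_batchnorm()` folds, and hence why a shared layer must not be folded: at inference, batch norm after a convolution can be absorbed into the weights and bias with the standard identity `k = scale / sqrt(variance + eps)`, `w' = w * k`, `b' = beta - mean * k` (in darknet's layout the BN shift `beta` is stored in `l.biases`). A minimal per-filter sketch of that identity; this is an illustration, not darknet's actual implementation in `src/network.c`:

```
#include <math.h>

/* Fold one filter's batch-norm statistics into its conv weights and bias.
 * nweights = weights per filter; eps is the small BN stabilizer constant. */
static void fold_bn_one_filter(float *w, int nweights, float *bias,
                               float scale, float mean, float variance,
                               float eps)
{
    float k = scale / sqrtf(variance + eps);
    int i;
    for (i = 0; i < nweights; ++i) w[i] *= k;  /* scale the filter weights */
    *bias = *bias - mean * k;                  /* shift absorbed into the bias */
}
```

Setting `l->batch_normalize = 0` when `l->share_layer != NULL` keeps the fold from being applied to the same shared weight buffer more than once.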
src/conv_lstm_layer.c | 22 +++++++++++-----------
 src/convolutional_kernels.cu | 8 ++++----
 src/convolutional_layer.c | 33 +++++++++++++++++++--------------
 src/convolutional_layer.h | 2 +-
 src/crnn_layer.c | 6 +++---
 src/parser.c | 9 ++++++---
 6 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/src/conv_lstm_layer.c b/src/conv_lstm_layer.c
index 6cbaf1c3911..5da2bab39e0 100644
--- a/src/conv_lstm_layer.c
+++ b/src/conv_lstm_layer.c
@@ -66,44 +66,44 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i
     // U
     l.uf = (layer*)calloc(1, sizeof(layer));
-    *(l.uf) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL);
+    *(l.uf) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL);
     l.uf->batch = batch;
     if (l.workspace_size < l.uf->workspace_size) l.workspace_size = l.uf->workspace_size;

     l.ui = (layer*)calloc(1, sizeof(layer));
-    *(l.ui) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL);
+    *(l.ui) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL);
     l.ui->batch = batch;
     if (l.workspace_size < l.ui->workspace_size) l.workspace_size = l.ui->workspace_size;

     l.ug = (layer*)calloc(1, sizeof(layer));
-    *(l.ug) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL);
+    *(l.ug) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL);
     l.ug->batch = batch;
     if (l.workspace_size < l.ug->workspace_size) l.workspace_size = l.ug->workspace_size;

     l.uo = (layer*)calloc(1, sizeof(layer));
-    *(l.uo) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL);
+    *(l.uo) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL);
     l.uo->batch = batch;
     if (l.workspace_size < l.uo->workspace_size) l.workspace_size = l.uo->workspace_size;

     // W
     l.wf = (layer*)calloc(1, sizeof(layer));
-    *(l.wf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL);
+    *(l.wf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL);
     l.wf->batch = batch;
     if (l.workspace_size < l.wf->workspace_size) l.workspace_size = l.wf->workspace_size;

     l.wi = (layer*)calloc(1, sizeof(layer));
-    *(l.wi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL);
+    *(l.wi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL);
     l.wi->batch = batch;
     if (l.workspace_size < l.wi->workspace_size) l.workspace_size = l.wi->workspace_size;

     l.wg = (layer*)calloc(1,
sizeof(layer)); - *(l.wg) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.wg) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); l.wg->batch = batch; if (l.workspace_size < l.wg->workspace_size) l.workspace_size = l.wg->workspace_size; l.wo = (layer*)calloc(1, sizeof(layer)); - *(l.wo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.wo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); l.wo->batch = batch; if (l.workspace_size < l.wo->workspace_size) l.workspace_size = l.wo->workspace_size; @@ -111,21 +111,21 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i // V l.vf = (layer*)calloc(1, sizeof(layer)); if (l.peephole) { - *(l.vf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.vf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); l.vf->batch = batch; if (l.workspace_size < l.vf->workspace_size) l.workspace_size = l.vf->workspace_size; } l.vi = (layer*)calloc(1, sizeof(layer)); if (l.peephole) { - *(l.vi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.vi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); l.vi->batch = batch; if (l.workspace_size < l.vi->workspace_size) l.workspace_size = l.vi->workspace_size; } l.vo = (layer*)calloc(1, sizeof(layer)); if (l.peephole) { - *(l.vo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.vo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); l.vo->batch = batch; if (l.workspace_size < l.vo->workspace_size) l.workspace_size = l.vo->workspace_size; } diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index e404ecabd85..07a0a0d7121 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -177,7 +177,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) fast_binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu); } - if (l.align_bit_weights_gpu && !state.train && l.c >= 32) + if (l.align_bit_weights_gpu && !state.train && l.c >= 32 && l.stride_x == l.stride_y) { //return; //cudaError_t status = cudaSuccess; @@ -574,7 +574,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) l.h, l.w, // input size (h, w) l.size, l.size, // kernel size (h, w) l.pad, l.pad, // padding (h, w) - l.stride, l.stride, // stride 
(h, w) + l.stride_y, l.stride_x, // stride (h, w) l.dilation, l.dilation, // dilation (h, w) state.workspace); // output @@ -819,7 +819,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state l.h, l.w, // input size (h, w) l.size, l.size, // kernel size (h, w) l.pad, l.pad, // padding (h, w) - l.stride, l.stride, // stride (h, w) + l.stride_y, l.stride_x, // stride (h, w) l.dilation, l.dilation, // dilation (h, w) state.workspace); // output //gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n); @@ -844,7 +844,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state l.h, l.w, // input size (h, w) l.size, l.size, // kernel size (h, w) l.pad, l.pad, // padding size (h, w) - l.stride, l.stride, // stride size (h, w) + l.stride_y, l.stride_x, // stride size (h, w) l.dilation, l.dilation, // dilation size (h, w) delta); // output (delta) diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 93ac79a0711..207e3f27cf6 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -76,12 +76,12 @@ void binarize_input(float *input, int n, int size, float *binary) int convolutional_out_height(convolutional_layer l) { - return (l.h + 2*l.pad - l.size) / l.stride + 1; + return (l.h + 2*l.pad - l.size) / l.stride_y + 1; } int convolutional_out_width(convolutional_layer l) { - return (l.w + 2*l.pad - l.size) / l.stride + 1; + return (l.w + 2*l.pad - l.size) / l.stride_x + 1; } image get_convolutional_image(convolutional_layer l) @@ -276,9 +276,9 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) //printf("\n l->dilation = %d, l->pad = %d, l->size = %d \n", l->dilation, l->pad, l->size); #if(CUDNN_MAJOR >= 6) - CHECK_CUDNN(cudnnSetConvolution2dDescriptor(l->convDesc, l->pad * l->dilation, l->pad* l->dilation, l->stride, l->stride, l->dilation, l->dilation, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT)); // cudnn >= 6.0 + CHECK_CUDNN(cudnnSetConvolution2dDescriptor(l->convDesc, l->pad * l->dilation, l->pad* l->dilation, l->stride_y, l->stride_x, l->dilation, l->dilation, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT)); // cudnn >= 6.0 #else - CHECK_CUDNN(cudnnSetConvolution2dDescriptor(l->convDesc, l->pad * l->dilation, l->pad * l->dilation, l->stride, l->stride, l->dilation, l->dilation, CUDNN_CROSS_CORRELATION)); // cudnn 5.1 + CHECK_CUDNN(cudnnSetConvolution2dDescriptor(l->convDesc, l->pad * l->dilation, l->pad * l->dilation, l->stride_y, l->stride_x, l->dilation, l->dilation, CUDNN_CROSS_CORRELATION)); // cudnn 5.1 #endif int forward_algo = CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; int backward_algo = CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST; @@ -332,7 +332,7 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) #endif #endif -convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, convolutional_layer *share_layer) +convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, convolutional_layer *share_layer) { int total_batch = batch*steps; int i; @@ -354,7 +354,9 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.use_bin_output = 
use_bin_output; l.batch = batch; l.steps = steps; - l.stride = stride; + l.stride = stride_x; + l.stride_y = stride_x; + l.stride_x = stride_y; l.dilation = dilation; l.size = size; l.pad = padding; @@ -553,11 +555,14 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, else if(l.share_layer) fprintf(stderr, "convS "); else fprintf(stderr, "conv "); - if(groups > 1) fprintf(stderr, "%5d/%4d ", n, groups); + if (groups > 1) fprintf(stderr, "%5d/%4d ", n, groups); else fprintf(stderr, "%5d ", n); - if(dilation > 1) fprintf(stderr, "%2d x%2d/%2d(%1d)", size, size, stride, dilation); - else fprintf(stderr, "%2d x%2d/%2d ", size, size, stride); + if (stride_x != stride_y) fprintf(stderr, "%2d x%2d/%2dx%2d ", size, size, stride_x, stride_y); + else { + if (dilation > 1) fprintf(stderr, "%2d x%2d/%2d(%1d)", size, size, stride_x, dilation); + else fprintf(stderr, "%2d x%2d/%2d ", size, size, stride_x); + } fprintf(stderr, "%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); @@ -583,7 +588,7 @@ void denormalize_convolutional_layer(convolutional_layer l) void test_convolutional_layer() { - convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 1, 5, 2, 1, 1, LEAKY, 1, 0, 0, 0, 0, 0, NULL); + convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 1, 5, 2, 2, 1, 1, LEAKY, 1, 0, 0, 0, 0, 0, NULL); l.batch_normalize = 1; float data[] = {1,1,1,1,1, 1,1,1,1,1, @@ -921,7 +926,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) //gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); //gemm_nn_custom(m, n, k, 1, a, k, b, n, c, n); - if (l.xnor && l.align_bit_weights && !state.train) + if (l.xnor && l.align_bit_weights && !state.train && l.stride_x == l.stride_y) { memset(b, 0, l.bit_align*l.size*l.size*l.c * sizeof(float)); @@ -1053,7 +1058,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) l.h, l.w, // input size (h, w) l.size, l.size, // kernel size (h, w) l.pad, l.pad, // padding (h, w) - l.stride, l.stride, // stride (h, w) + l.stride_y, l.stride_x, // stride (h, w) l.dilation, l.dilation, // dilation (h, w) b); // output @@ -1229,7 +1234,7 @@ void backward_convolutional_layer(convolutional_layer l, network_state state) l.h, l.w, // input size (h, w) l.size, l.size, // kernel size (h, w) l.pad, l.pad, // padding (h, w) - l.stride, l.stride, // stride (h, w) + l.stride_y, l.stride_x, // stride (h, w) l.dilation, l.dilation, // dilation (h, w) b); // output @@ -1251,7 +1256,7 @@ void backward_convolutional_layer(convolutional_layer l, network_state state) l.h, l.w, // input size (h, w) l.size, l.size, // kernel size (h, w) l.pad, l.pad, // padding (h, w) - l.stride, l.stride, // stride (h, w) + l.stride_y, l.stride_x, // stride (h, w) l.dilation, l.dilation, // dilation (h, w) state.delta + (i*l.groups + j)* (l.c / l.groups)*l.h*l.w); // output (delta) } diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index e62b155c45f..1167175ccf4 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -30,7 +30,7 @@ void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16); #endif size_t get_convolutional_workspace_size(layer l); -convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, convolutional_layer *share_layer); 
+convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, convolutional_layer *share_layer); void denormalize_convolutional_layer(convolutional_layer l); void resize_convolutional_layer(convolutional_layer *layer, int w, int h); void forward_convolutional_layer(const convolutional_layer layer, network_state state); diff --git a/src/crnn_layer.c b/src/crnn_layer.c index 7609003b4f2..eaded279099 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -50,17 +50,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou l.state = (float*)calloc(l.hidden * l.batch * (l.steps + 1), sizeof(float)); l.input_layer = (layer*)calloc(1, sizeof(layer)); - *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); l.input_layer->batch = batch; if (l.workspace_size < l.input_layer->workspace_size) l.workspace_size = l.input_layer->workspace_size; l.self_layer = (layer*)calloc(1, sizeof(layer)); - *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); l.self_layer->batch = batch; if (l.workspace_size < l.self_layer->workspace_size) l.workspace_size = l.self_layer->workspace_size; l.output_layer = (layer*)calloc(1, sizeof(layer)); - *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, groups, size, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); l.output_layer->batch = batch; if (l.workspace_size < l.output_layer->workspace_size) l.workspace_size = l.output_layer->workspace_size; diff --git a/src/parser.c b/src/parser.c index 48bd42bd3ce..8283f7ed0f0 100644 --- a/src/parser.c +++ b/src/parser.c @@ -158,6 +158,8 @@ convolutional_layer parse_convolutional(list *options, size_params params, netwo int groups = option_find_int_quiet(options, "groups", 1); int size = option_find_int(options, "size",1); int stride = option_find_int(options, "stride",1); + int stride_x = option_find_int_quiet(options, "stride_x", stride); + int stride_y = option_find_int_quiet(options, "stride_y", stride); int dilation = option_find_int_quiet(options, "dilation", 1); if (size == 1) dilation = 1; int pad = option_find_int_quiet(options, "pad",0); @@ -167,9 +169,10 @@ convolutional_layer parse_convolutional(list *options, size_params params, netwo char *activation_s = option_find_str(options, "activation", "logistic"); ACTIVATION activation = get_activation(activation_s); - int share_index = option_find_int_quiet(options, "share_index", -1); + int share_index = option_find_int_quiet(options, "share_index", 
-1000000000); convolutional_layer *share_layer = NULL; - if(share_index > -1) share_layer = &net.layers[share_index]; + if(share_index >= 0) share_layer = &net.layers[share_index]; + else if(share_index != -1000000000) share_layer = &net.layers[params.index + share_index]; int batch,h,w,c; h = params.h; @@ -182,7 +185,7 @@ convolutional_layer parse_convolutional(list *options, size_params params, netwo int xnor = option_find_int_quiet(options, "xnor", 0); int use_bin_output = option_find_int_quiet(options, "bin_output", 0); - convolutional_layer layer = make_convolutional_layer(batch,1,h,w,c,n,groups,size,stride,dilation,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index, share_layer); + convolutional_layer layer = make_convolutional_layer(batch,1,h,w,c,n,groups,size,stride_x,stride_y,dilation,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index, share_layer); layer.flipped = option_find_int_quiet(options, "flipped", 0); layer.dot = option_find_float_quiet(options, "dot", 0); layer.assisted_excitation = option_find_float_quiet(options, "assisted_excitation", 0); From 11b8b2bddf32190a83dbf7868881520b1e564377 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Mon, 2 Sep 2019 02:13:11 +0300 Subject: [PATCH 12/86] Added antialiasing=1 param for [convolutional]-layer on GPU --- include/darknet.h | 2 ++ src/conv_lstm_layer.c | 22 +++++++-------- src/convolutional_kernels.cu | 24 +++++++++++++++++ src/convolutional_layer.c | 52 ++++++++++++++++++++++++++++++++++-- src/convolutional_layer.h | 2 +- src/crnn_layer.c | 6 ++--- src/layer.c | 4 +++ src/parser.c | 19 +++++++++---- 8 files changed, 109 insertions(+), 22 deletions(-) diff --git a/include/darknet.h b/include/darknet.h index 5cfd274db52..a7a62b47bd7 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -211,6 +211,7 @@ struct layer { int stride_x; int stride_y; int dilation; + int antialiasing; int maxpool_depth; int out_channels; int reverse; @@ -528,6 +529,7 @@ struct layer { float * scale_updates_gpu; float * scale_change_gpu; + float * input_antialiasing_gpu; float * output_gpu; float * output_sigmoid_gpu; float * loss_gpu; diff --git a/src/conv_lstm_layer.c b/src/conv_lstm_layer.c index 5da2bab39e0..a6da3bf0c2c 100644 --- a/src/conv_lstm_layer.c +++ b/src/conv_lstm_layer.c @@ -66,44 +66,44 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i // U l.uf = (layer*)calloc(1, sizeof(layer)); - *(l.uf) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.uf) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); l.uf->batch = batch; if (l.workspace_size < l.uf->workspace_size) l.workspace_size = l.uf->workspace_size; l.ui = (layer*)calloc(1, sizeof(layer)); - *(l.ui) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.ui) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); l.ui->batch = batch; if (l.workspace_size < l.ui->workspace_size) l.workspace_size = l.ui->workspace_size; l.ug = (layer*)calloc(1, sizeof(layer)); - *(l.ug) = make_convolutional_layer(batch, 
steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.ug) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); l.ug->batch = batch; if (l.workspace_size < l.ug->workspace_size) l.workspace_size = l.ug->workspace_size; l.uo = (layer*)calloc(1, sizeof(layer)); - *(l.uo) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.uo) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); l.uo->batch = batch; if (l.workspace_size < l.uo->workspace_size) l.workspace_size = l.uo->workspace_size; // W l.wf = (layer*)calloc(1, sizeof(layer)); - *(l.wf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.wf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); l.wf->batch = batch; if (l.workspace_size < l.wf->workspace_size) l.workspace_size = l.wf->workspace_size; l.wi = (layer*)calloc(1, sizeof(layer)); - *(l.wi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.wi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); l.wi->batch = batch; if (l.workspace_size < l.wi->workspace_size) l.workspace_size = l.wi->workspace_size; l.wg = (layer*)calloc(1, sizeof(layer)); - *(l.wg) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.wg) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); l.wg->batch = batch; if (l.workspace_size < l.wg->workspace_size) l.workspace_size = l.wg->workspace_size; l.wo = (layer*)calloc(1, sizeof(layer)); - *(l.wo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.wo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); l.wo->batch = batch; if (l.workspace_size < l.wo->workspace_size) l.workspace_size = l.wo->workspace_size; @@ -111,21 +111,21 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i // V l.vf = (layer*)calloc(1, sizeof(layer)); if (l.peephole) { - *(l.vf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.vf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, 
xnor, 0, 0, 0, 0, NULL); l.vf->batch = batch; if (l.workspace_size < l.vf->workspace_size) l.workspace_size = l.vf->workspace_size; } l.vi = (layer*)calloc(1, sizeof(layer)); if (l.peephole) { - *(l.vi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.vi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); l.vi->batch = batch; if (l.workspace_size < l.vi->workspace_size) l.workspace_size = l.vi->workspace_size; } l.vo = (layer*)calloc(1, sizeof(layer)); if (l.peephole) { - *(l.vo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.vo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); l.vo->batch = batch; if (l.workspace_size < l.vo->workspace_size) l.workspace_size = l.vo->workspace_size; } diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 07a0a0d7121..b476ac76e3d 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -604,10 +604,34 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) if (state.net.try_fix_nan) { fix_nan_and_inf(l.output_gpu, l.outputs*l.batch); } + + if (l.antialiasing) { + network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + s.net = state.net; + if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) + s.input = l.output_gpu; + forward_convolutional_layer_gpu(*(l.input_layer), s); + simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.input_antialiasing_gpu); + simple_copy_ongpu(l.input_layer->outputs*l.input_layer->batch, l.input_layer->output_gpu, l.output_gpu); + } } void backward_convolutional_layer_gpu(convolutional_layer l, network_state state) { + if (l.antialiasing) { + network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + s.net = state.net; + s.delta = l.delta_gpu; + s.input = l.input_antialiasing_gpu; + //if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) + simple_copy_ongpu(l.input_layer->outputs*l.input_layer->batch, l.delta_gpu, l.input_layer->delta_gpu); + backward_convolutional_layer_gpu(*(l.input_layer), s); + } + if(state.net.try_fix_nan) constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); if (l.activation == SWISH) gradient_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.output_sigmoid_gpu, l.delta_gpu); diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 207e3f27cf6..11402721545 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -332,7 +332,7 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) #endif #endif -convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, convolutional_layer *share_layer) +convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, 
int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer) { int total_batch = batch*steps; int i; @@ -342,6 +342,13 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, if (xnor) groups = 1; // disable groups for XNOR-net if (groups < 1) groups = 1; + const int blur_stride_x = stride_x; + const int blur_stride_y = stride_y; + l.antialiasing = antialiasing; + if (antialiasing) { + stride_x = stride_y = l.stride = l.stride_x = l.stride_y = 1; // use stride=1 in host-layer + } + l.share_layer = share_layer; l.index = index; l.h = h; @@ -568,6 +575,47 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, //fprintf(stderr, "%5d/%2d %2d x%2d /%2d(%d)%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", n, groups, size, size, stride, dilation, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); + if (l.antialiasing) { + printf("AA: "); + l.input_layer = (layer*)calloc(1, sizeof(layer)); + const int blur_size = 3; + *(l.input_layer) = make_convolutional_layer(batch, steps, out_h, out_w, n, n, n, blur_size, blur_stride_x, blur_stride_y, 1, blur_size / 2, LINEAR, 0, 0, 0, 0, 0, index, 0, NULL); + const int blur_nweights = n * blur_size * blur_size; // (n / n) * n * blur_size * blur_size; + int i; + for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) { + /* + l.input_layer->weights[i + 0] = 0; + l.input_layer->weights[i + 1] = 0; + l.input_layer->weights[i + 2] = 0; + + l.input_layer->weights[i + 3] = 0; + l.input_layer->weights[i + 4] = 1; + l.input_layer->weights[i + 5] = 0; + + l.input_layer->weights[i + 6] = 0; + l.input_layer->weights[i + 7] = 0; + l.input_layer->weights[i + 8] = 0; + */ + l.input_layer->weights[i + 0] = 1 / 16.f; + l.input_layer->weights[i + 1] = 2 / 16.f; + l.input_layer->weights[i + 2] = 1 / 16.f; + + l.input_layer->weights[i + 3] = 2 / 16.f; + l.input_layer->weights[i + 4] = 4 / 16.f; + l.input_layer->weights[i + 5] = 2 / 16.f; + + l.input_layer->weights[i + 6] = 1 / 16.f; + l.input_layer->weights[i + 7] = 2 / 16.f; + l.input_layer->weights[i + 8] = 1 / 16.f; + + } + for (i = 0; i < n; ++i) l.input_layer->biases[i] = 0; +#ifdef GPU + l.input_antialiasing_gpu = cuda_make_array(NULL, l.batch*l.outputs); + push_convolutional_layer(*(l.input_layer)); +#endif // GPU + } + return l; } @@ -588,7 +636,7 @@ void denormalize_convolutional_layer(convolutional_layer l) void test_convolutional_layer() { - convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 1, 5, 2, 2, 1, 1, LEAKY, 1, 0, 0, 0, 0, 0, NULL); + convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 1, 5, 2, 2, 1, 1, LEAKY, 1, 0, 0, 0, 0, 0, 0, NULL); l.batch_normalize = 1; float data[] = {1,1,1,1,1, 1,1,1,1,1, diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index 1167175ccf4..1012663a5b3 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -30,7 +30,7 @@ void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16); #endif size_t get_convolutional_workspace_size(layer l); -convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, convolutional_layer *share_layer); +convolutional_layer 
make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer); void denormalize_convolutional_layer(convolutional_layer l); void resize_convolutional_layer(convolutional_layer *layer, int w, int h); void forward_convolutional_layer(const convolutional_layer layer, network_state state); diff --git a/src/crnn_layer.c b/src/crnn_layer.c index eaded279099..e3114fc9497 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -50,17 +50,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou l.state = (float*)calloc(l.hidden * l.batch * (l.steps + 1), sizeof(float)); l.input_layer = (layer*)calloc(1, sizeof(layer)); - *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); l.input_layer->batch = batch; if (l.workspace_size < l.input_layer->workspace_size) l.workspace_size = l.input_layer->workspace_size; l.self_layer = (layer*)calloc(1, sizeof(layer)); - *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); l.self_layer->batch = batch; if (l.workspace_size < l.self_layer->workspace_size) l.workspace_size = l.self_layer->workspace_size; l.output_layer = (layer*)calloc(1, sizeof(layer)); - *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, NULL); + *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); l.output_layer->batch = batch; if (l.workspace_size < l.output_layer->workspace_size) l.workspace_size = l.output_layer->workspace_size; diff --git a/src/layer.c b/src/layer.c index 68d1b35b68a..b6ae95dba12 100644 --- a/src/layer.c +++ b/src/layer.c @@ -13,6 +13,9 @@ void free_sublayer(layer *l) void free_layer(layer l) { if (l.share_layer != NULL) return; // don't free shared layers + if (l.antialiasing) { + free_sublayer(l.input_layer); + } if (l.type == CONV_LSTM) { if (l.peephole) { free_sublayer(l.vf); @@ -167,6 +170,7 @@ void free_layer(layer l) if (l.bias_updates_gpu) cuda_free(l.bias_updates_gpu), l.bias_updates_gpu = NULL; if (l.scales_gpu) cuda_free(l.scales_gpu), l.scales_gpu = NULL; if (l.scale_updates_gpu) cuda_free(l.scale_updates_gpu), l.scale_updates_gpu = NULL; + if (l.input_antialiasing_gpu) cuda_free(l.input_antialiasing_gpu), l.input_antialiasing_gpu = NULL; if (l.output_gpu) cuda_free(l.output_gpu), l.output_gpu = NULL; if (l.output_sigmoid_gpu) cuda_free(l.output_sigmoid_gpu), l.output_sigmoid_gpu = NULL; if (l.delta_gpu) cuda_free(l.delta_gpu), l.delta_gpu = NULL; diff --git 
a/src/parser.c b/src/parser.c index 8283f7ed0f0..fda2bacc041 100644 --- a/src/parser.c +++ b/src/parser.c @@ -161,6 +161,7 @@ convolutional_layer parse_convolutional(list *options, size_params params, netwo int stride_x = option_find_int_quiet(options, "stride_x", stride); int stride_y = option_find_int_quiet(options, "stride_y", stride); int dilation = option_find_int_quiet(options, "dilation", 1); + int antialiasing = option_find_int_quiet(options, "antialiasing", 0); if (size == 1) dilation = 1; int pad = option_find_int_quiet(options, "pad",0); int padding = option_find_int_quiet(options, "padding",0); @@ -185,7 +186,7 @@ convolutional_layer parse_convolutional(list *options, size_params params, netwo int xnor = option_find_int_quiet(options, "xnor", 0); int use_bin_output = option_find_int_quiet(options, "bin_output", 0); - convolutional_layer layer = make_convolutional_layer(batch,1,h,w,c,n,groups,size,stride_x,stride_y,dilation,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index, share_layer); + convolutional_layer layer = make_convolutional_layer(batch,1,h,w,c,n,groups,size,stride_x,stride_y,dilation,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index, antialiasing, share_layer); layer.flipped = option_find_int_quiet(options, "flipped", 0); layer.dot = option_find_float_quiet(options, "dot", 0); layer.assisted_excitation = option_find_float_quiet(options, "assisted_excitation", 0); @@ -991,10 +992,18 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps) n = n->next; ++count; if(n){ - params.h = l.out_h; - params.w = l.out_w; - params.c = l.out_c; - params.inputs = l.outputs; + if (l.antialiasing) { + params.h = l.input_layer->out_h; + params.w = l.input_layer->out_w; + params.c = l.input_layer->out_c; + params.inputs = l.input_layer->outputs; + } + else { + params.h = l.out_h; + params.w = l.out_w; + params.c = l.out_c; + params.inputs = l.outputs; + } } if (l.bflops > 0) bflops += l.bflops; } From 80ceee4fca9fd44082e587ecb70c45cbc8e26dca Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Mon, 2 Sep 2019 14:24:54 +0300 Subject: [PATCH 13/86] Added antialiasing=1 param for [convolutional]-layer on CPU (only forward inference) --- src/convolutional_layer.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 11402721545..10c1f324188 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -1135,6 +1135,18 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) //wait_until_press_key_cv(); if(l.assisted_excitation && state.train) assisted_excitation_forward(l, state); + + if (l.antialiasing) { + network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + s.net = state.net; + if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) + s.input = l.output; + forward_convolutional_layer(*(l.input_layer), s); + //simple_copy_ongpu(l.outputs*l.batch, l.output, l.input_antialiasing); + memcpy(l.output, l.input_layer->output, l.input_layer->outputs * l.input_layer->batch * sizeof(float)); + } } static box float_to_box_stride(float *f, int stride) From 9e26472b1ac245a87289b1a90af3007cd937d94b Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Mon, 2 Sep 2019 15:25:42 +0300 Subject: [PATCH 14/86] Added antialiasing=1 param for [maxpool]-layer on GPU and CPU --- src/convolutional_layer.c | 3 +- 
src/maxpool_layer.c | 115 +++++++++++++++++++++++++++-------- src/maxpool_layer.h | 2 +- src/maxpool_layer_kernels.cu | 41 ++++++++++--- src/parser.c | 3 +- 5 files changed, 126 insertions(+), 38 deletions(-) diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 10c1f324188..c5c59576349 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -576,7 +576,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, //fprintf(stderr, "%5d/%2d %2d x%2d /%2d(%d)%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", n, groups, size, size, stride, dilation, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); if (l.antialiasing) { - printf("AA: "); + printf("AA: "); l.input_layer = (layer*)calloc(1, sizeof(layer)); const int blur_size = 3; *(l.input_layer) = make_convolutional_layer(batch, steps, out_h, out_w, n, n, n, blur_size, blur_stride_x, blur_stride_y, 1, blur_size / 2, LINEAR, 0, 0, 0, 0, 0, index, 0, NULL); @@ -1141,7 +1141,6 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) s.train = state.train; s.workspace = state.workspace; s.net = state.net; - if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) s.input = l.output; forward_convolutional_layer(*(l.input_layer), s); //simple_copy_ongpu(l.outputs*l.batch, l.output, l.input_antialiasing); diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c index 000efe90663..1239262197d 100644 --- a/src/maxpool_layer.c +++ b/src/maxpool_layer.c @@ -1,4 +1,5 @@ #include "maxpool_layer.h" +#include "convolutional_layer.h" #include "dark_cuda.h" #include "gemm.h" #include @@ -45,10 +46,18 @@ void cudnn_maxpool_setup(layer *l) } -maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels) +maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels, int antialiasing) { maxpool_layer l = { (LAYER_TYPE)0 }; l.type = MAXPOOL; + + const int blur_stride_x = stride_x; + const int blur_stride_y = stride_y; + l.antialiasing = antialiasing; + if (antialiasing) { + stride_x = stride_y = l.stride = l.stride_x = l.stride_y = 1; // use stride=1 in host-layer + } + l.batch = batch; l.h = h; l.w = w; @@ -94,6 +103,46 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s else fprintf(stderr, "max %d x %d/%2dx%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, stride_y, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); + if (l.antialiasing) { + printf("AA: "); + l.input_layer = (layer*)calloc(1, sizeof(layer)); + const int blur_size = 3; + *(l.input_layer) = make_convolutional_layer(batch, 1, l.out_h, l.out_w, l.out_c, l.out_c, l.out_c, blur_size, blur_stride_x, blur_stride_y, 1, blur_size / 2, LINEAR, 0, 0, 0, 0, 0, 1, 0, NULL); + const int blur_nweights = l.out_c * blur_size * blur_size; // (n / n) * n * blur_size * blur_size; + int i; + for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) { + /* + l.input_layer->weights[i + 0] = 0; + l.input_layer->weights[i + 1] = 0; + l.input_layer->weights[i + 2] = 0; + + l.input_layer->weights[i + 3] = 0; + l.input_layer->weights[i + 4] = 1; + l.input_layer->weights[i + 5] = 0; + + l.input_layer->weights[i + 6] = 0; + l.input_layer->weights[i + 7] = 0; + l.input_layer->weights[i + 8] = 0; + */ + l.input_layer->weights[i + 0] = 1 / 16.f; + 
l.input_layer->weights[i + 1] = 2 / 16.f; + l.input_layer->weights[i + 2] = 1 / 16.f; + + l.input_layer->weights[i + 3] = 2 / 16.f; + l.input_layer->weights[i + 4] = 4 / 16.f; + l.input_layer->weights[i + 5] = 2 / 16.f; + + l.input_layer->weights[i + 6] = 1 / 16.f; + l.input_layer->weights[i + 7] = 2 / 16.f; + l.input_layer->weights[i + 8] = 1 / 16.f; + } + for (i = 0; i < l.out_c; ++i) l.input_layer->biases[i] = 0; +#ifdef GPU + l.input_antialiasing_gpu = cuda_make_array(NULL, l.batch*l.outputs); + push_convolutional_layer(*(l.input_layer)); +#endif // GPU + } + return l; } @@ -159,42 +208,54 @@ void forward_maxpool_layer(const maxpool_layer l, network_state state) if (!state.train && l.stride_x == l.stride_y) { forward_maxpool_layer_avx(state.input, l.output, l.indexes, l.size, l.w, l.h, l.out_w, l.out_h, l.c, l.pad, l.stride, l.batch); - return; } + else { - int b,i,j,k,m,n; - int w_offset = -l.pad / 2; - int h_offset = -l.pad / 2; + int b, i, j, k, m, n; + int w_offset = -l.pad / 2; + int h_offset = -l.pad / 2; - int h = l.out_h; - int w = l.out_w; - int c = l.c; + int h = l.out_h; + int w = l.out_w; + int c = l.c; - for(b = 0; b < l.batch; ++b){ - for(k = 0; k < c; ++k){ - for(i = 0; i < h; ++i){ - for(j = 0; j < w; ++j){ - int out_index = j + w*(i + h*(k + c*b)); - float max = -FLT_MAX; - int max_i = -1; - for(n = 0; n < l.size; ++n){ - for(m = 0; m < l.size; ++m){ - int cur_h = h_offset + i*l.stride_y + n; - int cur_w = w_offset + j*l.stride_x + m; - int index = cur_w + l.w*(cur_h + l.h*(k + b*l.c)); - int valid = (cur_h >= 0 && cur_h < l.h && - cur_w >= 0 && cur_w < l.w); - float val = (valid != 0) ? state.input[index] : -FLT_MAX; - max_i = (val > max) ? index : max_i; - max = (val > max) ? val : max; + for (b = 0; b < l.batch; ++b) { + for (k = 0; k < c; ++k) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int out_index = j + w*(i + h*(k + c*b)); + float max = -FLT_MAX; + int max_i = -1; + for (n = 0; n < l.size; ++n) { + for (m = 0; m < l.size; ++m) { + int cur_h = h_offset + i*l.stride_y + n; + int cur_w = w_offset + j*l.stride_x + m; + int index = cur_w + l.w*(cur_h + l.h*(k + b*l.c)); + int valid = (cur_h >= 0 && cur_h < l.h && + cur_w >= 0 && cur_w < l.w); + float val = (valid != 0) ? state.input[index] : -FLT_MAX; + max_i = (val > max) ? index : max_i; + max = (val > max) ? 
val : max; + } } + l.output[out_index] = max; + l.indexes[out_index] = max_i; } } } } + + if (l.antialiasing) { + network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + s.net = state.net; + s.input = l.output; + forward_convolutional_layer(*(l.input_layer), s); + //simple_copy_ongpu(l.outputs*l.batch, l.output, l.input_antialiasing); + memcpy(l.output, l.input_layer->output, l.input_layer->outputs * l.input_layer->batch * sizeof(float)); + } } void backward_maxpool_layer(const maxpool_layer l, network_state state) diff --git a/src/maxpool_layer.h b/src/maxpool_layer.h index 4994d45700d..cfedf9d9ee6 100644 --- a/src/maxpool_layer.h +++ b/src/maxpool_layer.h @@ -12,7 +12,7 @@ typedef layer maxpool_layer; extern "C" { #endif image get_maxpool_image(maxpool_layer l); -maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels); +maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels, int antialiasing); void resize_maxpool_layer(maxpool_layer *l, int w, int h); void forward_maxpool_layer(const maxpool_layer l, network_state state); void backward_maxpool_layer(const maxpool_layer l, network_state state); diff --git a/src/maxpool_layer_kernels.cu b/src/maxpool_layer_kernels.cu index 8e8511003e5..cc546a0b50c 100644 --- a/src/maxpool_layer_kernels.cu +++ b/src/maxpool_layer_kernels.cu @@ -3,6 +3,8 @@ #include #include "maxpool_layer.h" +#include "convolutional_layer.h" +#include "blas.h" #include "dark_cuda.h" __global__ void forward_maxpool_depth_layer_kernel(int n, int w, int h, int c, int out_c, int batch, float *input, float *output, int *indexes) @@ -163,22 +165,47 @@ extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state sta //cudnnDestroyTensorDescriptor(layer.srcTensorDesc); //cudnnDestroyTensorDescriptor(layer.dstTensorDesc); - return; } + else #endif + { + int h = layer.out_h; + int w = layer.out_w; + int c = layer.out_c; - int h = layer.out_h; - int w = layer.out_w; - int c = layer.out_c; + size_t n = h*w*c*layer.batch; - size_t n = h*w*c*layer.batch; + forward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu); + CHECK_CUDA(cudaPeekAtLastError()); + } - forward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK, 0, get_cuda_stream()>>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu); - CHECK_CUDA(cudaPeekAtLastError()); + if (layer.antialiasing) { + network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + s.net = state.net; + if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) + s.input = layer.output_gpu; + forward_convolutional_layer_gpu(*(layer.input_layer), s); + simple_copy_ongpu(layer.outputs*layer.batch, layer.output_gpu, layer.input_antialiasing_gpu); + simple_copy_ongpu(layer.input_layer->outputs*layer.input_layer->batch, layer.input_layer->output_gpu, layer.output_gpu); + } } extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network_state state) { + if (layer.antialiasing) { + network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + s.net = state.net; + s.delta = layer.delta_gpu; + s.input = 
layer.input_antialiasing_gpu; + //if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) + simple_copy_ongpu(layer.input_layer->outputs*layer.input_layer->batch, layer.delta_gpu, layer.input_layer->delta_gpu); + backward_convolutional_layer_gpu(*(layer.input_layer), s); + } + if (layer.maxpool_depth) { int h = layer.out_h; int w = layer.out_w; diff --git a/src/parser.c b/src/parser.c index fda2bacc041..b89bf0acc7b 100644 --- a/src/parser.c +++ b/src/parser.c @@ -545,6 +545,7 @@ maxpool_layer parse_maxpool(list *options, size_params params) int padding = option_find_int_quiet(options, "padding", size-1); int maxpool_depth = option_find_int_quiet(options, "maxpool_depth", 0); int out_channels = option_find_int_quiet(options, "out_channels", 1); + int antialiasing = option_find_int_quiet(options, "antialiasing", 0); int batch,h,w,c; h = params.h; @@ -553,7 +554,7 @@ maxpool_layer parse_maxpool(list *options, size_params params) batch=params.batch; if(!(h && w && c)) error("Layer before maxpool layer must output image."); - maxpool_layer layer = make_maxpool_layer(batch, h, w, c, size, stride_x, stride_y, padding, maxpool_depth, out_channels); + maxpool_layer layer = make_maxpool_layer(batch, h, w, c, size, stride_x, stride_y, padding, maxpool_depth, out_channels, antialiasing); return layer; } From a63782ca8937f412e943b6c841a5671ce39c60fc Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Mon, 2 Sep 2019 15:55:05 +0300 Subject: [PATCH 15/86] Added: efficientnet_b0.cfg --- build/darknet/x64/cfg/efficientnet_b0.cfg | 1005 +++++++++++++++++++++ cfg/efficientnet_b0.cfg | 1005 +++++++++++++++++++++ 2 files changed, 2010 insertions(+) create mode 100644 build/darknet/x64/cfg/efficientnet_b0.cfg create mode 100644 cfg/efficientnet_b0.cfg diff --git a/build/darknet/x64/cfg/efficientnet_b0.cfg b/build/darknet/x64/cfg/efficientnet_b0.cfg new file mode 100644 index 00000000000..3bd3e895bc1 --- /dev/null +++ b/build/darknet/x64/cfg/efficientnet_b0.cfg @@ -0,0 +1,1005 @@ +[net] +# Training +batch=120 +subdivisions=4 +# Testing +#batch=1 +#subdivisions=1 +height=224 +width=224 +channels=3 +momentum=0.9 +decay=0.0005 +max_crop=256 + +burn_in=1000 +#burn_in=100 +learning_rate=0.256 +policy=poly +power=4 +max_batches=800000 +momentum=0.9 +decay=0.00005 + +angle=7 +hue=.1 +saturation=.75 +exposure=.75 +aspect=.75 + + +### CONV1 - 1 (1) +# conv1 +[convolutional] +filters=32 +size=3 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +### CONV2 - MBConv1 - 1 (1) +# conv2_1_expand +[convolutional] +filters=32 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv2_1_dwise +[convolutional] +groups=32 +filters=32 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=4 (recommended r=16) +[convolutional] +filters=8 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=32 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv2_1_linear +[convolutional] +filters=16 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV3 - MBConv6 - 1 (2) +# conv2_2_expand +[convolutional] +filters=96 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv2_2_dwise +[convolutional] +groups=96 +filters=96 +size=3 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=8 (recommended r=16) +[convolutional] +filters=16 +size=1 
+stride=1 +activation=swish + +# excitation +[convolutional] +filters=96 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv2_2_linear +[convolutional] +filters=24 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV3 - MBConv6 - 2 (2) +# conv3_1_expand +[convolutional] +filters=144 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv3_1_dwise +[convolutional] +groups=144 +filters=144 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=8 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=144 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv3_1_linear +[convolutional] +filters=24 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV4 - MBConv6 - 1 (2) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_3_1 +[shortcut] +from=-9 +activation=linear + +# conv_3_2_expand +[convolutional] +filters=144 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_3_2_dwise +[convolutional] +groups=144 +filters=144 +size=5 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=8 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=144 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_3_2_linear +[convolutional] +filters=40 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV4 - MBConv6 - 2 (2) +# conv_4_1_expand +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_1_dwise +[convolutional] +groups=192 +filters=192 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=16 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=192 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_1_linear +[convolutional] +filters=40 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + + +### CONV5 - MBConv6 - 1 (3) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_4_2 +[shortcut] +from=-9 +activation=linear + +# conv_4_3_expand +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_3_dwise +[convolutional] +groups=192 +filters=192 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=16 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=192 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_3_linear +[convolutional] +filters=80 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV5 - MBConv6 - 2 (3) +# conv_4_4_expand +[convolutional] +filters=384 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_4_dwise +[convolutional] +groups=384 +filters=384 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] 
+filters=24 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=384 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_4_linear +[convolutional] +filters=80 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV5 - MBConv6 - 3 (3) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_4_4 +[shortcut] +from=-9 +activation=linear + +# conv_4_5_expand +[convolutional] +filters=384 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_5_dwise +[convolutional] +groups=384 +filters=384 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=24 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=384 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_5_linear +[convolutional] +filters=80 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV6 - MBConv6 - 1 (3) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_4_6 +[shortcut] +from=-9 +activation=linear + +# conv_4_7_expand +[convolutional] +filters=384 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_7_dwise +[convolutional] +groups=384 +filters=384 +size=5 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=24 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=384 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_7_linear +[convolutional] +filters=112 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV6 - MBConv6 - 2 (3) +# conv_5_1_expand +[convolutional] +filters=576 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_5_1_dwise +[convolutional] +groups=576 +filters=576 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=32 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=576 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_5_1_linear +[convolutional] +filters=112 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV6 - MBConv6 - 3 (3) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_5_1 +[shortcut] +from=-9 +activation=linear + +# conv_5_2_expand +[convolutional] +filters=576 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_5_2_dwise +[convolutional] +groups=576 +filters=576 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=32 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=576 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_5_2_linear +[convolutional] +filters=112 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 1 (4) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_5_2 +[shortcut] +from=-9 +activation=linear + +# conv_5_3_expand +[convolutional] +filters=576 
+size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_5_3_dwise +[convolutional] +groups=576 +filters=576 +size=5 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=32 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=576 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_5_3_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 2 (4) +# conv_6_1_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_1_dwise +[convolutional] +groups=960 +filters=960 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_1_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 3 (4) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_6_1 +[shortcut] +from=-9 +activation=linear + +# conv_6_2_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_2_dwise +[convolutional] +groups=960 +filters=960 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_2_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 4 (4) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_6_1 +[shortcut] +from=-9 +activation=linear + +# conv_6_2_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_2_dwise +[convolutional] +groups=960 +filters=960 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_2_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV8 - MBConv6 - 1 (1) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_6_2 +[shortcut] +from=-9 +activation=linear + +# conv_6_3_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_3_dwise +[convolutional] +groups=960 +filters=960 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_3_linear +[convolutional] 
+filters=320 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV9 - Conv2d 1x1 +# conv_6_4 +[convolutional] +filters=1280 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + + +[avgpool] + +[dropout] +probability=.2 + +[convolutional] +filters=1000 +size=1 +stride=1 +pad=0 +activation=linear + +[softmax] +groups=1 + +#[cost] +#type=sse + diff --git a/cfg/efficientnet_b0.cfg b/cfg/efficientnet_b0.cfg new file mode 100644 index 00000000000..3bd3e895bc1 --- /dev/null +++ b/cfg/efficientnet_b0.cfg @@ -0,0 +1,1005 @@ +[net] +# Training +batch=120 +subdivisions=4 +# Testing +#batch=1 +#subdivisions=1 +height=224 +width=224 +channels=3 +momentum=0.9 +decay=0.0005 +max_crop=256 + +burn_in=1000 +#burn_in=100 +learning_rate=0.256 +policy=poly +power=4 +max_batches=800000 +momentum=0.9 +decay=0.00005 + +angle=7 +hue=.1 +saturation=.75 +exposure=.75 +aspect=.75 + + +### CONV1 - 1 (1) +# conv1 +[convolutional] +filters=32 +size=3 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +### CONV2 - MBConv1 - 1 (1) +# conv2_1_expand +[convolutional] +filters=32 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv2_1_dwise +[convolutional] +groups=32 +filters=32 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=4 (recommended r=16) +[convolutional] +filters=8 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=32 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv2_1_linear +[convolutional] +filters=16 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV3 - MBConv6 - 1 (2) +# conv2_2_expand +[convolutional] +filters=96 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv2_2_dwise +[convolutional] +groups=96 +filters=96 +size=3 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=8 (recommended r=16) +[convolutional] +filters=16 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=96 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv2_2_linear +[convolutional] +filters=24 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV3 - MBConv6 - 2 (2) +# conv3_1_expand +[convolutional] +filters=144 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv3_1_dwise +[convolutional] +groups=144 +filters=144 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=8 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=144 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv3_1_linear +[convolutional] +filters=24 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV4 - MBConv6 - 1 (2) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_3_1 +[shortcut] +from=-9 +activation=linear + +# conv_3_2_expand +[convolutional] +filters=144 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_3_2_dwise +[convolutional] +groups=144 +filters=144 +size=5 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=8 +size=1 +stride=1 +activation=swish + +# excitation 
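+# note: this excitation conv restores the channel count (8 -> 144 here); its +# logistic output is one gate in [0,1] per channel, and the [scale_channels] +# from=-4 below multiplies those gates into the depthwise conv output saved +# four layers back, completing the squeeze-and-excitation sub-block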
+[convolutional] +filters=144 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_3_2_linear +[convolutional] +filters=40 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV4 - MBConv6 - 2 (2) +# conv_4_1_expand +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_1_dwise +[convolutional] +groups=192 +filters=192 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=16 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=192 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_1_linear +[convolutional] +filters=40 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + + +### CONV5 - MBConv6 - 1 (3) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_4_2 +[shortcut] +from=-9 +activation=linear + +# conv_4_3_expand +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_3_dwise +[convolutional] +groups=192 +filters=192 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=16 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=192 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_3_linear +[convolutional] +filters=80 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV5 - MBConv6 - 2 (3) +# conv_4_4_expand +[convolutional] +filters=384 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_4_dwise +[convolutional] +groups=384 +filters=384 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=24 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=384 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_4_linear +[convolutional] +filters=80 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV5 - MBConv6 - 3 (3) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_4_4 +[shortcut] +from=-9 +activation=linear + +# conv_4_5_expand +[convolutional] +filters=384 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_5_dwise +[convolutional] +groups=384 +filters=384 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=24 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=384 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_5_linear +[convolutional] +filters=80 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV6 - MBConv6 - 1 (3) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_4_6 +[shortcut] +from=-9 +activation=linear + +# conv_4_7_expand +[convolutional] +filters=384 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_7_dwise +[convolutional] +groups=384 +filters=384 +size=5 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + 
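+# note: every MBConv6 block repeats the same pattern: 1x1 expand conv -> depthwise +# conv (groups equals filters; the stride=2 variants downsample) -> squeeze-n-excitation +# -> linear 1x1 projection; where input and output shapes match, [dropout] plus +# [shortcut] from=-9 adds the previous block's linear projection as a residual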
+#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=24 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=384 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_7_linear +[convolutional] +filters=112 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV6 - MBConv6 - 2 (3) +# conv_5_1_expand +[convolutional] +filters=576 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_5_1_dwise +[convolutional] +groups=576 +filters=576 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=32 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=576 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_5_1_linear +[convolutional] +filters=112 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV6 - MBConv6 - 3 (3) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_5_1 +[shortcut] +from=-9 +activation=linear + +# conv_5_2_expand +[convolutional] +filters=576 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_5_2_dwise +[convolutional] +groups=576 +filters=576 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=32 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=576 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_5_2_linear +[convolutional] +filters=112 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 1 (4) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_5_2 +[shortcut] +from=-9 +activation=linear + +# conv_5_3_expand +[convolutional] +filters=576 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_5_3_dwise +[convolutional] +groups=576 +filters=576 +size=5 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=32 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=576 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_5_3_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 2 (4) +# conv_6_1_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_1_dwise +[convolutional] +groups=960 +filters=960 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_1_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 3 (4) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_6_1 +[shortcut] +from=-9 +activation=linear + +# conv_6_2_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 
+batch_normalize=1 +activation=swish + +# conv_6_2_dwise +[convolutional] +groups=960 +filters=960 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_2_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 4 (4) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_6_1 +[shortcut] +from=-9 +activation=linear + +# conv_6_2_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_2_dwise +[convolutional] +groups=960 +filters=960 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_2_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV8 - MBConv6 - 1 (1) +# dropout only before residual connection +[dropout] +probability=.2 + +# block_6_2 +[shortcut] +from=-9 +activation=linear + +# conv_6_3_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_3_dwise +[convolutional] +groups=960 +filters=960 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_3_linear +[convolutional] +filters=320 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV9 - Conv2d 1x1 +# conv_6_4 +[convolutional] +filters=1280 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + + +[avgpool] + +[dropout] +probability=.2 + +[convolutional] +filters=1000 +size=1 +stride=1 +pad=0 +activation=linear + +[softmax] +groups=1 + +#[cost] +#type=sse + From be5d0d66933e50585688bc86bb42786de55893ab Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Tue, 3 Sep 2019 01:35:05 +0300 Subject: [PATCH 16/86] Added assisted_excitation=1 for [convolutional] layer on GPU --- include/darknet.h | 3 + src/conv_lstm_layer.c | 22 ++-- src/convolutional_kernels.cu | 191 +++++++++++++++++++++++++++++++++++ src/convolutional_layer.c | 25 +++-- src/convolutional_layer.h | 3 +- src/crnn_layer.c | 6 +- src/maxpool_layer.c | 2 +- src/parser.c | 6 +- 8 files changed, 232 insertions(+), 26 deletions(-) diff --git a/include/darknet.h b/include/darknet.h index a7a62b47bd7..e78abe6a5c9 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -537,6 +537,9 @@ struct layer { float * rand_gpu; float * squared_gpu; float * norms_gpu; + + float *gt_gpu; + float *a_avg_gpu; #ifdef CUDNN cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc; cudnnTensorDescriptor_t srcTensorDesc16, dstTensorDesc16; diff --git a/src/conv_lstm_layer.c b/src/conv_lstm_layer.c index a6da3bf0c2c..4ae67b44a83 100644 --- a/src/conv_lstm_layer.c +++ b/src/conv_lstm_layer.c @@ -66,44 +66,44 @@ layer 
make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i // U l.uf = (layer*)calloc(1, sizeof(layer)); - *(l.uf) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.uf) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.uf->batch = batch; if (l.workspace_size < l.uf->workspace_size) l.workspace_size = l.uf->workspace_size; l.ui = (layer*)calloc(1, sizeof(layer)); - *(l.ui) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.ui) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.ui->batch = batch; if (l.workspace_size < l.ui->workspace_size) l.workspace_size = l.ui->workspace_size; l.ug = (layer*)calloc(1, sizeof(layer)); - *(l.ug) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.ug) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.ug->batch = batch; if (l.workspace_size < l.ug->workspace_size) l.workspace_size = l.ug->workspace_size; l.uo = (layer*)calloc(1, sizeof(layer)); - *(l.uo) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.uo) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.uo->batch = batch; if (l.workspace_size < l.uo->workspace_size) l.workspace_size = l.uo->workspace_size; // W l.wf = (layer*)calloc(1, sizeof(layer)); - *(l.wf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.wf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.wf->batch = batch; if (l.workspace_size < l.wf->workspace_size) l.workspace_size = l.wf->workspace_size; l.wi = (layer*)calloc(1, sizeof(layer)); - *(l.wi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.wi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.wi->batch = batch; if (l.workspace_size < l.wi->workspace_size) l.workspace_size = l.wi->workspace_size; l.wg = (layer*)calloc(1, sizeof(layer)); - *(l.wg) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.wg) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, 
activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.wg->batch = batch; if (l.workspace_size < l.wg->workspace_size) l.workspace_size = l.wg->workspace_size; l.wo = (layer*)calloc(1, sizeof(layer)); - *(l.wo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.wo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.wo->batch = batch; if (l.workspace_size < l.wo->workspace_size) l.workspace_size = l.wo->workspace_size; @@ -111,21 +111,21 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i // V l.vf = (layer*)calloc(1, sizeof(layer)); if (l.peephole) { - *(l.vf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.vf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.vf->batch = batch; if (l.workspace_size < l.vf->workspace_size) l.workspace_size = l.vf->workspace_size; } l.vi = (layer*)calloc(1, sizeof(layer)); if (l.peephole) { - *(l.vi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.vi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.vi->batch = batch; if (l.workspace_size < l.vi->workspace_size) l.workspace_size = l.vi->workspace_size; } l.vo = (layer*)calloc(1, sizeof(layer)); if (l.peephole) { - *(l.vo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.vo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.vo->batch = batch; if (l.workspace_size < l.vo->workspace_size) l.workspace_size = l.vo->workspace_size; } diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index b476ac76e3d..566fb893335 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -605,6 +605,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) fix_nan_and_inf(l.output_gpu, l.outputs*l.batch); } + if(l.assisted_excitation && state.train) assisted_excitation_forward_gpu(l, state); + if (l.antialiasing) { network_state s = { 0 }; s.train = state.train; @@ -890,6 +892,195 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state } } +static box float_to_box_stride(float *f, int stride) +{ + box b = { 0 }; + b.x = f[0]; + b.y = f[1 * stride]; + b.w = f[2 * stride]; + b.h = f[3 * stride]; + return b; +} + +__global__ void calc_avg_activation_kernel(float *src, float *dst, int size, int channels, int batches) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int xy = i % size; + int b = i / size; + + if (i < size*batches) { + dst[i] = 0; + for (int c = 0; c < channels; ++c) { + dst[i] += src[xy + size*(c + 
channels*b)]; + } + dst[i] = dst[i] / channels; + } +} + +#include <iostream> + +void calc_avg_activation_gpu(float *src, float *dst, int size, int channels, int batches) +{ + const int num_blocks = get_number_of_blocks(size*batches, BLOCK); + + std::cout << " size = " << size << ", channels = " << channels << ", batches = " << batches << std::endl; + calc_avg_activation_kernel <<< num_blocks, BLOCK, 0, get_cuda_stream() >>> (src, dst, size, channels, batches); +} + + +__global__ void assisted_activation_kernel(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int xy = i % size; + int b = i / size; + + if (b < batches) { + for (int c = 0; c < channels; ++c) { + output[xy + size*(c + channels*b)] += alpha * gt_gpu[i] * a_avg_gpu[i]; + } + } +} + +void assisted_activation_gpu(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches) +{ + const int num_blocks = get_number_of_blocks(size*batches, BLOCK); + + assisted_activation_kernel <<< num_blocks, BLOCK, 0, get_cuda_stream() >>> (alpha, output, gt_gpu, a_avg_gpu, size, channels, batches); +} + +void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) +{ + const int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + + // epoch + const float epoch = (float)(*state.net.seen) / state.net.train_images_num; + + // calculate alpha + //const float alpha = (1 + cos(3.141592 * iteration_num)) / (2 * state.net.max_batches); + //const float alpha = (1 + cos(3.141592 * epoch)) / (2 * state.net.max_batches); + const float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)) / 2; + + //printf("\n epoch = %f, alpha = %f, seen = %d, max_batches = %d, train_images_num = %d \n", + // epoch, alpha, (*state.net.seen), state.net.max_batches, state.net.train_images_num); + + //const int size = l.outputs * l.batch; + + float *a_avg = (float *)calloc(l.out_w * l.out_h * l.batch, sizeof(float)); + float *gt = (float *)calloc(l.out_w * l.out_h * l.batch, sizeof(float)); + + int b; + int w, h, c; + + l.max_boxes = state.net.num_boxes; + l.truths = l.max_boxes*(4 + 1); + + int num_truth = l.batch*l.truths; + float *truth_cpu = (float *)calloc(num_truth, sizeof(float)); + cuda_pull_array(state.truth, truth_cpu, num_truth); + //cudaStreamSynchronize(get_cuda_stream()); + //CHECK_CUDA(cudaPeekAtLastError()); + + for (b = 0; b < l.batch; ++b) + { + // calculate G + int t; + for (t = 0; t < state.net.num_boxes; ++t) { + box truth = float_to_box_stride(truth_cpu + t*(4 + 1) + b*l.truths, 1); + if (!truth.x) break; // continue; + + int left = floor((truth.x - truth.w / 2) * l.out_w); + int right = ceil((truth.x + truth.w / 2) * l.out_w); + int top = floor((truth.y - truth.h / 2) * l.out_h); + int bottom = ceil((truth.y + truth.h / 2) * l.out_h); + + for (w = left; w <= right; w++) { + for (h = top; h < bottom; h++) { + gt[w + l.out_w * h + l.out_w*l.out_h*b] = 1; + } + } + } + } + + cuda_push_array(l.gt_gpu, gt, l.out_w * l.out_h * l.batch); + //cudaStreamSynchronize(get_cuda_stream()); + //CHECK_CUDA(cudaPeekAtLastError()); + + // calc avg_output on GPU - for whole batch + calc_avg_activation_gpu(l.output_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); + //cudaStreamSynchronize(get_cuda_stream()); + //CHECK_CUDA(cudaPeekAtLastError()); + + // calc new output + assisted_activation_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); + //cudaStreamSynchronize(get_cuda_stream()); + //CHECK_CUDA(cudaPeekAtLastError()); + 
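+ // summary of the math above: alpha = (1 + cos(pi * iteration_num / max_batches)) / 2 + // decays smoothly from 1 to 0 over training; gt(i,j) is 1 inside any ground-truth box + // and 0 elsewhere; a_avg(i,j) is the mean activation over channels; so every channel + // gets output(c,i,j) += alpha * gt(i,j) * a_avg(i,j), i.e. activations are boosted only + // inside object regions, with the boost fading out as training progresses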
+ + /* + for (b = 0; b < l.batch; ++b) + { + // calculate average A + for (w = 0; w < l.out_w; w++) { + for (h = 0; h < l.out_h; h++) { + for (c = 0; c < l.out_c; c++) { + a_avg[w + l.out_w*(h + l.out_h*b)] += l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))]; + } + a_avg[w + l.out_w*(h + l.out_h*b)] /= l.out_c; // a_avg / d + } + } + } + + // change activation + for (b = 0; b < l.batch; ++b) + { + for (w = 0; w < l.out_w; w++) { + for (h = 0; h < l.out_h; h++) { + for (c = 0; c < l.out_c; c++) + { + // a = a + alpha(t) + e(c,i,j) = a + alpha(t) + g(i,j) * avg_a(i,j) / channels + l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))] += + alpha * + g[w + l.out_w*(h + l.out_h*b)] * + a_avg[w + l.out_w*(h + l.out_h*b)]; + + //l.output[w + l.out_w*(h + l.out_h*(c + l.out_c*b))] = + // alpha * g[w + l.out_w*(h + l.out_h*b)] * a_avg[w + l.out_w*(h + l.out_h*b)]; + } + } + } + } + */ + + if (0) // visualize ground truth + { +#ifdef OPENCV + cuda_pull_array(l.output_gpu, l.output, l.outputs * l.batch); + cudaStreamSynchronize(get_cuda_stream()); + CHECK_CUDA(cudaPeekAtLastError()); + + for (b = 0; b < l.batch; ++b) + { + image img = float_to_image(l.out_w, l.out_h, 1, &gt[l.out_w*l.out_h*b]); + char buff[100]; + sprintf(buff, "a_excitation_%d", b); + show_image_cv(img, buff); + + image img2 = float_to_image(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]); + char buff2[100]; + sprintf(buff2, "a_excitation_act_%d", b); + show_image_cv(img2, buff2); + wait_key_cv(5); + } + wait_until_press_key_cv(); +#endif // OPENCV + } + + free(truth_cpu); + free(gt); + free(a_avg); +} + void pull_convolutional_layer(convolutional_layer l) { cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights); diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index c5c59576349..157058eb91c 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -332,7 +332,7 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) #endif #endif -convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer) +convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer, int assisted_excitation) { int total_batch = batch*steps; int i; @@ -349,6 +349,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, stride_x = stride_y = l.stride = l.stride_x = l.stride_y = 1; // use stride=1 in host-layer } + l.assisted_excitation = assisted_excitation; l.share_layer = share_layer; l.index = index; l.h = h; @@ -503,7 +504,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, #ifdef CUDNN_HALF l.weights_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1); l.weight_updates_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1); -#endif +#endif // CUDNN_HALF l.biases_gpu = cuda_make_array(l.biases, n); l.bias_updates_gpu = cuda_make_array(l.bias_updates, n); } @@ -547,19 +548,27 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.x_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n); 
l.x_norm_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n); } + + if (l.assisted_excitation) + { + const int size = l.out_w * l.out_h * l.batch; + l.gt_gpu = cuda_make_array(NULL, size); + l.a_avg_gpu = cuda_make_array(NULL, size); + } #ifdef CUDNN create_convolutional_cudnn_tensors(&l); cudnn_convolutional_setup(&l, cudnn_fastest); -#endif +#endif // CUDNN } -#endif +#endif // GPU l.workspace_size = get_convolutional_workspace_size(l); //fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c); l.bflops = (2.0 * l.nweights * l.out_h*l.out_w) / 1000000000.; if (l.xnor && l.use_bin_output) fprintf(stderr, "convXB"); else if (l.xnor) fprintf(stderr, "convX "); - else if(l.share_layer) fprintf(stderr, "convS "); + else if (l.share_layer) fprintf(stderr, "convS "); + else if (l.assisted_excitation) fprintf(stderr, "convAE"); else fprintf(stderr, "conv "); if (groups > 1) fprintf(stderr, "%5d/%4d ", n, groups); @@ -579,7 +588,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, printf("AA: "); l.input_layer = (layer*)calloc(1, sizeof(layer)); const int blur_size = 3; - *(l.input_layer) = make_convolutional_layer(batch, steps, out_h, out_w, n, n, n, blur_size, blur_stride_x, blur_stride_y, 1, blur_size / 2, LINEAR, 0, 0, 0, 0, 0, index, 0, NULL); + *(l.input_layer) = make_convolutional_layer(batch, steps, out_h, out_w, n, n, n, blur_size, blur_stride_x, blur_stride_y, 1, blur_size / 2, LINEAR, 0, 0, 0, 0, 0, index, 0, NULL, 0); const int blur_nweights = n * blur_size * blur_size; // (n / n) * n * blur_size * blur_size; int i; for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) { @@ -636,7 +645,7 @@ void denormalize_convolutional_layer(convolutional_layer l) void test_convolutional_layer() { - convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 1, 5, 2, 2, 1, 1, LEAKY, 1, 0, 0, 0, 0, 0, 0, NULL); + convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 1, 5, 2, 2, 1, 1, LEAKY, 1, 0, 0, 0, 0, 0, 0, NULL, 0); l.batch_normalize = 1; float data[] = {1,1,1,1,1, 1,1,1,1,1, @@ -1236,7 +1245,7 @@ void assisted_excitation_forward(convolutional_layer l, network_state state) } } - if(0) // visualize ground truth + if(1) // visualize ground truth { #ifdef OPENCV for (b = 0; b < l.batch; ++b) diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index 1012663a5b3..0072ce549c3 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -30,7 +30,7 @@ void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16); #endif size_t get_convolutional_workspace_size(layer l); -convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer); +convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer, int assisted_excitation); void denormalize_convolutional_layer(convolutional_layer l); void resize_convolutional_layer(convolutional_layer *layer, int w, int h); void forward_convolutional_layer(const 
convolutional_layer layer, network_state state); @@ -57,6 +57,7 @@ int convolutional_out_width(convolutional_layer layer); void rescale_weights(convolutional_layer l, float scale, float trans); void rgbgr_weights(convolutional_layer l); void assisted_excitation_forward(convolutional_layer l, network_state state); +void assisted_excitation_forward_gpu(convolutional_layer l, network_state state); #ifdef __cplusplus } diff --git a/src/crnn_layer.c b/src/crnn_layer.c index e3114fc9497..588db7411a0 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -50,17 +50,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou l.state = (float*)calloc(l.hidden * l.batch * (l.steps + 1), sizeof(float)); l.input_layer = (layer*)calloc(1, sizeof(layer)); - *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.input_layer->batch = batch; if (l.workspace_size < l.input_layer->workspace_size) l.workspace_size = l.input_layer->workspace_size; l.self_layer = (layer*)calloc(1, sizeof(layer)); - *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.self_layer->batch = batch; if (l.workspace_size < l.self_layer->workspace_size) l.workspace_size = l.self_layer->workspace_size; l.output_layer = (layer*)calloc(1, sizeof(layer)); - *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL); + *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); l.output_layer->batch = batch; if (l.workspace_size < l.output_layer->workspace_size) l.workspace_size = l.output_layer->workspace_size; diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c index 1239262197d..27d338603d1 100644 --- a/src/maxpool_layer.c +++ b/src/maxpool_layer.c @@ -107,7 +107,7 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s printf("AA: "); l.input_layer = (layer*)calloc(1, sizeof(layer)); const int blur_size = 3; - *(l.input_layer) = make_convolutional_layer(batch, 1, l.out_h, l.out_w, l.out_c, l.out_c, l.out_c, blur_size, blur_stride_x, blur_stride_y, 1, blur_size / 2, LINEAR, 0, 0, 0, 0, 0, 1, 0, NULL); + *(l.input_layer) = make_convolutional_layer(batch, 1, l.out_h, l.out_w, l.out_c, l.out_c, l.out_c, blur_size, blur_stride_x, blur_stride_y, 1, blur_size / 2, LINEAR, 0, 0, 0, 0, 0, 1, 0, NULL, 0); const int blur_nweights = l.out_c * blur_size * blur_size; // (n / n) * n * blur_size * blur_size; int i; for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) { diff --git a/src/parser.c b/src/parser.c index b89bf0acc7b..97d6aef9c96 100644 --- a/src/parser.c +++ b/src/parser.c @@ -170,6 +170,8 @@ convolutional_layer parse_convolutional(list 
*options, size_params params, netwo
     char *activation_s = option_find_str(options, "activation", "logistic");
     ACTIVATION activation = get_activation(activation_s);
 
+    int assisted_excitation = option_find_float_quiet(options, "assisted_excitation", 0);
+
     int share_index = option_find_int_quiet(options, "share_index", -1000000000);
     convolutional_layer *share_layer = NULL;
     if(share_index >= 0) share_layer = &net.layers[share_index];
@@ -186,10 +188,10 @@ convolutional_layer parse_convolutional(list *options, size_params params, netwo
     int xnor = option_find_int_quiet(options, "xnor", 0);
     int use_bin_output = option_find_int_quiet(options, "bin_output", 0);
 
-    convolutional_layer layer = make_convolutional_layer(batch,1,h,w,c,n,groups,size,stride_x,stride_y,dilation,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index, antialiasing, share_layer);
+    convolutional_layer layer = make_convolutional_layer(batch,1,h,w,c,n,groups,size,stride_x,stride_y,dilation,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index, antialiasing, share_layer, assisted_excitation);
     layer.flipped = option_find_int_quiet(options, "flipped", 0);
     layer.dot = option_find_float_quiet(options, "dot", 0);
-    layer.assisted_excitation = option_find_float_quiet(options, "assisted_excitation", 0);
+
 
     if(params.net.adam){
         layer.B1 = params.net.B1;

From 9c02df864e32259292e3189a0879b361165eadfb Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Wed, 4 Sep 2019 18:50:56 +0300
Subject: [PATCH 17/86] Fixed assisted_excitation and added also for
 [shortcut] layer

---
 src/convolutional_kernels.cu | 33 ++++++++++++++++++++++++++-------
 src/convolutional_layer.c | 9 +++++++--
 src/image.c | 14 ++++++++++++++
 src/image.h | 1 +
 src/layer.c | 4 ++++
 src/parser.c | 3 ++-
 src/shortcut_layer.c | 21 +++++++++++++++++----
 src/shortcut_layer.h | 2 +-
 8 files changed, 72 insertions(+), 15 deletions(-)

diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index 566fb893335..d766c9cf7cb 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -917,13 +917,10 @@ __global__ void calc_avg_activation_kernel(float *src, float *dst, int size, int
     }
 }
 
-#include <iostream>
-
 void calc_avg_activation_gpu(float *src, float *dst, int size, int channels, int batches)
 {
     const int num_blocks = get_number_of_blocks(size*batches, BLOCK);
-    std::cout << " size = " << size << ", channels = " << channels << ", batches = " << batches << std::endl;
 
     calc_avg_activation_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (src, dst, size, channels, batches);
 }
@@ -937,6 +934,9 @@ __global__ void assisted_activation_kernel(float alpha, float *output, float *gt
     if (b < batches) {
         for (int c = 0; c < channels; ++c) {
             output[xy + size*(c + channels*b)] += alpha * gt_gpu[i] * a_avg_gpu[i];
+            //output[xy + size*(c + channels*b)] += gt_gpu[i] * a_avg_gpu[i];
+            //output[xy + size*(c + channels*b)] += gt_gpu[i] * output[xy + size*(c + channels*b)];
+            //output[xy + size*(c + channels*b)] = a_avg_gpu[i];
         }
     }
 }
@@ -953,12 +953,18 @@ void assisted_excitation_forward_gpu(convolutional_layer l, network_state state)
     const int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
 
     // epoch
-    const float epoch = (float)(*state.net.seen) / state.net.train_images_num;
+    //const float epoch = (float)(*state.net.seen) / state.net.train_images_num;
 
     // calculate alpha
     //const float alpha = (1 + cos(3.141592 * iteration_num)) / (2 * state.net.max_batches);
     //const float alpha = (1 + cos(3.141592 * epoch)) / (2 * state.net.max_batches);
-    const float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)) / 2;
+    //const float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)) / 2;
+    float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches));
+
+    if (l.assisted_excitation > 1) {
+        if (iteration_num > l.assisted_excitation) alpha = 0;
+        else alpha = (1 + cos(3.141592 * iteration_num / l.assisted_excitation));
+    }
 
     //printf("\n epoch = %f, alpha = %f, seen = %d, max_batches = %d, train_images_num = %d \n",
     //    epoch, alpha, (*state.net.seen), state.net.max_batches, state.net.train_images_num);
@@ -969,7 +975,7 @@ void assisted_excitation_forward_gpu(convolutional_layer l, network_state state)
     float *gt = (float *)calloc(l.out_w * l.out_h * l.batch, sizeof(float));
 
     int b;
-    int w, h, c;
+    int w, h;
 
     l.max_boxes = state.net.num_boxes;
     l.truths = l.max_boxes*(4 + 1);
@@ -1061,15 +1067,28 @@ void assisted_excitation_forward_gpu(convolutional_layer l, network_state state)
 
         for (b = 0; b < l.batch; ++b)
         {
+            printf(" Assisted Excitation alpha = %f \n", alpha);
             image img = float_to_image(l.out_w, l.out_h, 1, &gt[l.out_w*l.out_h*b]);
             char buff[100];
             sprintf(buff, "a_excitation_%d", b);
             show_image_cv(img, buff);
 
-            image img2 = float_to_image(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]);
+            //image img2 = float_to_image(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]);
+            image img2 = float_to_image_scaled(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]);
             char buff2[100];
             sprintf(buff2, "a_excitation_act_%d", b);
             show_image_cv(img2, buff2);
+
+            /*
+            int c = l.out_c;
+            if (c > 4) c = 4;
+            image img3 = float_to_image(l.out_w, l.out_h, c, &l.output[l.out_w*l.out_h*l.out_c*b]);
+            image dc = collapse_image_layers(img3, 1);
+            char buff3[100];
+            sprintf(buff3, "a_excitation_act_collapsed_%d", b);
+            show_image_cv(dc, buff3);
+            */
+
             wait_key_cv(5);
         }
         wait_until_press_key_cv();
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index 157058eb91c..72bb602589e 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -1172,12 +1172,17 @@ void assisted_excitation_forward(convolutional_layer l, network_state state)
     const int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
 
     // epoch
-    const float epoch = (float)(*state.net.seen) / state.net.train_images_num;
+    //const float epoch = (float)(*state.net.seen) / state.net.train_images_num;
 
     // calculate alpha
     //const float alpha = (1 + cos(3.141592 * iteration_num)) / (2 * state.net.max_batches);
     //const float alpha = (1 + cos(3.141592 * epoch)) / (2 * state.net.max_batches);
-    const float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)) / 2;
+    float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches));
+
+    if (l.assisted_excitation > 1) {
+        if (iteration_num > l.assisted_excitation) alpha = 0;
+        else alpha = (1 + cos(3.141592 * iteration_num / l.assisted_excitation));
+    }
 
     //printf("\n epoch = %f, alpha = %f, seen = %d, max_batches = %d, train_images_num = %d \n",
     //    epoch, alpha, (*state.net.seen), state.net.max_batches, state.net.train_images_num);
diff --git a/src/image.c b/src/image.c
index 2f085801dbe..8befaa2b8c6 100644
--- a/src/image.c
+++ b/src/image.c
@@ -770,6 +770,20 @@ image make_random_image(int w, int h, int c)
     return out;
 }
 
+image float_to_image_scaled(int w, int h, int c, float *data)
+{
+    image out = make_image(w, h, c);
+    int abs_max = 0;
+    int i = 0;
+    for (i = 0; i < w*h*c; ++i) {
+        if
(fabs(data[i]) > abs_max) abs_max = fabs(data[i]); + } + for (i = 0; i < w*h*c; ++i) { + out.data[i] = data[i] / abs_max; + } + return out; +} + image float_to_image(int w, int h, int c, float *data) { image out = make_empty_image(w,h,c); diff --git a/src/image.h b/src/image.h index 3a1c5b9a73c..14792c9b9ff 100644 --- a/src/image.h +++ b/src/image.h @@ -79,6 +79,7 @@ void print_image(image m); //LIB_API image make_image(int w, int h, int c); image make_random_image(int w, int h, int c); image make_empty_image(int w, int h, int c); +image float_to_image_scaled(int w, int h, int c, float *data); image float_to_image(int w, int h, int c, float *data); image copy_image(image p); image load_image(char *filename, int w, int h, int c); diff --git a/src/layer.c b/src/layer.c index b6ae95dba12..e9ae67b5ff5 100644 --- a/src/layer.c +++ b/src/layer.c @@ -157,6 +157,10 @@ void free_layer(layer l) if (l.x_gpu) cuda_free(l.x_gpu); // dont free if (l.x_norm_gpu) cuda_free(l.x_norm_gpu); + // assisted excitation + if (l.gt_gpu) cuda_free(l.gt_gpu); + if (l.a_avg_gpu) cuda_free(l.a_avg_gpu); + if (l.align_bit_weights_gpu) cuda_free((float *)l.align_bit_weights_gpu); if (l.mean_arr_gpu) cuda_free(l.mean_arr_gpu); if (l.align_workspace_gpu) cuda_free(l.align_workspace_gpu); diff --git a/src/parser.c b/src/parser.c index 97d6aef9c96..4b56dfc4c10 100644 --- a/src/parser.c +++ b/src/parser.c @@ -601,6 +601,7 @@ layer parse_batchnorm(list *options, size_params params) layer parse_shortcut(list *options, size_params params, network net) { + int assisted_excitation = option_find_float_quiet(options, "assisted_excitation", 0); char *l = option_find(options, "from"); int index = atoi(l); if(index < 0) index = params.index + index; @@ -608,7 +609,7 @@ layer parse_shortcut(list *options, size_params params, network net) int batch = params.batch; layer from = net.layers[index]; - layer s = make_shortcut_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c); + layer s = make_shortcut_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c, assisted_excitation); char *activation_s = option_find_str(options, "activation", "linear"); ACTIVATION activation = get_activation(activation_s); diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c index 1f7c6d35e90..d056a6a0a20 100644 --- a/src/shortcut_layer.c +++ b/src/shortcut_layer.c @@ -4,9 +4,10 @@ #include #include -layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2) +layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2, int assisted_excitation) { - fprintf(stderr,"Shortcut Layer: %d\n", index); + if(assisted_excitation) fprintf(stderr, "Shortcut Layer - AE: %d\n", index); + else fprintf(stderr,"Shortcut Layer: %d\n", index); layer l = { (LAYER_TYPE)0 }; l.type = SHORTCUT; l.batch = batch; @@ -19,6 +20,8 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int l.outputs = w*h*c; l.inputs = l.outputs; + l.assisted_excitation = assisted_excitation; + if(w != w2 || h != h2 || c != c2) fprintf(stderr, " w = %d, w2 = %d, h = %d, h2 = %d, c = %d, c2 = %d \n", w, w2, h, h2, c, c2); l.index = index; @@ -28,13 +31,19 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int l.forward = forward_shortcut_layer; l.backward = backward_shortcut_layer; - #ifdef GPU +#ifdef GPU l.forward_gpu = forward_shortcut_layer_gpu; l.backward_gpu = backward_shortcut_layer_gpu; l.delta_gpu = 
cuda_make_array(l.delta, l.outputs*batch); l.output_gpu = cuda_make_array(l.output, l.outputs*batch); - #endif + if (l.assisted_excitation) + { + const int size = l.out_w * l.out_h * l.batch; + l.gt_gpu = cuda_make_array(NULL, size); + l.a_avg_gpu = cuda_make_array(NULL, size); + } +#endif // GPU return l; } @@ -72,6 +81,8 @@ void forward_shortcut_layer(const layer l, network_state state) shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output); } activate_array(l.output, l.outputs*l.batch, l.activation); + + if (l.assisted_excitation && state.train) assisted_excitation_forward(l, state); } void backward_shortcut_layer(const layer l, network_state state) @@ -89,6 +100,8 @@ void forward_shortcut_layer_gpu(const layer l, network_state state) //shortcut_gpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu); input_shortcut_gpu(state.input, l.batch, l.w, l.h, l.c, state.net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu); activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); + + if (l.assisted_excitation && state.train) assisted_excitation_forward_gpu(l, state); } void backward_shortcut_layer_gpu(const layer l, network_state state) diff --git a/src/shortcut_layer.h b/src/shortcut_layer.h index b24aa3e6682..ad8d45f3e28 100644 --- a/src/shortcut_layer.h +++ b/src/shortcut_layer.h @@ -7,7 +7,7 @@ #ifdef __cplusplus extern "C" { #endif -layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2); +layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2, int assisted_excitation); void forward_shortcut_layer(const layer l, network_state state); void backward_shortcut_layer(const layer l, network_state state); void resize_shortcut_layer(layer *l, int w, int h); From 1e0b50c29e3c589ddbcccafc84b1ccd7c538c16b Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Wed, 4 Sep 2019 18:51:40 +0300 Subject: [PATCH 18/86] Added yolov3-tiny-prn.cfg and enet-coco.cfg (EfficientNetb0-Yolo- 45.5% mAP@0.5 - 3.7 BFlops) https://github.com/WongKinYiu/PartialResidualNetworks --- README.md | 6 + build/darknet/x64/cfg/enet-coco.cfg | 1072 +++++++++++++++++++++ build/darknet/x64/cfg/yolov3-tiny-prn.cfg | 199 ++++ cfg/enet-coco.cfg | 1072 +++++++++++++++++++++ cfg/yolov3-tiny-prn.cfg | 199 ++++ 5 files changed, 2548 insertions(+) create mode 100644 build/darknet/x64/cfg/enet-coco.cfg create mode 100644 build/darknet/x64/cfg/yolov3-tiny-prn.cfg create mode 100644 cfg/enet-coco.cfg create mode 100644 cfg/yolov3-tiny-prn.cfg diff --git a/README.md b/README.md index 03492ade7e8..b79d9786e1c 100644 --- a/README.md +++ b/README.md @@ -63,12 +63,18 @@ There are weights-file for different cfg-files (smaller size -> faster speed & l * `yolov3-spp.cfg` (240 MB COCO **Yolo v3**) - requires 4 GB GPU-RAM: https://pjreddie.com/media/files/yolov3-spp.weights * `yolov3.cfg` (236 MB COCO **Yolo v3**) - requires 4 GB GPU-RAM: https://pjreddie.com/media/files/yolov3.weights * `yolov3-tiny.cfg` (34 MB COCO **Yolo v3 tiny**) - requires 1 GB GPU-RAM: https://pjreddie.com/media/files/yolov3-tiny.weights +* `enet-coco.cfg` (EfficientNetb0-Yolo- 45.5% mAP@0.5 - 3.7 BFlops) [enetb0-coco_final.weights](https://drive.google.com/file/d/1FlHeQjWEQVJt0ay1PVsiuuMzmtNyv36m/view) and `yolov3-tiny-prn.cfg` (33.1% mAP@0.5 - 3.5 BFlops - [more](https://github.com/WongKinYiu/PartialResidualNetworks)) + +
CLICK ME - Yolo v2 models + * `yolov2.cfg` (194 MB COCO Yolo v2) - requires 4 GB GPU-RAM: https://pjreddie.com/media/files/yolov2.weights * `yolo-voc.cfg` (194 MB VOC Yolo v2) - requires 4 GB GPU-RAM: http://pjreddie.com/media/files/yolo-voc.weights * `yolov2-tiny.cfg` (43 MB COCO Yolo v2) - requires 1 GB GPU-RAM: https://pjreddie.com/media/files/yolov2-tiny.weights * `yolov2-tiny-voc.cfg` (60 MB VOC Yolo v2) - requires 1 GB GPU-RAM: http://pjreddie.com/media/files/yolov2-tiny-voc.weights * `yolo9000.cfg` (186 MB Yolo9000-model) - requires 4 GB GPU-RAM: http://pjreddie.com/media/files/yolo9000.weights +
+ Put it near compiled: darknet.exe You can get cfg-files by path: `darknet/cfg/` diff --git a/build/darknet/x64/cfg/enet-coco.cfg b/build/darknet/x64/cfg/enet-coco.cfg new file mode 100644 index 00000000000..b530ed360b3 --- /dev/null +++ b/build/darknet/x64/cfg/enet-coco.cfg @@ -0,0 +1,1072 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=64 +subdivisions=8 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +### CONV1 - 1 (1) +# conv1 +[convolutional] +filters=32 +size=3 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +### CONV2 - MBConv1 - 1 (1) +# conv2_1_expand +[convolutional] +filters=32 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv2_1_dwise +[convolutional] +groups=32 +filters=32 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=4 (recommended r=16) +[convolutional] +filters=8 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=32 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv2_1_linear +[convolutional] +filters=16 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV3 - MBConv6 - 1 (2) +# conv2_2_expand +[convolutional] +filters=96 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv2_2_dwise +[convolutional] +groups=96 +filters=96 +size=3 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=8 (recommended r=16) +[convolutional] +filters=16 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=96 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv2_2_linear +[convolutional] +filters=24 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV3 - MBConv6 - 2 (2) +# conv3_1_expand +[convolutional] +filters=144 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv3_1_dwise +[convolutional] +groups=144 +filters=144 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=8 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=144 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv3_1_linear +[convolutional] +filters=24 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV4 - MBConv6 - 1 (2) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_3_1 +[shortcut] +from=-9 +activation=linear + +# conv_3_2_expand +[convolutional] +filters=144 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_3_2_dwise +[convolutional] +groups=144 +filters=144 +size=5 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=8 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=144 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_3_2_linear +[convolutional] +filters=40 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV4 - MBConv6 - 2 (2) +# conv_4_1_expand 
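# (every MBConv block in this file follows the same pattern: a 1x1 "expand"
# convolution, a depthwise convolution with groups equal to filters, a
# squeeze-and-excitation branch, and a 1x1 linear projection; the [shortcut]
# from=-9 stanzas close the residual connection around the nine preceding layers)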
+[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_1_dwise +[convolutional] +groups=192 +filters=192 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=16 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=192 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_1_linear +[convolutional] +filters=40 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + + +### CONV5 - MBConv6 - 1 (3) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_4_2 +[shortcut] +from=-9 +activation=linear + +# conv_4_3_expand +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_3_dwise +[convolutional] +groups=192 +filters=192 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=16 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=192 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_3_linear +[convolutional] +filters=80 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV5 - MBConv6 - 2 (3) +# conv_4_4_expand +[convolutional] +filters=384 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_4_dwise +[convolutional] +groups=384 +filters=384 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=24 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=384 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_4_linear +[convolutional] +filters=80 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV5 - MBConv6 - 3 (3) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_4_4 +[shortcut] +from=-9 +activation=linear + +# conv_4_5_expand +[convolutional] +filters=384 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_5_dwise +[convolutional] +groups=384 +filters=384 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=24 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=384 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_5_linear +[convolutional] +filters=80 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV6 - MBConv6 - 1 (3) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_4_6 +[shortcut] +from=-9 +activation=linear + +# conv_4_7_expand +[convolutional] +filters=384 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_7_dwise +[convolutional] +groups=384 +filters=384 +size=5 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=24 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=384 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# 
conv_4_7_linear +[convolutional] +filters=112 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV6 - MBConv6 - 2 (3) +# conv_5_1_expand +[convolutional] +filters=576 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_5_1_dwise +[convolutional] +groups=576 +filters=576 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=32 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=576 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_5_1_linear +[convolutional] +filters=112 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV6 - MBConv6 - 3 (3) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_5_1 +[shortcut] +from=-9 +activation=linear + +# conv_5_2_expand +[convolutional] +filters=576 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_5_2_dwise +[convolutional] +groups=576 +filters=576 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=32 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=576 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_5_2_linear +[convolutional] +filters=112 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 1 (4) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_5_2 +[shortcut] +from=-9 +activation=linear + +# conv_5_3_expand +[convolutional] +filters=576 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_5_3_dwise +[convolutional] +groups=576 +filters=576 +size=5 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=32 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=576 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_5_3_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 2 (4) +# conv_6_1_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_1_dwise +[convolutional] +groups=960 +filters=960 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_1_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 3 (4) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_6_1 +[shortcut] +from=-9 +activation=linear + +# conv_6_2_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_2_dwise +[convolutional] +groups=960 +filters=960 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 
+activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_2_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 4 (4) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_6_1 +[shortcut] +from=-9 +activation=linear + +# conv_6_2_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_2_dwise +[convolutional] +groups=960 +filters=960 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_2_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV8 - MBConv6 - 1 (1) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_6_2 +[shortcut] +from=-9 +activation=linear + +# conv_6_3_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_3_dwise +[convolutional] +groups=960 +filters=960 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_3_linear +[convolutional] +filters=320 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV9 - Conv2d 1x1 +# conv_6_4 +[convolutional] +filters=1280 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +########################## + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +activation=leaky +from=-2 + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + + +[yolo] +mask = 3,4,5 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=0 + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[shortcut] +activation=leaky +from=90 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +activation=leaky +from=-3 + +[shortcut] +activation=leaky +from=90 + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + +[yolo] +mask = 1,2,3 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=0 + diff --git a/build/darknet/x64/cfg/yolov3-tiny-prn.cfg b/build/darknet/x64/cfg/yolov3-tiny-prn.cfg new file mode 100644 index 00000000000..109c969cb2a --- /dev/null +++ b/build/darknet/x64/cfg/yolov3-tiny-prn.cfg @@ -0,0 +1,199 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +#batch=64 +#subdivisions=8 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 
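# (burn_in ramps the learning rate up gradually over the first 1000 iterations;
# with policy=steps the rate is then multiplied by the scales values, 0.1 at
# iterations 400000 and 450000 below)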
+max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=16 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=1 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +activation=leaky +from=-3 + +########### + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +activation=leaky +from=-2 + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + + +[yolo] +mask = 3,4,5 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[shortcut] +activation=leaky +from=8 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +activation=leaky +from=-3 + +[shortcut] +activation=leaky +from=8 + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + +[yolo] +mask = 1,2,3 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 diff --git a/cfg/enet-coco.cfg b/cfg/enet-coco.cfg new file mode 100644 index 00000000000..b530ed360b3 --- /dev/null +++ b/cfg/enet-coco.cfg @@ -0,0 +1,1072 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=64 +subdivisions=8 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +### CONV1 - 1 (1) +# conv1 +[convolutional] +filters=32 +size=3 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +### CONV2 - MBConv1 - 1 (1) +# conv2_1_expand +[convolutional] +filters=32 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv2_1_dwise +[convolutional] +groups=32 +filters=32 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=4 (recommended r=16) +[convolutional] +filters=8 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=32 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv2_1_linear +[convolutional] +filters=16 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV3 - MBConv6 - 1 (2) +# conv2_2_expand +[convolutional] +filters=96 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv2_2_dwise +[convolutional] +groups=96 +filters=96 +size=3 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + 
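# (in the squeeze-and-excitation branch that follows, [avgpool] collapses each
# channel to a single value, a 1x1 convolution squeezes the channel count, a
# second 1x1 convolution with logistic activation produces per-channel gates,
# and [scale_channels] from=-4 multiplies those gates back onto the feature map)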
+#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=8 (recommended r=16) +[convolutional] +filters=16 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=96 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv2_2_linear +[convolutional] +filters=24 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV3 - MBConv6 - 2 (2) +# conv3_1_expand +[convolutional] +filters=144 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv3_1_dwise +[convolutional] +groups=144 +filters=144 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=8 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=144 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv3_1_linear +[convolutional] +filters=24 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV4 - MBConv6 - 1 (2) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_3_1 +[shortcut] +from=-9 +activation=linear + +# conv_3_2_expand +[convolutional] +filters=144 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_3_2_dwise +[convolutional] +groups=144 +filters=144 +size=5 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=8 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=144 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_3_2_linear +[convolutional] +filters=40 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV4 - MBConv6 - 2 (2) +# conv_4_1_expand +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_1_dwise +[convolutional] +groups=192 +filters=192 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=16 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=192 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_1_linear +[convolutional] +filters=40 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + + +### CONV5 - MBConv6 - 1 (3) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_4_2 +[shortcut] +from=-9 +activation=linear + +# conv_4_3_expand +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_3_dwise +[convolutional] +groups=192 +filters=192 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=16 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=192 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_3_linear +[convolutional] +filters=80 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV5 - MBConv6 - 2 (3) +# conv_4_4_expand +[convolutional] +filters=384 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_4_dwise +[convolutional] +groups=384 +filters=384 +size=3 +stride=1 +pad=1 +batch_normalize=1 
+activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=24 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=384 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_4_linear +[convolutional] +filters=80 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV5 - MBConv6 - 3 (3) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_4_4 +[shortcut] +from=-9 +activation=linear + +# conv_4_5_expand +[convolutional] +filters=384 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_5_dwise +[convolutional] +groups=384 +filters=384 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=24 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=384 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_5_linear +[convolutional] +filters=80 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV6 - MBConv6 - 1 (3) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_4_6 +[shortcut] +from=-9 +activation=linear + +# conv_4_7_expand +[convolutional] +filters=384 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_4_7_dwise +[convolutional] +groups=384 +filters=384 +size=5 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=24 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=384 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_4_7_linear +[convolutional] +filters=112 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV6 - MBConv6 - 2 (3) +# conv_5_1_expand +[convolutional] +filters=576 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_5_1_dwise +[convolutional] +groups=576 +filters=576 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=32 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=576 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_5_1_linear +[convolutional] +filters=112 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV6 - MBConv6 - 3 (3) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_5_1 +[shortcut] +from=-9 +activation=linear + +# conv_5_2_expand +[convolutional] +filters=576 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_5_2_dwise +[convolutional] +groups=576 +filters=576 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=32 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=576 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_5_2_linear +[convolutional] +filters=112 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 1 (4) +# dropout only before residual connection +[dropout] 
+probability=.0 + +# block_5_2 +[shortcut] +from=-9 +activation=linear + +# conv_5_3_expand +[convolutional] +filters=576 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_5_3_dwise +[convolutional] +groups=576 +filters=576 +size=5 +pad=1 +stride=2 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=32 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=576 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_5_3_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 2 (4) +# conv_6_1_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_1_dwise +[convolutional] +groups=960 +filters=960 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_1_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 3 (4) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_6_1 +[shortcut] +from=-9 +activation=linear + +# conv_6_2_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_2_dwise +[convolutional] +groups=960 +filters=960 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_2_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV7 - MBConv6 - 4 (4) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_6_1 +[shortcut] +from=-9 +activation=linear + +# conv_6_2_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_2_dwise +[convolutional] +groups=960 +filters=960 +size=5 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 +stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_2_linear +[convolutional] +filters=192 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + + +### CONV8 - MBConv6 - 1 (1) +# dropout only before residual connection +[dropout] +probability=.0 + +# block_6_2 +[shortcut] +from=-9 +activation=linear + +# conv_6_3_expand +[convolutional] +filters=960 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +# conv_6_3_dwise +[convolutional] +groups=960 +filters=960 +size=3 +stride=1 +pad=1 +batch_normalize=1 +activation=swish + + +#squeeze-n-excitation +[avgpool] + +# squeeze ratio r=16 (recommended r=16) +[convolutional] +filters=64 +size=1 +stride=1 +activation=swish + +# excitation +[convolutional] +filters=960 +size=1 
+stride=1 +activation=logistic + +# multiply channels +[scale_channels] +from=-4 + + +# conv_6_3_linear +[convolutional] +filters=320 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=linear + + +### CONV9 - Conv2d 1x1 +# conv_6_4 +[convolutional] +filters=1280 +size=1 +stride=1 +pad=0 +batch_normalize=1 +activation=swish + +########################## + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +activation=leaky +from=-2 + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + + +[yolo] +mask = 3,4,5 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=0 + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[shortcut] +activation=leaky +from=90 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +activation=leaky +from=-3 + +[shortcut] +activation=leaky +from=90 + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + +[yolo] +mask = 1,2,3 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=0 + diff --git a/cfg/yolov3-tiny-prn.cfg b/cfg/yolov3-tiny-prn.cfg new file mode 100644 index 00000000000..109c969cb2a --- /dev/null +++ b/cfg/yolov3-tiny-prn.cfg @@ -0,0 +1,199 @@ +[net] +# Testing +batch=1 +subdivisions=1 +# Training +#batch=64 +#subdivisions=8 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=16 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=1 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +activation=leaky +from=-3 + +########### + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +activation=leaky +from=-2 + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + + +[yolo] +mask = 3,4,5 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[shortcut] +activation=leaky +from=8 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + 
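# (partial-residual shortcut: element-wise addition with the output of the layer
# three back; when the two inputs have different channel counts, darknet's
# shortcut sums only the overlapping channels, which is the "partial residual"
# idea behind PRN)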
+[shortcut] +activation=leaky +from=-3 + +[shortcut] +activation=leaky +from=8 + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + +[yolo] +mask = 1,2,3 +anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 +classes=80 +num=6 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 From f6fa4a56d938f4f8c69774d3622e768e7411507d Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Wed, 4 Sep 2019 19:58:36 +0300 Subject: [PATCH 19/86] compile fix --- build/darknet/x64/cfg/yolov3-tiny-prn.cfg | 8 ++++---- cfg/yolov3-tiny-prn.cfg | 8 ++++---- src/shortcut_layer.c | 1 + 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/build/darknet/x64/cfg/yolov3-tiny-prn.cfg b/build/darknet/x64/cfg/yolov3-tiny-prn.cfg index 109c969cb2a..215162e973b 100644 --- a/build/darknet/x64/cfg/yolov3-tiny-prn.cfg +++ b/build/darknet/x64/cfg/yolov3-tiny-prn.cfg @@ -1,10 +1,10 @@ [net] # Testing -batch=1 -subdivisions=1 +#batch=1 +#subdivisions=1 # Training -#batch=64 -#subdivisions=8 +batch=64 +subdivisions=8 width=416 height=416 channels=3 diff --git a/cfg/yolov3-tiny-prn.cfg b/cfg/yolov3-tiny-prn.cfg index 109c969cb2a..215162e973b 100644 --- a/cfg/yolov3-tiny-prn.cfg +++ b/cfg/yolov3-tiny-prn.cfg @@ -1,10 +1,10 @@ [net] # Testing -batch=1 -subdivisions=1 +#batch=1 +#subdivisions=1 # Training -#batch=64 -#subdivisions=8 +batch=64 +subdivisions=8 width=416 height=416 channels=3 diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c index d056a6a0a20..8345858e47d 100644 --- a/src/shortcut_layer.c +++ b/src/shortcut_layer.c @@ -1,4 +1,5 @@ #include "shortcut_layer.h" +#include "convolutional_layer.h" #include "dark_cuda.h" #include "blas.h" #include From e33019e669cbfe3dfc9037664335bfcb6ae8b988 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Thu, 5 Sep 2019 14:47:26 +0300 Subject: [PATCH 20/86] Fixed stride_x and stride_y. Fixed AntiAliasing. Minor fixes. 
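
The key fix here is a swapped pair of assignments in make_convolutional_layer():
the function stored stride_x into l.stride_y and stride_y into l.stride_x, so any
layer configured with asymmetric strides ran with the two axes exchanged. A
minimal before/after sketch (the full change is in the diff below):

    /* before (buggy): the axes were swapped */
    l.stride_y = stride_x;
    l.stride_x = stride_y;

    /* after (fixed): each axis keeps its own stride */
    l.stride_x = stride_x;
    l.stride_y = stride_y;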
--- src/convolutional_layer.c | 4 ++-- src/data.c | 12 ++++++++++-- src/image_opencv.cpp | 13 +++++++++++-- src/network.c | 7 +++++++ src/sam_layer.c | 4 ++-- 5 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 72bb602589e..00b27f6b88f 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -363,8 +363,8 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.batch = batch; l.steps = steps; l.stride = stride_x; - l.stride_y = stride_x; - l.stride_x = stride_y; + l.stride_x = stride_x; + l.stride_y = stride_y; l.dilation = dilation; l.size = size; l.pad = padding; diff --git a/src/data.c b/src/data.c index 6dd3274ae7b..4206871533f 100644 --- a/src/data.c +++ b/src/data.c @@ -343,7 +343,7 @@ void fill_truth_region(char *path, float *truth, int classes, int num_boxes, int free(boxes); } -void fill_truth_detection(const char *path, int num_boxes, float *truth, int classes, int flip, float dx, float dy, float sx, float sy, +int fill_truth_detection(const char *path, int num_boxes, float *truth, int classes, int flip, float dx, float dy, float sx, float sy, int net_w, int net_h) { char labelpath[4096]; @@ -352,6 +352,7 @@ void fill_truth_detection(const char *path, int num_boxes, float *truth, int cla int count = 0; int i; box_label *boxes = read_boxes(labelpath, &count); + int min_w_h = 0; float lowest_w = 1.F / net_w; float lowest_h = 1.F / net_h; randomize_boxes(boxes, count); @@ -424,8 +425,13 @@ void fill_truth_detection(const char *path, int num_boxes, float *truth, int cla truth[(i-sub)*5+2] = w; truth[(i-sub)*5+3] = h; truth[(i-sub)*5+4] = id; + + if (min_w_h == 0) min_w_h = w*net_w; + if (min_w_h > w*net_w) min_w_h = w*net_w; + if (min_w_h > h*net_h) min_w_h = h*net_h; } free(boxes); + return min_w_h; } @@ -914,7 +920,9 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo float dy = ((float)ptop / oh) / sy; - fill_truth_detection(filename, boxes, truth, classes, flip, dx, dy, 1. / sx, 1. / sy, w, h); + int min_w_h = fill_truth_detection(filename, boxes, truth, classes, flip, dx, dy, 1. / sx, 1. 
/ sy, w, h); + + if (min_w_h < blur*4) blur = 0; // disable blur if one of the objects is too small image ai = image_data_augmentation(src, w, h, pleft, ptop, swidth, sheight, flip, dhue, dsat, dexp, blur, boxes, d.y.vals[i]); diff --git a/src/image_opencv.cpp b/src/image_opencv.cpp index 91d07bd7a8b..b6cb65f9f2a 100644 --- a/src/image_opencv.cpp +++ b/src/image_opencv.cpp @@ -1206,11 +1206,20 @@ image image_data_augmentation(mat_cv* mat, int w, int h, if (blur) { cv::Mat dst(sized.size(), sized.type()); if(blur == 1) cv::GaussianBlur(sized, dst, cv::Size(31, 31), 0); - else cv::GaussianBlur(sized, dst, cv::Size((blur / 2) * 2 + 1, (blur / 2) * 2 + 1), 0); - cv::Rect img_rect(0, 0, sized.cols, sized.rows); + else { + cv::Size kernel_size = cv::Size((blur / 2) * 2 + 1, (blur / 2) * 2 + 1); + cv::GaussianBlur(sized, dst, kernel_size, 0); + + // sharpen + //cv::Mat img_tmp; + //cv::GaussianBlur(dst, img_tmp, cv::Size(), 3); + //cv::addWeighted(dst, 1.5, img_tmp, -0.5, 0, img_tmp); + //dst = img_tmp; + } //std::cout << " blur num_boxes = " << num_boxes << std::endl; if (blur == 1) { + cv::Rect img_rect(0, 0, sized.cols, sized.rows); int t; for (t = 0; t < num_boxes; ++t) { box b = float_to_box_stride(truth + t*(4 + 1), 1); diff --git a/src/network.c b/src/network.c index 9bdab8adce1..82dc4d53978 100644 --- a/src/network.c +++ b/src/network.c @@ -1150,6 +1150,13 @@ void copy_weights_net(network net_train, network *net_map) copy_cudnn_descriptors(tmp_self_layer, net_map->layers[k].self_layer); copy_cudnn_descriptors(tmp_output_layer, net_map->layers[k].output_layer); } + else if(l->input_layer) // for AntiAliasing + { + layer tmp_input_layer; + copy_cudnn_descriptors(*net_map->layers[k].input_layer, &tmp_input_layer); + net_map->layers[k].input_layer = net_train.layers[k].input_layer; + copy_cudnn_descriptors(tmp_input_layer, net_map->layers[k].input_layer); + } net_map->layers[k].batch = 1; net_map->layers[k].steps = 1; } diff --git a/src/sam_layer.c b/src/sam_layer.c index da28e319775..70e55052883 100644 --- a/src/sam_layer.c +++ b/src/sam_layer.c @@ -60,7 +60,7 @@ void resize_sam_layer(layer *l, int w, int h) void forward_sam_layer(const layer l, network_state state) { int size = l.batch * l.out_c * l.out_w * l.out_h; - int channel_size = 1; + //int channel_size = 1; float *from_output = state.net.layers[l.index].output; int i; @@ -79,7 +79,7 @@ void backward_sam_layer(const layer l, network_state state) //scale_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, state.net.layers[l.index].delta); int size = l.batch * l.out_c * l.out_w * l.out_h; - int channel_size = 1; + //int channel_size = 1; float *from_output = state.net.layers[l.index].output; float *from_delta = state.net.layers[l.index].delta; From b0b1584a069c12a214791c339bbff9dc3904283e Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Fri, 6 Sep 2019 01:53:36 +0300 Subject: [PATCH 21/86] Minor fix --- src/convolutional_layer.c | 2 +- src/maxpool_layer.c | 8 +++++--- src/sam_layer.c | 2 +- src/scale_channels_layer.c | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 00b27f6b88f..10a45bab5d5 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -574,7 +574,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, if (groups > 1) fprintf(stderr, "%5d/%4d ", n, groups); else fprintf(stderr, "%5d ", n); - if (stride_x != stride_y) fprintf(stderr, "%2d x%2d/%2dx%2d ", size, size, stride_x, stride_y); + if 
(stride_x != stride_y) fprintf(stderr, "%2dx%2d/%2dx%2d ", size, size, stride_x, stride_y); else { if (dilation > 1) fprintf(stderr, "%2d x%2d/%2d(%1d)", size, size, stride_x, dilation); else fprintf(stderr, "%2d x%2d/%2d ", size, size, stride_x); diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c index 27d338603d1..2f290497dcb 100644 --- a/src/maxpool_layer.c +++ b/src/maxpool_layer.c @@ -98,10 +98,12 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s #endif // GPU l.bflops = (l.size*l.size*l.c * l.out_h*l.out_w) / 1000000000.; - if(stride_x == stride_y) - fprintf(stderr, "max %d x %d/%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); + if (maxpool_depth) + fprintf(stderr, "max-depth %2dx%2d/%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); + else if(stride_x == stride_y) + fprintf(stderr, "max %2dx%2d/%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); else - fprintf(stderr, "max %d x %d/%2dx%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, stride_y, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); + fprintf(stderr, "max %2dx%2d/%2dx%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, stride_y, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); if (l.antialiasing) { printf("AA: "); diff --git a/src/sam_layer.c b/src/sam_layer.c index 70e55052883..e95290d7867 100644 --- a/src/sam_layer.c +++ b/src/sam_layer.c @@ -18,7 +18,7 @@ layer make_sam_layer(int batch, int index, int w, int h, int c, int w2, int h2, l.out_h = h2; l.out_c = c2; assert(l.out_c == l.c); - assert(l.w == l.out_w & l.h == l.out_h); + assert(l.w == l.out_w && l.h == l.out_h); l.outputs = l.out_w*l.out_h*l.out_c; l.inputs = l.outputs; diff --git a/src/scale_channels_layer.c b/src/scale_channels_layer.c index 7322570116e..80be5361126 100644 --- a/src/scale_channels_layer.c +++ b/src/scale_channels_layer.c @@ -13,7 +13,7 @@ layer make_scale_channels_layer(int batch, int index, int w, int h, int c, int w l.w = w; l.h = h; l.c = c; - assert(w == 1 & h == 1); + assert(w == 1 && h == 1); l.out_w = w2; l.out_h = h2; From 1c71f001531a5df0637903117c6568725d7a66b3 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 7 Sep 2019 15:15:19 +0300 Subject: [PATCH 22/86] Fixed shortcut_layer for antialiasing --- src/data.c | 3 +++ src/image_opencv.cpp | 7 +++++-- src/parser.c | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/data.c b/src/data.c index 4206871533f..d8f0c9e81c0 100644 --- a/src/data.c +++ b/src/data.c @@ -959,6 +959,9 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo save_image(tmp_ai, buff); if (show_imgs == 1) { + //char buff_src[1000]; + //sprintf(buff_src, "src_%d_%d_%s_%d", random_index, i, basecfg((char*)filename), random_gen()); + //show_image_mat(src, buff_src); show_image(tmp_ai, buff); wait_until_press_key_cv(); } diff --git a/src/image_opencv.cpp b/src/image_opencv.cpp index b6cb65f9f2a..cc50c0719a1 100644 --- a/src/image_opencv.cpp +++ b/src/image_opencv.cpp @@ -1207,8 +1207,11 @@ image image_data_augmentation(mat_cv* mat, int w, int h, cv::Mat dst(sized.size(), sized.type()); if(blur == 1) cv::GaussianBlur(sized, dst, cv::Size(31, 31), 0); else { - cv::Size kernel_size = cv::Size((blur / 2) * 2 + 1, (blur / 2) * 2 + 1); - cv::GaussianBlur(sized, dst, kernel_size, 0); + int ksize = (blur / 2) * 2 + 1; 
+ cv::Size kernel_size = cv::Size(ksize, ksize); + //cv::GaussianBlur(sized, dst, kernel_size, 0); + //cv::medianBlur(sized, dst, ksize); + cv::bilateralFilter(sized, dst, ksize, 75, 75); // sharpen //cv::Mat img_tmp; diff --git a/src/parser.c b/src/parser.c index 4b56dfc4c10..829134d1131 100644 --- a/src/parser.c +++ b/src/parser.c @@ -608,6 +608,7 @@ layer parse_shortcut(list *options, size_params params, network net) int batch = params.batch; layer from = net.layers[index]; + if (from.antialiasing) from = *from.input_layer; layer s = make_shortcut_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c, assisted_excitation); From fa74f691cb624d916503af309de13300638d097f Mon Sep 17 00:00:00 2001 From: "gilberto.plaza" Date: Wed, 18 Sep 2019 13:49:54 +0200 Subject: [PATCH 23/86] Removed useless snippet that only breaks if batch processing is enabled --- src/yolo_layer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/yolo_layer.c b/src/yolo_layer.c index a76b5efb082..20ee8e34391 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -492,7 +492,9 @@ int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, //printf("\n l.batch = %d, l.w = %d, l.h = %d, l.n = %d \n", l.batch, l.w, l.h, l.n); int i,j,n; float *predictions = l.output; - if (l.batch == 2) avg_flipped_yolo(l); + // This snippet below is not necessary + // Need to comment it in order to batch processing >= 2 images + //if (l.batch == 2) avg_flipped_yolo(l); int count = 0; for (i = 0; i < l.w*l.h; ++i){ int row = i / l.w; From 6f4d93bb9f59a805e6884789c97623e98806450d Mon Sep 17 00:00:00 2001 From: Alexey Date: Sun, 22 Sep 2019 17:38:42 +0300 Subject: [PATCH 24/86] Update readme.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b79d9786e1c..36019663e86 100644 --- a/README.md +++ b/README.md @@ -569,7 +569,7 @@ Example of custom object detection: `darknet.exe detector test data/obj.data yol * each: `model of object, side, illimination, scale, each 30 grad` of the turn and inclination angles - these are *different objects* from an internal perspective of the neural network. So the more *different objects* you want to detect, the more complex network model should be used. - * recalculate anchors for your dataset for `width` and `height` from cfg-file: + * Only if you are an **expert** in neural detection networks - recalculate anchors for your dataset for `width` and `height` from cfg-file: `darknet.exe detector calc_anchors data/obj.data -num_of_clusters 9 -width 416 -height 416` then set the same 9 `anchors` in each of 3 `[yolo]`-layers in your cfg-file. But you should change indexes of anchors `masks=` for each [yolo]-layer, so that 1st-[yolo]-layer has anchors larger than 60x60, 2nd larger than 30x30, 3rd remaining. Also you should change the `filters=(classes + 5)*` before each [yolo]-layer. If many of the calculated anchors do not fit under the appropriate layers - then just try using all the default anchors. 
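
As a concrete illustration of the bookkeeping described in the note above - the 9 recalculated anchors shared by all 3 `[yolo]`-layers, the per-layer `mask` indexes, and `filters=(classes + 5)*3` before each `[yolo]`-layer - here is a sketch for `classes=2` (the anchors shown are the yolov3 defaults; all other values are illustrative only):

```
# hypothetical fragment for classes=2; anchors are the yolov3 defaults,
# the remaining values are illustrative only
[convolutional]
size=1
stride=1
pad=1
# filters = (classes + 5)*3 = (2 + 5)*3
filters=21
activation=linear

[yolo]
# the 1st [yolo]-layer takes the 3 largest of the 9 anchors (larger than 60x60)
mask = 6,7,8
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=2
num=9
```

The 2nd and 3rd `[yolo]`-layers repeat the same filters/mask arithmetic, only with `mask = 3,4,5` and `mask = 0,1,2` respectively.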
From 05545b260784cb5ce9b27f85462c4a31137b684b Mon Sep 17 00:00:00 2001
From: Alexey
Date: Tue, 24 Sep 2019 15:31:28 +0300
Subject: [PATCH 25/86] Create FUNDING.yml

---
 .github/FUNDING.yml | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 .github/FUNDING.yml

diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 00000000000..0c5ae2e2b8f
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,12 @@
+# These are supported funding model platforms
+
+github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+custom: ['https://paypal.me/alexeyab84', 'https://blockchain.coinmarketcap.com/address/bitcoin/36La9T7DoLVMrUQzm6rBDGsxutyvDzbHnp', 'https://etherscan.io/address/0x193d56BE3C65e3Fb8f48c291B17C0702e211A588#', 'https://explorer.zcha.in/accounts/t1PzwJ28Prb7Nk8fgfT3RXCr6Xtw54tgjoy'] # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

From 2fa539779f4e12e264b9e1b2fc463ac7edec165c Mon Sep 17 00:00:00 2001
From: Alexey
Date: Fri, 27 Sep 2019 22:35:56 +0300
Subject: [PATCH 26/86] Readme.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 36019663e86..ca3c2d9f37d 100644
--- a/README.md
+++ b/README.md
@@ -540,6 +540,8 @@ Example of custom object detection: `darknet.exe detector test data/obj.data yol
 * desirable that your training dataset include images with non-labeled objects that you do not want to detect - negative samples without bounded box (empty `.txt` files) - use as many images of negative samples as there are images with objects

+ * What is the best way to mark objects: label only the visible part of the object, or label the visible and overlapped part of the object, or label a little more than the entire object (with a little gap)? Mark objects the way you would like them to be detected.
+
 * for training with a large number of objects in each image, add the parameter `max=200` or higher value in the last `[yolo]`-layer or `[region]`-layer in your cfg-file (the global maximum number of objects that can be detected by YoloV3 is `0.0615234375*(width*height)` where width and height are parameters from `[net]` section in cfg-file)

 * for training for small objects (smaller than 16x16 after the image is resized to 416x416) - set `layers = -1, 11` instead of https://github.com/AlexeyAB/darknet/blob/6390a5a2ab61a0bdf6f1a9a6b4a739c16b36e0d7/cfg/yolov3.cfg#L720

From ff0b739e4357afba477c25f37fba08410ec86b04 Mon Sep 17 00:00:00 2001
From: Alexey
Date: Thu, 3 Oct 2019 12:12:00 +0300
Subject: [PATCH 27/86] Update Readme.md

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ca3c2d9f37d..29e5174451e 100644
--- a/README.md
+++ b/README.md
@@ -451,11 +451,11 @@ Usually sufficient 2000 iterations for each class(object), but not less than 400
 > Region Avg IOU: 0.798363, Class: 0.893232, Obj: 0.700808, No Obj: 0.004567, Avg Recall: 1.000000, count: 8
 > Region Avg IOU: 0.800677, Class: 0.892181, Obj: 0.701590, No Obj: 0.004574, Avg Recall: 1.000000, count: 8
 >
- > **9002**: 0.211667, **0.060730 avg**, 0.001000 rate, 3.868000 seconds, 576128 images
+ > **9002**: 0.211667, **0.60730 avg**, 0.001000 rate, 3.868000 seconds, 576128 images
 > Loaded: 0.000000 seconds

 * **9002** - iteration number (number of batch)
- * **0.060730 avg** - average loss (error) - **the lower, the better**
+ * **0.60730 avg** - average loss (error) - **the lower, the better**

 When you see that average loss **0.xxxxxx avg** no longer decreases at many iterations then you should stop training. The final average loss can be from `0.05` (for a small model and easy dataset) to `3.0` (for a big model and a difficult dataset).

From 7181c7435f6ccc99b9c9340eccb5bfd16826804e Mon Sep 17 00:00:00 2001
From: Alexey
Date: Thu, 3 Oct 2019 16:40:08 +0300
Subject: [PATCH 28/86] Update readme.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 29e5174451e..fa44665be17 100644
--- a/README.md
+++ b/README.md
@@ -333,7 +333,7 @@ Training Yolo v3:
 * change line batch to [`batch=64`](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L3)
 * change line subdivisions to [`subdivisions=8`](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L4)
- * change line max_batches to (`classes*2000`), f.e. [`max_batches=6000`](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L20) if you train for 3 classes
+ * change line max_batches to (`classes*2000` but not less than `4000`), f.e. [`max_batches=6000`](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L20) if you train for 3 classes
 * change line steps to 80% and 90% of max_batches, f.e. [`steps=4800,5400`](https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L22)
 * change line `classes=80` to your number of objects in each of 3 `[yolo]`-layers:
 * https://github.com/AlexeyAB/darknet/blob/0039fd26786ab5f71d5af725fc18b3f521e7acfd/cfg/yolov3.cfg#L610

From e24c96dc8bfe151c9c39684c34345f7981b3a08e Mon Sep 17 00:00:00 2001
From: Alexey
Date: Thu, 3 Oct 2019 17:28:06 +0300
Subject: [PATCH 29/86] Update Readme.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index fa44665be17..8027726284f 100644
--- a/README.md
+++ b/README.md
@@ -536,6 +536,8 @@ Example of custom object detection: `darknet.exe detector test data/obj.data yol
 * check that each object that you want to detect is labeled in your dataset - not a single object in your dataset should be left without a label. In most training issues there are wrong labels in your dataset (got labels by using some conversion script, marked with a third-party tool, ...). Always check your dataset by using: https://github.com/AlexeyAB/Yolo_mark

+ * my Loss is very high and mAP is very low - is training going wrong? Run training with the ` -show_imgs` flag at the end of the training command - do you see correct bounded boxes of objects (in windows or in files `aug_...jpg`)? If not - your training dataset is wrong.
+
 * for each object which you want to detect - there must be at least 1 similar object in the Training dataset with about the same: shape, side of object, relative size, angle of rotation, tilt, illumination. So it is desirable that your training dataset includes images with objects at different: scales, rotations, lightings, from different sides, on different backgrounds - you should preferably have 2000 different images for each class or more, and you should train `2000*classes` iterations or more

 * desirable that your training dataset include images with non-labeled objects that you do not want to detect - negative samples without bounded box (empty `.txt` files) - use as many images of negative samples as there are images with objects

From 0823d04247573d5371733ce6755ba143e1591108 Mon Sep 17 00:00:00 2001
From: acxz <17132214+acxz@users.noreply.github.com>
Date: Fri, 18 Oct 2019 19:24:49 -0400
Subject: [PATCH 30/86] Add readability changes

Make CMake-GUI install more visible than vcpkg install
---
 README.md | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 2f85a17c15e..8f14d167ba9 100644
--- a/README.md
+++ b/README.md
@@ -22,8 +22,8 @@ More details: http://pjreddie.com/darknet/yolo/
     * [Using cmake](#how-to-compile-on-linux-using-cmake)
     * [Using make](#how-to-compile-on-linux-using-make)
 3. How to compile on Windows
+    * [Using CMake-GUI](#how-to-compile-on-windows-using-cmake-gui)
     * [Using vcpkg](#how-to-compile-on-windows-using-vcpkg)
-    * [Using Cmake-GUI](#how-to-compile-on-windows-using-cmake-gui)
     * [Legacy way](#how-to-compile-on-windows-legacy-way)
 4. [How to train (Pascal VOC Data)](#how-to-train-pascal-voc-data)
 5. [How to train with multi-GPU:](#how-to-train-with-multi-gpu)
@@ -159,7 +159,7 @@ The `CMakeLists.txt` will attempt to find installed optional dependencies like
 CUDA, cudnn, ZED and build against those. It will also create a shared object
 library file to use `darknet` for code development.
 
-Inside the cloned repository:
+Do this inside the cloned repository:

 ```
 mkdir build-release
 cd build-release
 cmake ..
 make
 make install
 ```

@@ -187,9 +187,28 @@ Before make, you can set such options in the `Makefile`: [link](https://github.c
 To run Darknet on Linux use examples from this article, just use `./darknet` instead of `darknet.exe`, i.e. use this command: `./darknet detector test ./cfg/coco.data ./cfg/yolov3.cfg ./yolov3.weights`

+### How to compile on Windows (using `CMake-GUI`)
+
+This is the recommended approach to build Darknet on Windows if you have already
+installed Visual Studio 2015/2017/2019, CUDA > 10.0, cuDNN > 7.0, and
+OpenCV > 2.4.
+
+Use `CMake-GUI` as shown in this [**IMAGE**](https://user-images.githubusercontent.com/4096485/55107892-6becf380-50e3-11e9-9a0a-556a943c429a.png):
+
+1. Configure
+2. Optional platform for generator (Set: x64)
+3. Finish
+4. Generate
+5. Open Project
+6. Set: x64 & Release
+7. Build
+8. Build solution
+
 ### How to compile on Windows (using `vcpkg`)

-If you have already installed Visual Studio 2015/2017/2019, CUDA > 10.0, cuDNN > 7.0, OpenCV > 2.4, then compile Darknet by using `C:\Program Files\CMake\bin\cmake-gui.exe` as on this [**IMAGE**](https://user-images.githubusercontent.com/4096485/55107892-6becf380-50e3-11e9-9a0a-556a943c429a.png): Configure -> Optional platform for generator (Set: x64) -> Finish -> Generate -> Open Project -> x64 & Release -> Build -> Build solution
+If you have already installed Visual Studio 2015/2017/2019, CUDA > 10.0,
+cuDNN > 7.0, OpenCV > 2.4, then to compile Darknet it is recommended to use
+[CMake-GUI](#how-to-compile-on-windows-using-cmake-gui).

 Otherwise, follow these steps:

@@ -216,19 +235,6 @@ PS Code\vcpkg> .\vcpkg install pthreads opencv[ffmpeg] #replace with ope
 9. Open Powershell, go to the `darknet` folder and build with the command `.\build.ps1`. If you want to use Visual Studio, you will find two custom solutions created for you by CMake after the build, one in `build_win_debug` and the other in `build_win_release`, containing all the appropriate config flags for your system.

-### How to compile on Windows (using `Cmake-GUI`)
-
-Using `Cmake-GUI` as shown here on this [**IMAGE**](https://user-images.githubusercontent.com/4096485/55107892-6becf380-50e3-11e9-9a0a-556a943c429a.png):
-
-1. Configure
-2. Optional platform for generator (Set: x64)
-3. Finish
-4. Generate
-5. Open Project
-6. x64 & Release
-7. Build
-8. Build solution
-
 ### How to compile on Windows (legacy way)

 1. If you have **CUDA 10.0, cuDNN 7.4 and OpenCV 3.x** (with paths: `C:\opencv_3.0\opencv\build\include` & `C:\opencv_3.0\opencv\build\x64\vc14\lib`), then open `build\darknet\darknet.sln`, set **x64** and **Release** https://hsto.org/webt/uh/fk/-e/uhfk-eb0q-hwd9hsxhrikbokd6u.jpeg and do the: Build -> Build darknet. 
Also add Windows system variable `CUDNN` with path to CUDNN: https://user-images.githubusercontent.com/4096485/53249764-019ef880-36ca-11e9-8ffe-d9cf47e7e462.jpg From 2eb68d5177d43d6927753631a0d4fad5446cab17 Mon Sep 17 00:00:00 2001 From: Alexey Date: Sat, 19 Oct 2019 15:09:15 +0300 Subject: [PATCH 31/86] Update Readme.md --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 2aed1180a43..9042a6f99e7 100644 --- a/README.md +++ b/README.md @@ -625,6 +625,13 @@ Here you can find repository with GUI-software for marking bounded boxes of obje With example of: `train.txt`, `obj.names`, `obj.data`, `yolo-obj.cfg`, `air`1-6`.txt`, `bird`1-4`.txt` for 2 classes of objects (air, bird) and `train_obj.cmd` with example how to train this image-set with Yolo v2 & v3 +Different tools for marking objects in images: +1. in C++: https://github.com/AlexeyAB/Yolo_mark +2. in Python: https://github.com/tzutalin/labelImg +3. in Python: https://github.com/Cartucho/OpenLabeling +4. in C++: https://www.ccoderun.ca/darkmark/ + + ## Using Yolo9000 Simultaneous detection and classification of 9000 objects: `darknet.exe detector test cfg/combine9k.data cfg/yolo9000.cfg yolo9000.weights data/dog.jpg` From e6486ab594e877e0b870eab6788de9e888c35840 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Mon, 21 Oct 2019 15:33:01 +0300 Subject: [PATCH 32/86] Use ignore_thresh only if class_id matched. Temporary changed Assisted_Excitation (reduces background activations rather than enhancing objects activations). Added antialiasiong=2 for 2x2. --- src/convolutional_kernels.cu | 40 +++++++++++++++++---- src/convolutional_layer.c | 70 +++++++++++++++++++++--------------- src/detector.c | 3 +- src/http_stream.cpp | 2 +- src/maxpool_layer.c | 69 +++++++++++++++++++++-------------- src/yolo_layer.c | 32 ++++++++++++++++- 6 files changed, 151 insertions(+), 65 deletions(-) diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index d766c9cf7cb..edfb03b81c7 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -948,6 +948,30 @@ void assisted_activation_gpu(float alpha, float *output, float *gt_gpu, float *a assisted_activation_kernel << > > (alpha, output, gt_gpu, a_avg_gpu, size, channels, batches); } + +__global__ void assisted_activation2_kernel(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches) +{ + int i = blockIdx.x * blockDim.x + threadIdx.x; + int xy = i % size; + int b = i / size; + float beta = 1 - alpha; + + if (b < batches) { + for (int c = 0; c < channels; ++c) { + if(gt_gpu[i] == 0) + output[xy + size*(c + channels*b)] *= beta; + + } + } +} + +void assisted_activation2_gpu(float alpha, float *output, float *gt_gpu, float *a_avg_gpu, int size, int channels, int batches) +{ + const int num_blocks = get_number_of_blocks(size*batches, BLOCK); + + assisted_activation2_kernel << > > (alpha, output, gt_gpu, a_avg_gpu, size, channels, batches); +} + void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) { const int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); @@ -958,12 +982,13 @@ void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) // calculate alpha //const float alpha = (1 + cos(3.141592 * iteration_num)) / (2 * state.net.max_batches); //const float alpha = (1 + cos(3.141592 * epoch)) / (2 * state.net.max_batches); - //const float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)) / 2; - float 
alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)); + float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)) / 2; + //float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)); if (l.assisted_excitation > 1) { - if (iteration_num > l.assisted_excitation) alpha = 0; - else alpha = (1 + cos(3.141592 * iteration_num / l.assisted_excitation)); + if (iteration_num < state.net.burn_in) alpha = 0; + else if (iteration_num > l.assisted_excitation) alpha = 0; + else alpha = (1 + cos(3.141592 * iteration_num / l.assisted_excitation)) / 2; } //printf("\n epoch = %f, alpha = %f, seen = %d, max_batches = %d, train_images_num = %d \n", @@ -1017,7 +1042,8 @@ void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) //CHECK_CUDA(cudaPeekAtLastError()); // calc new output - assisted_activation_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); + assisted_activation2_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); + //assisted_activation_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); //cudaStreamSynchronize(get_cuda_stream()); //CHECK_CUDA(cudaPeekAtLastError()); @@ -1070,13 +1096,13 @@ void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) printf(" Assisted Excitation alpha = %f \n", alpha); image img = float_to_image(l.out_w, l.out_h, 1, >[l.out_w*l.out_h*b]); char buff[100]; - sprintf(buff, "a_excitation_%d", b); + sprintf(buff, "a_excitation_gt_%d", b); show_image_cv(img, buff); //image img2 = float_to_image(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]); image img2 = float_to_image_scaled(l.out_w, l.out_h, 1, &l.output[l.out_w*l.out_h*l.out_c*b]); char buff2[100]; - sprintf(buff2, "a_excitation_act_%d", b); + sprintf(buff2, "a_excitation_output_%d", b); show_image_cv(img2, buff2); /* diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 10a45bab5d5..8bce5aa67d9 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -587,36 +587,50 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, if (l.antialiasing) { printf("AA: "); l.input_layer = (layer*)calloc(1, sizeof(layer)); - const int blur_size = 3; - *(l.input_layer) = make_convolutional_layer(batch, steps, out_h, out_w, n, n, n, blur_size, blur_stride_x, blur_stride_y, 1, blur_size / 2, LINEAR, 0, 0, 0, 0, 0, index, 0, NULL, 0); + int blur_size = 3; + int blur_pad = blur_size / 2; + if (l.antialiasing == 2) { + blur_size = 2; + blur_pad = 0; + } + *(l.input_layer) = make_convolutional_layer(batch, steps, out_h, out_w, n, n, n, blur_size, blur_stride_x, blur_stride_y, 1, blur_pad, LINEAR, 0, 0, 0, 0, 0, index, 0, NULL, 0); const int blur_nweights = n * blur_size * blur_size; // (n / n) * n * blur_size * blur_size; int i; - for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) { - /* - l.input_layer->weights[i + 0] = 0; - l.input_layer->weights[i + 1] = 0; - l.input_layer->weights[i + 2] = 0; - - l.input_layer->weights[i + 3] = 0; - l.input_layer->weights[i + 4] = 1; - l.input_layer->weights[i + 5] = 0; - - l.input_layer->weights[i + 6] = 0; - l.input_layer->weights[i + 7] = 0; - l.input_layer->weights[i + 8] = 0; - */ - l.input_layer->weights[i + 0] = 1 / 16.f; - l.input_layer->weights[i + 1] = 2 / 16.f; - l.input_layer->weights[i + 2] = 1 / 16.f; - - l.input_layer->weights[i + 3] = 2 / 16.f; - l.input_layer->weights[i + 4] = 4 / 16.f; - l.input_layer->weights[i 
+ 5] = 2 / 16.f; - - l.input_layer->weights[i + 6] = 1 / 16.f; - l.input_layer->weights[i + 7] = 2 / 16.f; - l.input_layer->weights[i + 8] = 1 / 16.f; - + if (blur_size == 2) { + for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) { + l.input_layer->weights[i + 0] = 1 / 4.f; + l.input_layer->weights[i + 1] = 1 / 4.f; + l.input_layer->weights[i + 2] = 1 / 4.f; + l.input_layer->weights[i + 3] = 1 / 4.f; + } + } + else { + for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) { + /* + l.input_layer->weights[i + 0] = 0; + l.input_layer->weights[i + 1] = 0; + l.input_layer->weights[i + 2] = 0; + + l.input_layer->weights[i + 3] = 0; + l.input_layer->weights[i + 4] = 1; + l.input_layer->weights[i + 5] = 0; + + l.input_layer->weights[i + 6] = 0; + l.input_layer->weights[i + 7] = 0; + l.input_layer->weights[i + 8] = 0; + */ + l.input_layer->weights[i + 0] = 1 / 16.f; + l.input_layer->weights[i + 1] = 2 / 16.f; + l.input_layer->weights[i + 2] = 1 / 16.f; + + l.input_layer->weights[i + 3] = 2 / 16.f; + l.input_layer->weights[i + 4] = 4 / 16.f; + l.input_layer->weights[i + 5] = 2 / 16.f; + + l.input_layer->weights[i + 6] = 1 / 16.f; + l.input_layer->weights[i + 7] = 2 / 16.f; + l.input_layer->weights[i + 8] = 1 / 16.f; + } } for (i = 0; i < n; ++i) l.input_layer->biases[i] = 0; #ifdef GPU diff --git a/src/detector.c b/src/detector.c index 689ace6da83..efe5571aa8d 100644 --- a/src/detector.c +++ b/src/detector.c @@ -798,7 +798,7 @@ float validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, floa replace_image_to_label(path, labelpath); int num_labels = 0; box_label *truth = read_boxes(labelpath, &num_labels); - int i, j; + int j; for (j = 0; j < num_labels; ++j) { truth_classes_count[truth[j].id]++; } @@ -818,6 +818,7 @@ float validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, floa const int checkpoint_detections_count = detections_count; + int i; for (i = 0; i < nboxes; ++i) { int class_id; diff --git a/src/http_stream.cpp b/src/http_stream.cpp index 3ce2a212bca..af2e6730c74 100644 --- a/src/http_stream.cpp +++ b/src/http_stream.cpp @@ -48,7 +48,7 @@ static int close_socket(SOCKET s) { cerr << "Close socket: out = " << close_output << ", in = " << close_input << " \n"; return result; } -#else // nix +#else // _WIN32 - else: nix #include "darkunistd.h" #include #include diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c index 2f290497dcb..dca9c1b15f7 100644 --- a/src/maxpool_layer.c +++ b/src/maxpool_layer.c @@ -108,35 +108,50 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s if (l.antialiasing) { printf("AA: "); l.input_layer = (layer*)calloc(1, sizeof(layer)); - const int blur_size = 3; - *(l.input_layer) = make_convolutional_layer(batch, 1, l.out_h, l.out_w, l.out_c, l.out_c, l.out_c, blur_size, blur_stride_x, blur_stride_y, 1, blur_size / 2, LINEAR, 0, 0, 0, 0, 0, 1, 0, NULL, 0); + int blur_size = 3; + int blur_pad = blur_size / 2; + if (l.antialiasing == 2) { + blur_size = 2; + blur_pad = 0; + } + *(l.input_layer) = make_convolutional_layer(batch, 1, l.out_h, l.out_w, l.out_c, l.out_c, l.out_c, blur_size, blur_stride_x, blur_stride_y, 1, blur_pad, LINEAR, 0, 0, 0, 0, 0, 1, 0, NULL, 0); const int blur_nweights = l.out_c * blur_size * blur_size; // (n / n) * n * blur_size * blur_size; int i; - for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) { - /* - l.input_layer->weights[i + 0] = 0; - l.input_layer->weights[i + 1] = 0; - l.input_layer->weights[i + 2] = 0; - - l.input_layer->weights[i + 3] = 0; - 
l.input_layer->weights[i + 4] = 1; - l.input_layer->weights[i + 5] = 0; - - l.input_layer->weights[i + 6] = 0; - l.input_layer->weights[i + 7] = 0; - l.input_layer->weights[i + 8] = 0; - */ - l.input_layer->weights[i + 0] = 1 / 16.f; - l.input_layer->weights[i + 1] = 2 / 16.f; - l.input_layer->weights[i + 2] = 1 / 16.f; - - l.input_layer->weights[i + 3] = 2 / 16.f; - l.input_layer->weights[i + 4] = 4 / 16.f; - l.input_layer->weights[i + 5] = 2 / 16.f; - - l.input_layer->weights[i + 6] = 1 / 16.f; - l.input_layer->weights[i + 7] = 2 / 16.f; - l.input_layer->weights[i + 8] = 1 / 16.f; + if (blur_size == 2) { + for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) { + l.input_layer->weights[i + 0] = 1 / 4.f; + l.input_layer->weights[i + 1] = 1 / 4.f; + l.input_layer->weights[i + 2] = 1 / 4.f; + l.input_layer->weights[i + 3] = 1 / 4.f; + } + } + else { + for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) { + /* + l.input_layer->weights[i + 0] = 0; + l.input_layer->weights[i + 1] = 0; + l.input_layer->weights[i + 2] = 0; + + l.input_layer->weights[i + 3] = 0; + l.input_layer->weights[i + 4] = 1; + l.input_layer->weights[i + 5] = 0; + + l.input_layer->weights[i + 6] = 0; + l.input_layer->weights[i + 7] = 0; + l.input_layer->weights[i + 8] = 0; + */ + l.input_layer->weights[i + 0] = 1 / 16.f; + l.input_layer->weights[i + 1] = 2 / 16.f; + l.input_layer->weights[i + 2] = 1 / 16.f; + + l.input_layer->weights[i + 3] = 2 / 16.f; + l.input_layer->weights[i + 4] = 4 / 16.f; + l.input_layer->weights[i + 5] = 2 / 16.f; + + l.input_layer->weights[i + 6] = 1 / 16.f; + l.input_layer->weights[i + 7] = 2 / 16.f; + l.input_layer->weights[i + 8] = 1 / 16.f; + } } for (i = 0; i < l.out_c; ++i) l.input_layer->biases[i] = 0; #ifdef GPU diff --git a/src/yolo_layer.c b/src/yolo_layer.c index 20ee8e34391..2006f4b8047 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -128,6 +128,26 @@ box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw return b; } + +int get_yolo_class(float *output, int classes, int class_index, int stride, float objectness) +{ + int class_id = 0; + float max_prob = FLT_MIN; + + int j; + for (j = 0; j < classes; ++j) { + float prob = objectness * output[class_index + stride*j]; + if (prob > max_prob) { + max_prob = prob; + class_id = j; + } + //int class_index = entry_index(l, 0, n*l.w*l.h + i, 4 + 1 + j); + //float prob = objectness*predictions[class_index]; + //dets[count].prob[j] = (prob > thresh) ? 
prob : 0; + } + return class_id; +} + ious delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss) { ious all_ious = { 0 }; @@ -272,6 +292,7 @@ void forward_yolo_layer(const layer l, network_state state) box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w*l.h); float best_iou = 0; int best_t = 0; + int class_id_match = 0; for (t = 0; t < l.max_boxes; ++t) { box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1); int class_id = state.truth[t*(4 + 1) + b*l.truths + 4]; @@ -282,8 +303,17 @@ void forward_yolo_layer(const layer l, network_state state) continue; // if label contains class_id more than number of classes in the cfg-file } if (!truth.x) break; // continue; + + int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); + int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4); + float objectness = l.output[obj_index]; + int pred_class_id = get_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness); + if (class_id == pred_class_id) class_id_match = 1; + else class_id_match = 0; + float iou = box_iou(pred, truth); - if (iou > best_iou) { + //if (iou > best_iou) { + if (iou > best_iou && class_id_match == 1) { best_iou = iou; best_t = t; } From bb7d69941cbce4f67f10406395d685ea92be9478 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Mon, 21 Oct 2019 15:49:46 +0300 Subject: [PATCH 33/86] Added debugging info for Training Classifier for case: Too many or too few labels --- src/data.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/data.c b/src/data.c index d8f0c9e81c0..7cb7bf0a998 100644 --- a/src/data.c +++ b/src/data.c @@ -498,7 +498,16 @@ void fill_truth(char *path, char **labels, int k, float *truth) ++count; } } - if(count != 1) printf("Too many or too few labels: %d, %s\n", count, path); + if (count != 1) { + printf("Too many or too few labels: %d, %s\n", count, path); + count = 0; + for (i = 0; i < k; ++i) { + if (strstr(path, labels[i])) { + printf("\t label %d: %s \n", count, labels[i]); + count++; + } + } + } } void fill_hierarchy(float *truth, int k, tree *hierarchy) From b3a24952985352f81d759095aa3b38c4e761c342 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Fri, 25 Oct 2019 20:47:17 +0300 Subject: [PATCH 34/86] Added Gaussian YOLOv3 layer [Gaussian_yolo] --- Makefile | 2 +- build/darknet/darknet.vcxproj | 2 + include/darknet.h | 2 + src/box.c | 10 + src/box.h | 1 + src/convolutional_kernels.cu | 11 +- src/convolutional_layer.c | 11 +- src/data.c | 11 +- src/gaussian_yolo_layer.c | 445 ++++++++++++++++++++++++++++++++++ src/gaussian_yolo_layer.h | 20 ++ src/network.c | 16 ++ src/parser.c | 65 +++++ src/yolo_layer.c | 10 - 13 files changed, 565 insertions(+), 41 deletions(-) create mode 100644 src/gaussian_yolo_layer.c create mode 100644 src/gaussian_yolo_layer.h diff --git a/Makefile b/Makefile index 25a85f81855..41e5fc8d737 100644 --- a/Makefile +++ b/Makefile @@ -118,7 +118,7 @@ LDFLAGS+= -L/usr/local/zed/lib -lsl_core -lsl_input -lsl_zed #-lstdc++ -D_GLIBCXX_USE_CXX11_ABI=0 endif -OBJ=image_opencv.o http_stream.o gemm.o utils.o dark_cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o 
writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o voxel.o tree.o yolo_layer.o upsample_layer.o lstm_layer.o conv_lstm_layer.o scale_channels_layer.o sam_layer.o +OBJ=image_opencv.o http_stream.o gemm.o utils.o dark_cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o reorg_old_layer.o super.o voxel.o tree.o yolo_layer.o gaussian_yolo_layer.o upsample_layer.o lstm_layer.o conv_lstm_layer.o scale_channels_layer.o sam_layer.o ifeq ($(GPU), 1) LDFLAGS+= -lstdc++ OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o diff --git a/build/darknet/darknet.vcxproj b/build/darknet/darknet.vcxproj index d7dc91590fa..b685bebd50b 100644 --- a/build/darknet/darknet.vcxproj +++ b/build/darknet/darknet.vcxproj @@ -199,6 +199,7 @@ + @@ -263,6 +264,7 @@ + diff --git a/include/darknet.h b/include/darknet.h index e78abe6a5c9..00b49921f52 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -149,6 +149,7 @@ typedef enum { XNOR, REGION, YOLO, + GAUSSIAN_YOLO, ISEG, REORG, REORG_OLD, @@ -728,6 +729,7 @@ typedef struct detection{ float *mask; float objectness; int sort_class; + float *uc; // Gaussian_YOLOv3 - tx,ty,tw,th uncertainty } detection; // matrix.h diff --git a/src/box.c b/src/box.c index 1b5c4998a6b..c6a27ed587f 100644 --- a/src/box.c +++ b/src/box.c @@ -13,6 +13,16 @@ box float_to_box(float *f) return b; } +box float_to_box_stride(float *f, int stride) +{ + box b = { 0 }; + b.x = f[0]; + b.y = f[1 * stride]; + b.w = f[2 * stride]; + b.h = f[3 * stride]; + return b; +} + dbox derivative(box a, box b) { dbox d; diff --git a/src/box.h b/src/box.h index 2392fedd20c..172c135293c 100644 --- a/src/box.h +++ b/src/box.h @@ -31,6 +31,7 @@ typedef struct detection_with_class { extern "C" { #endif box float_to_box(float *f); +box float_to_box_stride(float *f, int stride); float box_iou(box a, box b); float box_rmse(box a, box b); dxrep dx_box_iou(box a, box b, IOU_LOSS iou_loss); diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index edfb03b81c7..23005ccb91e 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -10,6 +10,7 @@ #include "col2im.h" #include "utils.h" #include "dark_cuda.h" +#include "box.h" __global__ void binarize_kernel(float *x, int n, float *binary) @@ -892,16 +893,6 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state } } -static box float_to_box_stride(float *f, int stride) -{ - box b = { 0 }; - b.x = f[0]; - b.y = f[1 * stride]; - b.w = f[2 * stride]; - b.h = f[3 * stride]; - return b; -} - __global__ void calc_avg_activation_kernel(float 
*src, float *dst, int size, int channels, int batches) { int i = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 8bce5aa67d9..6818b603529 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -5,6 +5,7 @@ #include "col2im.h" #include "blas.h" #include "gemm.h" +#include "box.h" #include #include @@ -1171,16 +1172,6 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) } } -static box float_to_box_stride(float *f, int stride) -{ - box b = { 0 }; - b.x = f[0]; - b.y = f[1 * stride]; - b.w = f[2 * stride]; - b.h = f[3 * stride]; - return b; -} - void assisted_excitation_forward(convolutional_layer l, network_state state) { const int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); diff --git a/src/data.c b/src/data.c index 7cb7bf0a998..c0af1ab857d 100644 --- a/src/data.c +++ b/src/data.c @@ -2,6 +2,7 @@ #include "utils.h" #include "image.h" #include "dark_cuda.h" +#include "box.h" #include #include @@ -779,16 +780,6 @@ data load_data_swag(char **paths, int n, int classes, float jitter) return d; } -static box float_to_box_stride(float *f, int stride) -{ - box b = { 0 }; - b.x = f[0]; - b.y = f[1 * stride]; - b.w = f[2 * stride]; - b.h = f[3 * stride]; - return b; -} - void blend_truth(float *new_truth, int boxes, float *old_truth) { const int t_size = 4 + 1; diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c new file mode 100644 index 00000000000..320834018d9 --- /dev/null +++ b/src/gaussian_yolo_layer.c @@ -0,0 +1,445 @@ +// Gaussian YOLOv3 implementation +// Author: Jiwoong Choi +// ICCV 2019 Paper: http://openaccess.thecvf.com/content_ICCV_2019/html/Choi_Gaussian_YOLOv3_An_Accurate_and_Fast_Object_Detector_Using_Localization_ICCV_2019_paper.html +// arxiv.org: https://arxiv.org/abs/1904.04620v2 +// source code: https://github.com/jwchoi384/Gaussian_YOLOv3 + +#include "gaussian_yolo_layer.h" +#include "activations.h" +#include "blas.h" +#include "box.h" +#include "dark_cuda.h" +#include "utils.h" + +#include +#include +#include +#include + +#ifndef M_PI +#define M_PI 3.141592 +#endif + +layer make_gaussian_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes) +{ + int i; + layer l = {0}; + l.type = GAUSSIAN_YOLO; + + l.n = n; + l.total = total; + l.batch = batch; + l.h = h; + l.w = w; + l.c = n*(classes + 8 + 1); + l.out_w = l.w; + l.out_h = l.h; + l.out_c = l.c; + l.classes = classes; + l.cost = calloc(1, sizeof(float)); + l.biases = calloc(total*2, sizeof(float)); + if(mask) l.mask = mask; + else{ + l.mask = calloc(n, sizeof(int)); + for(i = 0; i < n; ++i){ + l.mask[i] = i; + } + } + l.bias_updates = calloc(n*2, sizeof(float)); + l.outputs = h*w*n*(classes + 8 + 1); + l.inputs = l.outputs; + l.truths = 90*(4 + 1); + l.delta = calloc(batch*l.outputs, sizeof(float)); + l.output = calloc(batch*l.outputs, sizeof(float)); + for(i = 0; i < total*2; ++i){ + l.biases[i] = .5; + } + + l.forward = forward_gaussian_yolo_layer; + l.backward = backward_gaussian_yolo_layer; +#ifdef GPU + l.forward_gpu = forward_gaussian_yolo_layer_gpu; + l.backward_gpu = backward_gaussian_yolo_layer_gpu; + l.output_gpu = cuda_make_array(l.output, batch*l.outputs); + l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs); +#endif + + fprintf(stderr, "Gaussian_yolo\n"); + srand(0); + + return l; +} + +void resize_gaussian_yolo_layer(layer *l, int w, int h) +{ + l->w = w; + l->h = h; + + l->outputs = h*w*l->n*(l->classes + 8 + 1); + l->inputs = 
l->outputs; + + l->output = realloc(l->output, l->batch*l->outputs*sizeof(float)); + l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float)); + +#ifdef GPU + cuda_free(l->delta_gpu); + cuda_free(l->output_gpu); + + l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs); + l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs); +#endif +} + +box get_gaussian_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride) +{ + box b; + b.x = (i + x[index + 0*stride]) / lw; + b.y = (j + x[index + 2*stride]) / lh; + b.w = exp(x[index + 4*stride]) * biases[2*n] / w; + b.h = exp(x[index + 6*stride]) * biases[2*n+1] / h; + return b; +} + +float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride) +{ + box pred = get_gaussian_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride); + float iou = box_iou(pred, truth); + + float tx = (truth.x*lw - i); + float ty = (truth.y*lh - j); + float tw = log(truth.w*w / biases[2*n]); + float th = log(truth.h*h / biases[2*n + 1]); + + float sigma_const = 0.3; + float epsi = pow(10,-9); + + float in_exp_x = (tx - x[index + 0*stride])/x[index+1*stride]; + float in_exp_x_2 = pow(in_exp_x, 2); + float normal_dist_x = exp(in_exp_x_2*(-1./2.))/(sqrt(M_PI * 2.0)*(x[index+1*stride]+sigma_const)); + + float in_exp_y = (ty - x[index + 2*stride])/x[index+3*stride]; + float in_exp_y_2 = pow(in_exp_y, 2); + float normal_dist_y = exp(in_exp_y_2*(-1./2.))/(sqrt(M_PI * 2.0)*(x[index+3*stride]+sigma_const)); + + float in_exp_w = (tw - x[index + 4*stride])/x[index+5*stride]; + float in_exp_w_2 = pow(in_exp_w, 2); + float normal_dist_w = exp(in_exp_w_2*(-1./2.))/(sqrt(M_PI * 2.0)*(x[index+5*stride]+sigma_const)); + + float in_exp_h = (th - x[index + 6*stride])/x[index+7*stride]; + float in_exp_h_2 = pow(in_exp_h, 2); + float normal_dist_h = exp(in_exp_h_2*(-1./2.))/(sqrt(M_PI * 2.0)*(x[index+7*stride]+sigma_const)); + + float temp_x = (1./2.) * 1./(normal_dist_x+epsi) * normal_dist_x * scale; + float temp_y = (1./2.) * 1./(normal_dist_y+epsi) * normal_dist_y * scale; + float temp_w = (1./2.) * 1./(normal_dist_w+epsi) * normal_dist_w * scale; + float temp_h = (1./2.) 
* 1./(normal_dist_h+epsi) * normal_dist_h * scale; + + delta[index + 0*stride] = temp_x * in_exp_x * (1./x[index+1*stride]); + delta[index + 2*stride] = temp_y * in_exp_y * (1./x[index+3*stride]); + delta[index + 4*stride] = temp_w * in_exp_w * (1./x[index+5*stride]); + delta[index + 6*stride] = temp_h * in_exp_h * (1./x[index+7*stride]); + + delta[index + 1*stride] = temp_x * (in_exp_x_2/x[index+1*stride] - 1./(x[index+1*stride]+sigma_const)); + delta[index + 3*stride] = temp_y * (in_exp_y_2/x[index+3*stride] - 1./(x[index+3*stride]+sigma_const)); + delta[index + 5*stride] = temp_w * (in_exp_w_2/x[index+5*stride] - 1./(x[index+5*stride]+sigma_const)); + delta[index + 7*stride] = temp_h * (in_exp_h_2/x[index+7*stride] - 1./(x[index+7*stride]+sigma_const)); + return iou; +} + + +void delta_gaussian_yolo_class(float *output, float *delta, int index, int class, int classes, int stride, float *avg_cat) +{ + int n; + if (delta[index]){ + delta[index + stride*class] = 1 - output[index + stride*class]; + if(avg_cat) *avg_cat += output[index + stride*class]; + return; + } + for(n = 0; n < classes; ++n){ + delta[index + stride*n] = ((n == class)?1 : 0) - output[index + stride*n]; + if(n == class && avg_cat) *avg_cat += output[index + stride*n]; + } +} + +static int entry_gaussian_index(layer l, int batch, int location, int entry) +{ + int n = location / (l.w*l.h); + int loc = location % (l.w*l.h); + return batch*l.outputs + n*l.w*l.h*(8+l.classes+1) + entry*l.w*l.h + loc; +} + +void forward_gaussian_yolo_layer(const layer l, network net) +{ + int i,j,b,t,n; + memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float)); + +#ifndef GPU + for (b = 0; b < l.batch; ++b){ + for(n = 0; n < l.n; ++n){ + // x : mu, sigma + int index = entry_gaussian_index(l, b, n*l.w*l.h, 0); + activate_array(l.output + index, 2*l.w*l.h, LOGISTIC); + // y : mu, sigma + index = entry_gaussian_index(l, b, n*l.w*l.h, 2); + activate_array(l.output + index, 2*l.w*l.h, LOGISTIC); + // w : sigma + index = entry_gaussian_index(l, b, n*l.w*l.h, 5); + activate_array(l.output + index, l.w*l.h, LOGISTIC); + // h : sigma + index = entry_gaussian_index(l, b, n*l.w*l.h, 7); + activate_array(l.output + index, l.w*l.h, LOGISTIC); + // objectness & class + index = entry_gaussian_index(l, b, n*l.w*l.h, 8); + activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC); + } + } +#endif + + memset(l.delta, 0, l.outputs * l.batch * sizeof(float)); + if(!net.train) return; + float avg_iou = 0; + float recall = 0; + float recall75 = 0; + float avg_cat = 0; + float avg_obj = 0; + float avg_anyobj = 0; + int count = 0; + int class_count = 0; + *(l.cost) = 0; + for (b = 0; b < l.batch; ++b) { + for (j = 0; j < l.h; ++j) { + for (i = 0; i < l.w; ++i) { + for (n = 0; n < l.n; ++n) { + int box_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 0); + box pred = get_gaussian_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.w*l.h); + float best_iou = 0; + int best_t = 0; + for(t = 0; t < l.max_boxes; ++t){ + box truth = float_to_box_stride(net.truth + t*(4 + 1) + b*l.truths, 1); + if(!truth.x) break; + float iou = box_iou(pred, truth); + if (iou > best_iou) { + best_iou = iou; + best_t = t; + } + } + int obj_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 8); + avg_anyobj += l.output[obj_index]; + l.delta[obj_index] = 0 - l.output[obj_index]; + if (best_iou > l.ignore_thresh) { + l.delta[obj_index] = 0; + } + if (best_iou > l.truth_thresh) { + l.delta[obj_index] = 1 - l.output[obj_index]; + + int 
class = net.truth[best_t*(4 + 1) + b*l.truths + 4]; + if (l.map) class = l.map[class]; + int class_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 9); + delta_gaussian_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0); + box truth = float_to_box_stride(net.truth + best_t*(4 + 1) + b*l.truths, 1); + delta_gaussian_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); + } + } + } + } + for(t = 0; t < l.max_boxes; ++t){ + box truth = float_to_box_stride(net.truth + t*(4 + 1) + b*l.truths, 1); + + if(!truth.x) break; + float best_iou = 0; + int best_n = 0; + i = (truth.x * l.w); + j = (truth.y * l.h); + box truth_shift = truth; + truth_shift.x = truth_shift.y = 0; + for(n = 0; n < l.total; ++n){ + box pred = {0}; + pred.w = l.biases[2*n]/net.w; + pred.h = l.biases[2*n+1]/net.h; + float iou = box_iou(pred, truth_shift); + if (iou > best_iou){ + best_iou = iou; + best_n = n; + } + } + + int mask_n = int_index(l.mask, best_n, l.n); + if(mask_n >= 0){ + int box_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); + float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); + + int obj_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 8); + avg_obj += l.output[obj_index]; + l.delta[obj_index] = 1 - l.output[obj_index]; + + int class = net.truth[t*(4 + 1) + b*l.truths + 4]; + if (l.map) class = l.map[class]; + int class_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 9); + delta_gaussian_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat); + + ++count; + ++class_count; + if(iou > .5) recall += 1; + if(iou > .75) recall75 += 1; + avg_iou += iou; + } + } + } + *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); + printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", net.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count); +} + +void backward_gaussian_yolo_layer(const layer l, network net) +{ + axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1); +} + +void correct_gaussian_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative) +{ + int i; + int new_w=0; + int new_h=0; + if (((float)netw/w) < ((float)neth/h)) { + new_w = netw; + new_h = (h * netw)/w; + } else { + new_h = neth; + new_w = (w * neth)/h; + } + for (i = 0; i < n; ++i){ + box b = dets[i].bbox; + b.x = (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw); + b.y = (b.y - (neth - new_h)/2./neth) / ((float)new_h/neth); + b.w *= (float)netw/new_w; + b.h *= (float)neth/new_h; + if(!relative){ + b.x *= w; + b.w *= w; + b.y *= h; + b.h *= h; + } + dets[i].bbox = b; + } +} + +int gaussian_yolo_num_detections(layer l, float thresh) +{ + int i, n; + int count = 0; + for (i = 0; i < l.w*l.h; ++i){ + for(n = 0; n < l.n; ++n){ + int obj_index = entry_gaussian_index(l, 0, n*l.w*l.h + i, 8); + if(l.output[obj_index] > thresh){ + ++count; + } + } + } + return count; +} + +/* +void avg_flipped_gaussian_yolo(layer l) +{ + int i,j,n,z; + float *flip = l.output + l.outputs; + for (j = 0; j < l.h; ++j) { + for (i = 0; i < l.w/2; ++i) { + for (n = 0; n < l.n; ++n) { + for(z = 0; z < l.classes + 8 + 1; ++z){ + int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i; + int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1); + float swap = flip[i1]; 
+ flip[i1] = flip[i2]; + flip[i2] = swap; + if(z == 0){ + flip[i1] = -flip[i1]; + flip[i2] = -flip[i2]; + } + } + } + } + } + for(i = 0; i < l.outputs; ++i){ + l.output[i] = (l.output[i] + flip[i])/2.; + } +} +*/ + +int get_gaussian_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets) +{ + int i,j,n; + float *predictions = l.output; + //if (l.batch == 2) avg_flipped_gaussian_yolo(l); + int count = 0; + for (i = 0; i < l.w*l.h; ++i){ + int row = i / l.w; + int col = i % l.w; + for(n = 0; n < l.n; ++n){ + int obj_index = entry_gaussian_index(l, 0, n*l.w*l.h + i, 8); + float objectness = predictions[obj_index]; + if(objectness <= thresh) continue; + int box_index = entry_gaussian_index(l, 0, n*l.w*l.h + i, 0); + dets[count].bbox = get_gaussian_yolo_box(predictions, l.biases, l.mask[n], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h); + dets[count].objectness = objectness; + dets[count].classes = l.classes; + + dets[count].uc[0] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 1)]; // tx uncertainty + dets[count].uc[1] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 3)]; // ty uncertainty + dets[count].uc[2] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 5)]; // tw uncertainty + dets[count].uc[3] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 7)]; // th uncertainty + + for(j = 0; j < l.classes; ++j){ + int class_index = entry_gaussian_index(l, 0, n*l.w*l.h + i, 9 + j); + float uc_aver = (dets[count].uc[0] + dets[count].uc[1] + dets[count].uc[2] + dets[count].uc[3])/4.0; + float prob = objectness*predictions[class_index]*(1.0-uc_aver); + dets[count].prob[j] = (prob > thresh) ? prob : 0; + } + ++count; + } + } + correct_gaussian_yolo_boxes(dets, count, w, h, netw, neth, relative); + return count; +} + +#ifdef GPU + +void forward_gaussian_yolo_layer_gpu(const layer l, network net) +{ + copy_ongpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1); + int b, n; + for (b = 0; b < l.batch; ++b) + { + for(n = 0; n < l.n; ++n) + { + // x : mu, sigma + int index = entry_gaussian_index(l, b, n*l.w*l.h, 0); + activate_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC); + // y : mu, sigma + index = entry_gaussian_index(l, b, n*l.w*l.h, 2); + activate_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC); + // w : sigma + index = entry_gaussian_index(l, b, n*l.w*l.h, 5); + activate_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC); + // h : sigma + index = entry_gaussian_index(l, b, n*l.w*l.h, 7); + activate_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC); + // objectness & class + index = entry_gaussian_index(l, b, n*l.w*l.h, 8); + activate_array_ongpu(l.output_gpu + index, (1+l.classes)*l.w*l.h, LOGISTIC); + } + } + if(!net.train || l.onlyforward){ + cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs); + return; + } + + cuda_pull_array(l.output_gpu, net.input, l.batch*l.inputs); + forward_gaussian_yolo_layer(l, net); + cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs); +} + +void backward_gaussian_yolo_layer_gpu(const layer l, network net) +{ + axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); +} +#endif diff --git a/src/gaussian_yolo_layer.h b/src/gaussian_yolo_layer.h new file mode 100644 index 00000000000..96cb2a8f4a6 --- /dev/null +++ b/src/gaussian_yolo_layer.h @@ -0,0 +1,20 @@ +//Gaussian YOLOv3 implementation +#ifndef GAUSSIAN_YOLO_LAYER_H +#define GAUSSIAN_YOLO_LAYER_H + +#include "darknet.h" +#include "layer.h" +#include "network.h" + +layer 
make_gaussian_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes); +void forward_gaussian_yolo_layer(const layer l, network net); +void backward_gaussian_yolo_layer(const layer l, network net); +void resize_gaussian_yolo_layer(layer *l, int w, int h); +int gaussian_yolo_num_detections(layer l, float thresh); + +#ifdef GPU +void forward_gaussian_yolo_layer_gpu(const layer l, network net); +void backward_gaussian_yolo_layer_gpu(layer l, network net); +#endif + +#endif diff --git a/src/network.c b/src/network.c index 82dc4d53978..0658788044c 100644 --- a/src/network.c +++ b/src/network.c @@ -34,6 +34,7 @@ #include "shortcut_layer.h" #include "scale_channels_layer.h" #include "yolo_layer.h" +#include "gaussian_yolo_layer.h" #include "upsample_layer.h" #include "parser.h" @@ -202,6 +203,10 @@ char *get_layer_string(LAYER_TYPE a) return "detection"; case REGION: return "region"; + case YOLO: + return "yolo"; + case GAUSSIAN_YOLO: + return "Gaussian_yolo"; case DROPOUT: return "dropout"; case CROP: @@ -524,6 +529,8 @@ int resize_network(network *net, int w, int h) resize_region_layer(&l, w, h); }else if (l.type == YOLO) { resize_yolo_layer(&l, w, h); + }else if (l.type == GAUSSIAN_YOLO) { + resize_gaussian_yolo_layer(&l, w, h); }else if(l.type == ROUTE){ resize_route_layer(&l, net); }else if (l.type == SHORTCUT) { @@ -687,6 +694,9 @@ int num_detections(network *net, float thresh) if (l.type == YOLO) { s += yolo_num_detections(l, thresh); } + if (l.type == GAUSSIAN_YOLO) { + s += gaussian_yolo_num_detections(l, thresh); + } if (l.type == DETECTION || l.type == REGION) { s += l.w*l.h*l.n; } @@ -703,6 +713,8 @@ detection *make_network_boxes(network *net, float thresh, int *num) detection* dets = (detection*)calloc(nboxes, sizeof(detection)); for (i = 0; i < nboxes; ++i) { dets[i].prob = (float*)calloc(l.classes, sizeof(float)); + // tx,ty,tw,th uncertainty + dets[i].uc = calloc(4, sizeof(float)); // Gaussian_YOLOv3 if (l.coords > 4) { dets[i].mask = (float*)calloc(l.coords - 4, sizeof(float)); } @@ -749,6 +761,10 @@ void fill_network_boxes(network *net, int w, int h, float thresh, float hier, in prev_classes, l.classes); } } + if (l.type == GAUSSIAN_YOLO) { + int count = get_gaussian_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets); + dets += count; + } if (l.type == REGION) { custom_get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets, letter); //get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets); diff --git a/src/parser.c b/src/parser.c index 829134d1131..b31c7673102 100644 --- a/src/parser.c +++ b/src/parser.c @@ -38,6 +38,7 @@ #include "upsample_layer.h" #include "version.h" #include "yolo_layer.h" +#include "gaussian_yolo_layer.h" typedef struct{ char *type; @@ -57,6 +58,7 @@ LAYER_TYPE string_to_layer_type(char * type) if (strcmp(type, "[detection]")==0) return DETECTION; if (strcmp(type, "[region]")==0) return REGION; if (strcmp(type, "[yolo]") == 0) return YOLO; + if (strcmp(type, "[Gaussian_yolo]") == 0) return GAUSSIAN_YOLO; if (strcmp(type, "[local]")==0) return LOCAL; if (strcmp(type, "[conv]")==0 || strcmp(type, "[convolutional]")==0) return CONVOLUTIONAL; @@ -390,6 +392,67 @@ layer parse_yolo(list *options, size_params params) return l; } + +int *parse_gaussian_yolo_mask(char *a, int *num) // Gaussian_YOLOv3 +{ + int *mask = 0; + if (a) { + int len = strlen(a); + int n = 1; + int i; + for (i = 0; i < len; ++i) { + if (a[i] == ',') ++n; + } + mask = calloc(n, sizeof(int)); + for (i 
= 0; i < n; ++i) { + int val = atoi(a); + mask[i] = val; + a = strchr(a, ',') + 1; + } + *num = n; + } + return mask; +} + + +layer parse_gaussian_yolo(list *options, size_params params) // Gaussian_YOLOv3 +{ + int classes = option_find_int(options, "classes", 20); + int total = option_find_int(options, "num", 1); + int num = total; + + char *a = option_find_str(options, "mask", 0); + int *mask = parse_gaussian_yolo_mask(a, &num); + layer l = make_gaussian_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes); + assert(l.outputs == params.inputs); + + l.max_boxes = option_find_int_quiet(options, "max", 90); + l.jitter = option_find_float(options, "jitter", .2); + + l.ignore_thresh = option_find_float(options, "ignore_thresh", .5); + l.truth_thresh = option_find_float(options, "truth_thresh", 1); + l.random = option_find_int_quiet(options, "random", 0); + + char *map_file = option_find_str(options, "map", 0); + if (map_file) l.map = read_map(map_file); + + a = option_find_str(options, "anchors", 0); + if (a) { + int len = strlen(a); + int n = 1; + int i; + for (i = 0; i < len; ++i) { + if (a[i] == ',') ++n; + } + for (i = 0; i < n; ++i) { + float bias = atof(a); + l.biases[i] = bias; + a = strchr(a, ',') + 1; + } + } + return l; +} + layer parse_region(list *options, size_params params) { int coords = option_find_int(options, "coords", 4); @@ -923,6 +986,8 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps) l = parse_region(options, params); }else if (lt == YOLO) { l = parse_yolo(options, params); + }else if (lt == GAUSSIAN_YOLO) { + l = parse_gaussian_yolo(options, params); }else if(lt == DETECTION){ l = parse_detection(options, params); }else if(lt == SOFTMAX){ diff --git a/src/yolo_layer.c b/src/yolo_layer.c index 2006f4b8047..424811df6e6 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -242,16 +242,6 @@ static int entry_index(layer l, int batch, int location, int entry) return batch*l.outputs + n*l.w*l.h*(4+l.classes+1) + entry*l.w*l.h + loc; } -static box float_to_box_stride(float *f, int stride) -{ - box b = { 0 }; - b.x = f[0]; - b.y = f[1 * stride]; - b.w = f[2 * stride]; - b.h = f[3 * stride]; - return b; -} - void forward_yolo_layer(const layer l, network_state state) { int i, j, b, t, n; From 24788b806175a9cfbbeb0e2057f0673cd0d3e657 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Fri, 25 Oct 2019 20:56:58 +0300 Subject: [PATCH 35/86] Compile fix --- src/image_opencv.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/image_opencv.cpp b/src/image_opencv.cpp index cc50c0719a1..d4a69e6f197 100644 --- a/src/image_opencv.cpp +++ b/src/image_opencv.cpp @@ -1125,15 +1125,6 @@ void draw_train_loss(mat_cv* img_src, int img_size, float avg_loss, float max_im // ==================================================================== // Data augmentation // ==================================================================== -static box float_to_box_stride(float *f, int stride) -{ - box b = { 0 }; - b.x = f[0]; - b.y = f[1 * stride]; - b.w = f[2 * stride]; - b.h = f[3 * stride]; - return b; -} image image_data_augmentation(mat_cv* mat, int w, int h, int pleft, int ptop, int swidth, int sheight, int flip, From 72f6de30b2a75fd67436cff4638b8fa36e3fb205 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Fri, 25 Oct 2019 22:14:27 +0300 Subject: [PATCH 36/86] another compile fix --- src/gaussian_yolo_layer.c | 2 +- src/parser.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/gaussian_yolo_layer.c 
b/src/gaussian_yolo_layer.c index 320834018d9..74604e52346 100644 --- a/src/gaussian_yolo_layer.c +++ b/src/gaussian_yolo_layer.c @@ -23,7 +23,7 @@ layer make_gaussian_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes) { int i; - layer l = {0}; + layer l = { (LAYER_TYPE)0 }; l.type = GAUSSIAN_YOLO; l.n = n; diff --git a/src/parser.c b/src/parser.c index b31c7673102..445771ca4aa 100644 --- a/src/parser.c +++ b/src/parser.c @@ -159,9 +159,15 @@ convolutional_layer parse_convolutional(list *options, size_params params, netwo int n = option_find_int(options, "filters",1); int groups = option_find_int_quiet(options, "groups", 1); int size = option_find_int(options, "size",1); - int stride = option_find_int(options, "stride",1); - int stride_x = option_find_int_quiet(options, "stride_x", stride); - int stride_y = option_find_int_quiet(options, "stride_y", stride); + int stride = -1; + //int stride = option_find_int(options, "stride",1); + int stride_x = option_find_int_quiet(options, "stride_x", -1); + int stride_y = option_find_int_quiet(options, "stride_y", -1); + if (stride_x < 1 || stride_y < 1) { + stride = option_find_int(options, "stride", 1); + if (stride_x < 1) stride_x = stride; + if (stride_y < 1) stride_y = stride; + } int dilation = option_find_int_quiet(options, "dilation", 1); int antialiasing = option_find_int_quiet(options, "antialiasing", 0); if (size == 1) dilation = 1; From f18338de2667a402cd78e61f50423f4953cd32cc Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 26 Oct 2019 01:29:41 +0300 Subject: [PATCH 37/86] Fixed [Gaussian_yolo] layer (tested for training and detection) --- src/gaussian_yolo_layer.c | 154 ++++++++++++++++++++++++++++---------- src/gaussian_yolo_layer.h | 12 +-- src/network.c | 5 +- src/parser.c | 3 +- 4 files changed, 127 insertions(+), 47 deletions(-) diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c index 74604e52346..ddb43939ce9 100644 --- a/src/gaussian_yolo_layer.c +++ b/src/gaussian_yolo_layer.c @@ -20,7 +20,7 @@ #define M_PI 3.141592 #endif -layer make_gaussian_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes) +layer make_gaussian_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes, int max_boxes) { int i; layer l = { (LAYER_TYPE)0 }; @@ -36,21 +36,22 @@ layer make_gaussian_yolo_layer(int batch, int w, int h, int n, int total, int *m l.out_h = l.h; l.out_c = l.c; l.classes = classes; - l.cost = calloc(1, sizeof(float)); - l.biases = calloc(total*2, sizeof(float)); + l.cost = (float*)calloc(1, sizeof(float)); + l.biases = (float*)calloc(total*2, sizeof(float)); if(mask) l.mask = mask; else{ - l.mask = calloc(n, sizeof(int)); + l.mask = (int*)calloc(n, sizeof(int)); for(i = 0; i < n; ++i){ l.mask[i] = i; } } - l.bias_updates = calloc(n*2, sizeof(float)); + l.bias_updates = (float*)calloc(n*2, sizeof(float)); l.outputs = h*w*n*(classes + 8 + 1); l.inputs = l.outputs; - l.truths = 90*(4 + 1); - l.delta = calloc(batch*l.outputs, sizeof(float)); - l.output = calloc(batch*l.outputs, sizeof(float)); + l.max_boxes = max_boxes; + l.truths = l.max_boxes*(4 + 1); + l.delta = (float*)calloc(batch*l.outputs, sizeof(float)); + l.output = (float*)calloc(batch*l.outputs, sizeof(float)); for(i = 0; i < total*2; ++i){ l.biases[i] = .5; } @@ -62,10 +63,26 @@ layer make_gaussian_yolo_layer(int batch, int w, int h, int n, int total, int *m l.backward_gpu = backward_gaussian_yolo_layer_gpu; l.output_gpu = cuda_make_array(l.output, batch*l.outputs); l.delta_gpu = 
cuda_make_array(l.delta, batch*l.outputs); + + /* + free(l.output); + if (cudaSuccess == cudaHostAlloc(&l.output, batch*l.outputs * sizeof(float), cudaHostRegisterMapped)) l.output_pinned = 1; + else { + cudaGetLastError(); // reset CUDA-error + l.output = (float*)calloc(batch * l.outputs, sizeof(float)); + } + + free(l.delta); + if (cudaSuccess == cudaHostAlloc(&l.delta, batch*l.outputs * sizeof(float), cudaHostRegisterMapped)) l.delta_pinned = 1; + else { + cudaGetLastError(); // reset CUDA-error + l.delta = (float*)calloc(batch * l.outputs, sizeof(float)); + } + */ #endif fprintf(stderr, "Gaussian_yolo\n"); - srand(0); + srand(time(0)); return l; } @@ -78,10 +95,33 @@ void resize_gaussian_yolo_layer(layer *l, int w, int h) l->outputs = h*w*l->n*(l->classes + 8 + 1); l->inputs = l->outputs; - l->output = realloc(l->output, l->batch*l->outputs*sizeof(float)); - l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float)); + l->output = realloc(l->output, l->batch*l->outputs * sizeof(float)); + l->delta = realloc(l->delta, l->batch*l->outputs * sizeof(float)); + + //if (!l->output_pinned) l->output = (float*)realloc(l->output, l->batch*l->outputs * sizeof(float)); + //if (!l->delta_pinned) l->delta = (float*)realloc(l->delta, l->batch*l->outputs * sizeof(float)); #ifdef GPU + /* + if (l->output_pinned) { + cudaFreeHost(l->output); + if (cudaSuccess != cudaHostAlloc(&l->output, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) { + cudaGetLastError(); // reset CUDA-error + l->output = (float*)realloc(l->output, l->batch * l->outputs * sizeof(float)); + l->output_pinned = 0; + } + } + + if (l->delta_pinned) { + cudaFreeHost(l->delta); + if (cudaSuccess != cudaHostAlloc(&l->delta, l->batch*l->outputs * sizeof(float), cudaHostRegisterMapped)) { + cudaGetLastError(); // reset CUDA-error + l->delta = (float*)realloc(l->delta, l->batch * l->outputs * sizeof(float)); + l->delta_pinned = 0; + } + } + */ + cuda_free(l->delta_gpu); cuda_free(l->output_gpu); @@ -168,10 +208,10 @@ static int entry_gaussian_index(layer l, int batch, int location, int entry) return batch*l.outputs + n*l.w*l.h*(8+l.classes+1) + entry*l.w*l.h + loc; } -void forward_gaussian_yolo_layer(const layer l, network net) +void forward_gaussian_yolo_layer(const layer l, network_state state) { int i,j,b,t,n; - memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float)); + memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float)); #ifndef GPU for (b = 0; b < l.batch; ++b){ @@ -196,7 +236,7 @@ void forward_gaussian_yolo_layer(const layer l, network net) #endif memset(l.delta, 0, l.outputs * l.batch * sizeof(float)); - if(!net.train) return; + if (!state.train) return; float avg_iou = 0; float recall = 0; float recall75 = 0; @@ -211,11 +251,11 @@ void forward_gaussian_yolo_layer(const layer l, network net) for (i = 0; i < l.w; ++i) { for (n = 0; n < l.n; ++n) { int box_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 0); - box pred = get_gaussian_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.w*l.h); + box pred = get_gaussian_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w*l.h); float best_iou = 0; int best_t = 0; for(t = 0; t < l.max_boxes; ++t){ - box truth = float_to_box_stride(net.truth + t*(4 + 1) + b*l.truths, 1); + box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1); if(!truth.x) break; float iou = box_iou(pred, truth); if (iou > best_iou) { @@ -232,18 +272,18 @@ void 
forward_gaussian_yolo_layer(const layer l, network net) if (best_iou > l.truth_thresh) { l.delta[obj_index] = 1 - l.output[obj_index]; - int class = net.truth[best_t*(4 + 1) + b*l.truths + 4]; + int class = state.truth[best_t*(4 + 1) + b*l.truths + 4]; if (l.map) class = l.map[class]; int class_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 9); delta_gaussian_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0); - box truth = float_to_box_stride(net.truth + best_t*(4 + 1) + b*l.truths, 1); - delta_gaussian_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); + box truth = float_to_box_stride(state.truth + best_t*(4 + 1) + b*l.truths, 1); + delta_gaussian_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); } } } } for(t = 0; t < l.max_boxes; ++t){ - box truth = float_to_box_stride(net.truth + t*(4 + 1) + b*l.truths, 1); + box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1); if(!truth.x) break; float best_iou = 0; @@ -254,8 +294,8 @@ void forward_gaussian_yolo_layer(const layer l, network net) truth_shift.x = truth_shift.y = 0; for(n = 0; n < l.total; ++n){ box pred = {0}; - pred.w = l.biases[2*n]/net.w; - pred.h = l.biases[2*n+1]/net.h; + pred.w = l.biases[2*n]/ state.net.w; + pred.h = l.biases[2*n+1]/ state.net.h; float iou = box_iou(pred, truth_shift); if (iou > best_iou){ best_iou = iou; @@ -266,13 +306,13 @@ void forward_gaussian_yolo_layer(const layer l, network net) int mask_n = int_index(l.mask, best_n, l.n); if(mask_n >= 0){ int box_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); - float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); + float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); int obj_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 8); avg_obj += l.output[obj_index]; l.delta[obj_index] = 1 - l.output[obj_index]; - int class = net.truth[t*(4 + 1) + b*l.truths + 4]; + int class = state.truth[t*(4 + 1) + b*l.truths + 4]; if (l.map) class = l.map[class]; int class_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 9); delta_gaussian_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat); @@ -286,19 +326,34 @@ void forward_gaussian_yolo_layer(const layer l, network net) } } *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); - printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", net.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count); + printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", state.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count); } -void backward_gaussian_yolo_layer(const layer l, network net) +void backward_gaussian_yolo_layer(const layer l, network_state state) { - axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1); + axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1); } -void correct_gaussian_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative) +void 
correct_gaussian_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative, int letter) { int i; int new_w=0; int new_h=0; + if (letter) { + if (((float)netw / w) < ((float)neth / h)) { + new_w = netw; + new_h = (h * netw) / w; + } + else { + new_h = neth; + new_w = (w * neth) / h; + } + } + else { + new_w = netw; + new_h = neth; + } + /* if (((float)netw/w) < ((float)neth/h)) { new_w = netw; new_h = (h * netw)/w; @@ -306,6 +361,7 @@ void correct_gaussian_yolo_boxes(detection *dets, int n, int w, int h, int netw, new_h = neth; new_w = (w * neth)/h; } + */ for (i = 0; i < n; ++i){ box b = dets[i].bbox; b.x = (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw); @@ -365,7 +421,7 @@ void avg_flipped_gaussian_yolo(layer l) } */ -int get_gaussian_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets) +int get_gaussian_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter) { int i,j,n; float *predictions = l.output; @@ -390,22 +446,22 @@ int get_gaussian_yolo_detections(layer l, int w, int h, int netw, int neth, floa for(j = 0; j < l.classes; ++j){ int class_index = entry_gaussian_index(l, 0, n*l.w*l.h + i, 9 + j); - float uc_aver = (dets[count].uc[0] + dets[count].uc[1] + dets[count].uc[2] + dets[count].uc[3])/4.0; + float uc_aver = (dets[count].uc[0] + dets[count].uc[1] + dets[count].uc[2] + dets[count].uc[3]) / 4.0; float prob = objectness*predictions[class_index]*(1.0-uc_aver); dets[count].prob[j] = (prob > thresh) ? prob : 0; } ++count; } } - correct_gaussian_yolo_boxes(dets, count, w, h, netw, neth, relative); + correct_gaussian_yolo_boxes(dets, count, w, h, netw, neth, relative, letter); return count; } #ifdef GPU -void forward_gaussian_yolo_layer_gpu(const layer l, network net) +void forward_gaussian_yolo_layer_gpu(const layer l, network_state state) { - copy_ongpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1); + copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1); int b, n; for (b = 0; b < l.batch; ++b) { @@ -428,18 +484,38 @@ void forward_gaussian_yolo_layer_gpu(const layer l, network net) activate_array_ongpu(l.output_gpu + index, (1+l.classes)*l.w*l.h, LOGISTIC); } } - if(!net.train || l.onlyforward){ - cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs); + + if (!state.train || l.onlyforward) { + //cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs); + cuda_pull_array_async(l.output_gpu, l.output, l.batch*l.outputs); + CHECK_CUDA(cudaPeekAtLastError()); return; } - cuda_pull_array(l.output_gpu, net.input, l.batch*l.inputs); - forward_gaussian_yolo_layer(l, net); + float *in_cpu = (float *)calloc(l.batch*l.inputs, sizeof(float)); + cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs); + memcpy(in_cpu, l.output, l.batch*l.outputs * sizeof(float)); + float *truth_cpu = 0; + if (state.truth) { + int num_truth = l.batch*l.truths; + truth_cpu = (float *)calloc(num_truth, sizeof(float)); + cuda_pull_array(state.truth, truth_cpu, num_truth); + } + network_state cpu_state = state; + cpu_state.net = state.net; + cpu_state.index = state.index; + cpu_state.train = state.train; + cpu_state.truth = truth_cpu; + cpu_state.input = in_cpu; + forward_gaussian_yolo_layer(l, cpu_state); + //forward_yolo_layer(l, state); cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs); + free(in_cpu); + if (cpu_state.truth) free(cpu_state.truth); } -void backward_gaussian_yolo_layer_gpu(const layer l, network net) +void 
backward_gaussian_yolo_layer_gpu(const layer l, network_state state) { - axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); + axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, state.delta, 1); } #endif diff --git a/src/gaussian_yolo_layer.h b/src/gaussian_yolo_layer.h index 96cb2a8f4a6..9080881dc68 100644 --- a/src/gaussian_yolo_layer.h +++ b/src/gaussian_yolo_layer.h @@ -6,15 +6,17 @@ #include "layer.h" #include "network.h" -layer make_gaussian_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes); -void forward_gaussian_yolo_layer(const layer l, network net); -void backward_gaussian_yolo_layer(const layer l, network net); +layer make_gaussian_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes, int max_boxes); +void forward_gaussian_yolo_layer(const layer l, network_state state); +void backward_gaussian_yolo_layer(const layer l, network_state state); void resize_gaussian_yolo_layer(layer *l, int w, int h); int gaussian_yolo_num_detections(layer l, float thresh); +int get_gaussian_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets, int letter); +void correct_gaussian_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative, int letter); #ifdef GPU -void forward_gaussian_yolo_layer_gpu(const layer l, network net); -void backward_gaussian_yolo_layer_gpu(layer l, network net); +void forward_gaussian_yolo_layer_gpu(const layer l, network_state state); +void backward_gaussian_yolo_layer_gpu(layer l, network_state state); #endif #endif diff --git a/src/network.c b/src/network.c index 0658788044c..cfe994343e8 100644 --- a/src/network.c +++ b/src/network.c @@ -714,7 +714,7 @@ detection *make_network_boxes(network *net, float thresh, int *num) for (i = 0; i < nboxes; ++i) { dets[i].prob = (float*)calloc(l.classes, sizeof(float)); // tx,ty,tw,th uncertainty - dets[i].uc = calloc(4, sizeof(float)); // Gaussian_YOLOv3 + dets[i].uc = (float*)calloc(4, sizeof(float)); // Gaussian_YOLOv3 if (l.coords > 4) { dets[i].mask = (float*)calloc(l.coords - 4, sizeof(float)); } @@ -762,7 +762,7 @@ void fill_network_boxes(network *net, int w, int h, float thresh, float hier, in } } if (l.type == GAUSSIAN_YOLO) { - int count = get_gaussian_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets); + int count = get_gaussian_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets, letter); dets += count; } if (l.type == REGION) { @@ -789,6 +789,7 @@ void free_detections(detection *dets, int n) int i; for (i = 0; i < n; ++i) { free(dets[i].prob); + if (dets[i].uc) free(dets[i].uc); if (dets[i].mask) free(dets[i].mask); } free(dets); diff --git a/src/parser.c b/src/parser.c index 445771ca4aa..1367d35b06a 100644 --- a/src/parser.c +++ b/src/parser.c @@ -424,12 +424,13 @@ int *parse_gaussian_yolo_mask(char *a, int *num) // Gaussian_YOLOv3 layer parse_gaussian_yolo(list *options, size_params params) // Gaussian_YOLOv3 { int classes = option_find_int(options, "classes", 20); + int max_boxes = option_find_int_quiet(options, "max", 90); int total = option_find_int(options, "num", 1); int num = total; char *a = option_find_str(options, "mask", 0); int *mask = parse_gaussian_yolo_mask(a, &num); - layer l = make_gaussian_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes); + layer l = make_gaussian_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes, max_boxes); assert(l.outputs == params.inputs); l.max_boxes = 
option_find_int_quiet(options, "max", 90); From 29c71a190acb82aa4beda8762e087b658f4b0347 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 26 Oct 2019 01:30:11 +0300 Subject: [PATCH 38/86] Fixed antialiasing=1 for [convolutional] layer --- src/convolutional_kernels.cu | 4 +++- src/maxpool_layer_kernels.cu | 4 +++- src/yolo_layer.c | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 23005ccb91e..cb5a1b99bae 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -628,11 +628,13 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state s.train = state.train; s.workspace = state.workspace; s.net = state.net; - s.delta = l.delta_gpu; + s.delta = l.delta_gpu; // s.delta will be returned to l.delta_gpu s.input = l.input_antialiasing_gpu; //if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) simple_copy_ongpu(l.input_layer->outputs*l.input_layer->batch, l.delta_gpu, l.input_layer->delta_gpu); backward_convolutional_layer_gpu(*(l.input_layer), s); + + simple_copy_ongpu(l.outputs*l.batch, l.input_antialiasing_gpu, l.output_gpu); } if(state.net.try_fix_nan) constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); diff --git a/src/maxpool_layer_kernels.cu b/src/maxpool_layer_kernels.cu index cc546a0b50c..8677b4d713d 100644 --- a/src/maxpool_layer_kernels.cu +++ b/src/maxpool_layer_kernels.cu @@ -199,11 +199,13 @@ extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network_state st s.train = state.train; s.workspace = state.workspace; s.net = state.net; - s.delta = layer.delta_gpu; + s.delta = layer.delta_gpu; // s.delta will be returned to l.delta_gpu s.input = layer.input_antialiasing_gpu; //if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) simple_copy_ongpu(layer.input_layer->outputs*layer.input_layer->batch, layer.delta_gpu, layer.input_layer->delta_gpu); backward_convolutional_layer_gpu(*(layer.input_layer), s); + + //simple_copy_ongpu(layer.outputs*layer.batch, layer.input_antialiasing_gpu, layer.output_gpu); } if (layer.maxpool_depth) { diff --git a/src/yolo_layer.c b/src/yolo_layer.c index 424811df6e6..906ed427f01 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -423,7 +423,8 @@ void forward_yolo_layer(const layer l, network_state state) } *(l.cost) = avg_iou_loss + classification_loss; } - printf("v3 (%s loss, Normalizer: (iou: %f, cls: %f) Region %d Avg (IOU: %f, GIOU: %f), Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", (l.iou_loss == MSE ? "mse" : (l.iou_loss == GIOU ? "giou" : "iou")), l.iou_normalizer, l.cls_normalizer, state.index, tot_iou / count, tot_giou / count, avg_cat / class_count, avg_obj / count, avg_anyobj / (l.w*l.h*l.n*l.batch), recall / count, recall75 / count, count); + printf("v3 (%s loss, Normalizer: (iou: %f, cls: %f) Region %d Avg (IOU: %f, GIOU: %f), Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", + (l.iou_loss == MSE ? "mse" : (l.iou_loss == GIOU ? 
"giou" : "iou")), l.iou_normalizer, l.cls_normalizer, state.index, tot_iou / count, tot_giou / count, avg_cat / class_count, avg_obj / count, avg_anyobj / (l.w*l.h*l.n*l.batch), recall / count, recall75 / count, count); } void backward_yolo_layer(const layer l, network_state state) From fa7687e6b5936532ce35a607509fa5736e8f3c0d Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 26 Oct 2019 01:40:59 +0300 Subject: [PATCH 39/86] cfg-param blur > 1 is fixed (bilateral filter removes textures) --- src/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data.c b/src/data.c index c0af1ab857d..6c9e565db89 100644 --- a/src/data.c +++ b/src/data.c @@ -922,7 +922,7 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo int min_w_h = fill_truth_detection(filename, boxes, truth, classes, flip, dx, dy, 1. / sx, 1. / sy, w, h); - if (min_w_h < blur*4) blur = 0; // disable blur if one of the objects is too small + if (min_w_h/4 < blur) blur = min_w_h / 4; // disable blur if one of the objects is too small image ai = image_data_augmentation(src, w, h, pleft, ptop, swidth, sheight, flip, dhue, dsat, dexp, blur, boxes, d.y.vals[i]); From f8c72acd42d73d0ee200d92b3d605f50e0928f5a Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 26 Oct 2019 14:47:00 +0300 Subject: [PATCH 40/86] Added scale_x_y to [Gaussian_yolo]. Fixed blur=10; --- src/data.c | 8 +++-- src/gaussian_yolo_layer.c | 67 +++++++++++++++++++++------------------ src/image_opencv.cpp | 5 ++- src/parser.c | 1 + 4 files changed, 48 insertions(+), 33 deletions(-) diff --git a/src/data.c b/src/data.c index 6c9e565db89..1b9117d93cf 100644 --- a/src/data.c +++ b/src/data.c @@ -875,7 +875,11 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo dexp = rand_scale(exposure); flip = use_flip ? random_gen() % 2 : 0; - blur = rand_int(0, 1) ? (use_blur) : 0; + + //blur = rand_int(0, 1) ? (use_blur) : 0; + int tmp_blur = rand_int(0, 2); // 0 - disable, 1 - blur background, 2 - blur the whole image + if (tmp_blur == 2) blur = use_blur; + else blur = tmp_blur; } int pleft = rand_precalc_random(-dw, dw, r1); @@ -922,7 +926,7 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo int min_w_h = fill_truth_detection(filename, boxes, truth, classes, flip, dx, dy, 1. / sx, 1. 
/ sy, w, h); - if (min_w_h/4 < blur) blur = min_w_h / 4; // disable blur if one of the objects is too small + if (min_w_h/4 < blur && blur > 1) blur = min_w_h / 4; // disable blur if one of the objects is too small image ai = image_data_augmentation(src, w, h, pleft, ptop, swidth, sheight, flip, dhue, dsat, dexp, blur, boxes, d.y.vals[i]); diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c index ddb43939ce9..3b58cc5a404 100644 --- a/src/gaussian_yolo_layer.c +++ b/src/gaussian_yolo_layer.c @@ -95,8 +95,8 @@ void resize_gaussian_yolo_layer(layer *l, int w, int h) l->outputs = h*w*l->n*(l->classes + 8 + 1); l->inputs = l->outputs; - l->output = realloc(l->output, l->batch*l->outputs * sizeof(float)); - l->delta = realloc(l->delta, l->batch*l->outputs * sizeof(float)); + l->output = (float *)realloc(l->output, l->batch*l->outputs * sizeof(float)); + l->delta = (float *)realloc(l->delta, l->batch*l->outputs * sizeof(float)); //if (!l->output_pinned) l->output = (float*)realloc(l->output, l->batch*l->outputs * sizeof(float)); //if (!l->delta_pinned) l->delta = (float*)realloc(l->delta, l->batch*l->outputs * sizeof(float)); @@ -187,17 +187,17 @@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind } -void delta_gaussian_yolo_class(float *output, float *delta, int index, int class, int classes, int stride, float *avg_cat) +void delta_gaussian_yolo_class(float *output, float *delta, int index, int class_id, int classes, int stride, float *avg_cat) { int n; if (delta[index]){ - delta[index + stride*class] = 1 - output[index + stride*class]; - if(avg_cat) *avg_cat += output[index + stride*class]; + delta[index + stride*class_id] = 1 - output[index + stride*class_id]; + if(avg_cat) *avg_cat += output[index + stride*class_id]; return; } for(n = 0; n < classes; ++n){ - delta[index + stride*n] = ((n == class)?1 : 0) - output[index + stride*n]; - if(n == class && avg_cat) *avg_cat += output[index + stride*n]; + delta[index + stride*n] = ((n == class_id)?1 : 0) - output[index + stride*n]; + if(n == class_id && avg_cat) *avg_cat += output[index + stride*n]; } } @@ -219,9 +219,11 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) // x : mu, sigma int index = entry_gaussian_index(l, b, n*l.w*l.h, 0); activate_array(l.output + index, 2*l.w*l.h, LOGISTIC); + scal_add_cpu(l.w*l.h, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output + index, 1); // scale x // y : mu, sigma index = entry_gaussian_index(l, b, n*l.w*l.h, 2); activate_array(l.output + index, 2*l.w*l.h, LOGISTIC); + scal_add_cpu(l.w*l.h, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output + index, 1); // scale y // w : sigma index = entry_gaussian_index(l, b, n*l.w*l.h, 5); activate_array(l.output + index, l.w*l.h, LOGISTIC); @@ -272,10 +274,10 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) if (best_iou > l.truth_thresh) { l.delta[obj_index] = 1 - l.output[obj_index]; - int class = state.truth[best_t*(4 + 1) + b*l.truths + 4]; - if (l.map) class = l.map[class]; + int class_id = state.truth[best_t*(4 + 1) + b*l.truths + 4]; + if (l.map) class_id = l.map[class_id]; int class_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 9); - delta_gaussian_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0); + delta_gaussian_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, 0); box truth = float_to_box_stride(state.truth + best_t*(4 + 1) + b*l.truths, 1); delta_gaussian_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, 
l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); } @@ -312,10 +314,10 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) avg_obj += l.output[obj_index]; l.delta[obj_index] = 1 - l.output[obj_index]; - int class = state.truth[t*(4 + 1) + b*l.truths + 4]; - if (l.map) class = l.map[class]; + int class_id = state.truth[t*(4 + 1) + b*l.truths + 4]; + if (l.map) class_id = l.map[class_id]; int class_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 9); - delta_gaussian_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat); + delta_gaussian_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, &avg_cat); ++count; ++class_count; @@ -433,24 +435,27 @@ int get_gaussian_yolo_detections(layer l, int w, int h, int netw, int neth, floa for(n = 0; n < l.n; ++n){ int obj_index = entry_gaussian_index(l, 0, n*l.w*l.h + i, 8); float objectness = predictions[obj_index]; - if(objectness <= thresh) continue; - int box_index = entry_gaussian_index(l, 0, n*l.w*l.h + i, 0); - dets[count].bbox = get_gaussian_yolo_box(predictions, l.biases, l.mask[n], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h); - dets[count].objectness = objectness; - dets[count].classes = l.classes; - - dets[count].uc[0] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 1)]; // tx uncertainty - dets[count].uc[1] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 3)]; // ty uncertainty - dets[count].uc[2] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 5)]; // tw uncertainty - dets[count].uc[3] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 7)]; // th uncertainty - - for(j = 0; j < l.classes; ++j){ - int class_index = entry_gaussian_index(l, 0, n*l.w*l.h + i, 9 + j); - float uc_aver = (dets[count].uc[0] + dets[count].uc[1] + dets[count].uc[2] + dets[count].uc[3]) / 4.0; - float prob = objectness*predictions[class_index]*(1.0-uc_aver); - dets[count].prob[j] = (prob > thresh) ? prob : 0; + if (objectness <= thresh) continue; // incorrect behavior for Nan values + + if (objectness > thresh) { + int box_index = entry_gaussian_index(l, 0, n*l.w*l.h + i, 0); + dets[count].bbox = get_gaussian_yolo_box(predictions, l.biases, l.mask[n], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h); + dets[count].objectness = objectness; + dets[count].classes = l.classes; + + dets[count].uc[0] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 1)]; // tx uncertainty + dets[count].uc[1] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 3)]; // ty uncertainty + dets[count].uc[2] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 5)]; // tw uncertainty + dets[count].uc[3] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 7)]; // th uncertainty + + for (j = 0; j < l.classes; ++j) { + int class_index = entry_gaussian_index(l, 0, n*l.w*l.h + i, 9 + j); + float uc_aver = (dets[count].uc[0] + dets[count].uc[1] + dets[count].uc[2] + dets[count].uc[3]) / 4.0; + float prob = objectness*predictions[class_index] * (1.0 - uc_aver); + dets[count].prob[j] = (prob > thresh) ? 
prob : 0; + } + ++count; } - ++count; } } correct_gaussian_yolo_boxes(dets, count, w, h, netw, neth, relative, letter); @@ -470,9 +475,11 @@ void forward_gaussian_yolo_layer_gpu(const layer l, network_state state) // x : mu, sigma int index = entry_gaussian_index(l, b, n*l.w*l.h, 0); activate_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC); + scal_add_ongpu(l.w*l.h, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output_gpu + index, 1); // scale x // y : mu, sigma index = entry_gaussian_index(l, b, n*l.w*l.h, 2); activate_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC); + scal_add_ongpu(l.w*l.h, l.scale_x_y, -0.5*(l.scale_x_y - 1), l.output_gpu + index, 1); // scale y // w : sigma index = entry_gaussian_index(l, b, n*l.w*l.h, 5); activate_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC); diff --git a/src/image_opencv.cpp b/src/image_opencv.cpp index d4a69e6f197..a9d3b560cd8 100644 --- a/src/image_opencv.cpp +++ b/src/image_opencv.cpp @@ -1196,7 +1196,10 @@ image image_data_augmentation(mat_cv* mat, int w, int h, if (blur) { cv::Mat dst(sized.size(), sized.type()); - if(blur == 1) cv::GaussianBlur(sized, dst, cv::Size(31, 31), 0); + if (blur == 1) { + //cv::GaussianBlur(sized, dst, cv::Size(31, 31), 0); + cv::bilateralFilter(sized, dst, 31, 75, 75); + } else { int ksize = (blur / 2) * 2 + 1; cv::Size kernel_size = cv::Size(ksize, ksize); diff --git a/src/parser.c b/src/parser.c index 1367d35b06a..e0962f02edc 100644 --- a/src/parser.c +++ b/src/parser.c @@ -433,6 +433,7 @@ layer parse_gaussian_yolo(list *options, size_params params) // Gaussian_YOLOv3 layer l = make_gaussian_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes, max_boxes); assert(l.outputs == params.inputs); + l.scale_x_y = option_find_float_quiet(options, "scale_x_y", 1); l.max_boxes = option_find_int_quiet(options, "max", 90); l.jitter = option_find_float(options, "jitter", .2); From 69b7a19f5bfce760debf2057b71b0882c535f350 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sun, 27 Oct 2019 02:45:31 +0300 Subject: [PATCH 41/86] minor fix --- src/data.c | 3 ++- src/parser.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/data.c b/src/data.c index 1b9117d93cf..2780a82c85b 100644 --- a/src/data.c +++ b/src/data.c @@ -841,7 +841,8 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo d.y = make_matrix(n, 5*boxes); int i_mixup = 0; for (i_mixup = 0; i_mixup <= mixup; i_mixup++) { - if (i_mixup) augmentation_calculated = 0; + if (i_mixup) augmentation_calculated = 0; // recalculate augmentation for the 2nd sequence if(track==1) + for (i = 0; i < n; ++i) { float *truth = (float*)calloc(5 * boxes, sizeof(float)); const char *filename = (i_mixup) ? mixup_random_paths[i] : random_paths[i]; diff --git a/src/parser.c b/src/parser.c index e0962f02edc..f425560f04a 100644 --- a/src/parser.c +++ b/src/parser.c @@ -409,7 +409,7 @@ int *parse_gaussian_yolo_mask(char *a, int *num) // Gaussian_YOLOv3 for (i = 0; i < len; ++i) { if (a[i] == ',') ++n; } - mask = calloc(n, sizeof(int)); + mask = (int *)calloc(n, sizeof(int)); for (i = 0; i < n; ++i) { int val = atoi(a); mask[i] = val; From 6e736339259142c87bd86fd7362eac4e6043a8cc Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Tue, 29 Oct 2019 14:53:57 +0300 Subject: [PATCH 42/86] Fixed Blur and Try to use new Assisted Excitation. 
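
Notes on the schedule used below: alpha decays from 1 to 0 along a
half-cosine over the window [burn_in, assisted_excitation], and the excited
region is no longer the bare ground-truth box - it is inflated by
beta = 1 - alpha, so it starts at the box and grows toward the whole image,
while assisted_activation2_gpu() is now applied with constant strength 1.
A minimal standalone C sketch of that schedule (illustrative names only,
not code from this patch):

    #include <math.h>

    /* it = current iteration, ae = cfg value assisted_excitation (> 1) */
    static void ae_box_inflation(int it, int burn_in, int ae,
                                 float truth_w, float truth_h,
                                 float *dw, float *dh)
    {
        float alpha = (1 + cosf(3.141592f * it / (float)(burn_in + ae))) / 2; /* 1 -> 0 */
        float beta = 1 - alpha;                                               /* 0 -> 1 */
        *dw = (1 - truth_w) * beta;  /* extra size: the excited area grows */
        *dh = (1 - truth_h) * beta;  /* from the gt box toward the image   */
    }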
--- src/convolutional_kernels.cu | 33 +++++++++++++++++++++++---------- src/data.c | 2 +- src/image_opencv.cpp | 2 +- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index cb5a1b99bae..0b94dd29db4 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -978,10 +978,14 @@ void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)) / 2; //float alpha = (1 + cos(3.141592 * iteration_num / state.net.max_batches)); - if (l.assisted_excitation > 1) { - if (iteration_num < state.net.burn_in) alpha = 0; - else if (iteration_num > l.assisted_excitation) alpha = 0; - else alpha = (1 + cos(3.141592 * iteration_num / l.assisted_excitation)) / 2; + if (l.assisted_excitation == 1) { + if (iteration_num > state.net.max_batches / 2) return; + } + else { + if (iteration_num < state.net.burn_in) return; + else if (iteration_num > l.assisted_excitation) return; + else + alpha = (1 + cos(3.141592 * iteration_num / (state.net.burn_in + l.assisted_excitation))) / 2; // from 1 to 0 } //printf("\n epoch = %f, alpha = %f, seen = %d, max_batches = %d, train_images_num = %d \n", @@ -1011,11 +1015,19 @@ void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) for (t = 0; t < state.net.num_boxes; ++t) { box truth = float_to_box_stride(truth_cpu + t*(4 + 1) + b*l.truths, 1); if (!truth.x) break; // continue; - - int left = floor((truth.x - truth.w / 2) * l.out_w); - int right = ceil((truth.x + truth.w / 2) * l.out_w); - int top = floor((truth.y - truth.h / 2) * l.out_h); - int bottom = ceil((truth.y + truth.h / 2) * l.out_h); + float beta = 1 - alpha; // from 0 to 1 + float dw = (1 - truth.w) * beta; + float dh = (1 - truth.h) * beta; + //printf(" alpha = %f, beta = %f, truth.w = %f, dw = %f, tw+dw = %f, l.out_w = %d \n", alpha, beta, truth.w, dw, truth.w+dw, l.out_w); + + int left = floor((truth.x - (dw + truth.w) / 2) * l.out_w); + int right = ceil((truth.x + (dw + truth.w) / 2) * l.out_w); + int top = floor((truth.y - (dh + truth.h) / 2) * l.out_h); + int bottom = ceil((truth.y + (dh + truth.h) / 2) * l.out_h); + if (left < 0) left = 0; + if (top < 0) top = 0; + if (right > l.out_w) right = l.out_w; + if (bottom > l.out_h) bottom = l.out_h; for (w = left; w <= right; w++) { for (h = top; h < bottom; h++) { @@ -1035,7 +1047,8 @@ void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) //CHECK_CUDA(cudaPeekAtLastError()); // calc new output - assisted_activation2_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); + assisted_activation2_gpu(1, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); // AE3: gt increases (beta = 1 - alpha = 0) + //assisted_activation2_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); //assisted_activation_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); //cudaStreamSynchronize(get_cuda_stream()); //CHECK_CUDA(cudaPeekAtLastError()); diff --git a/src/data.c b/src/data.c index 2780a82c85b..622e401ef75 100644 --- a/src/data.c +++ b/src/data.c @@ -927,7 +927,7 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int c, int bo int min_w_h = fill_truth_detection(filename, boxes, truth, classes, flip, dx, dy, 1. / sx, 1. 
/ sy, w, h);
 
-        if (min_w_h/4 < blur && blur > 1) blur = min_w_h / 4; // disable blur if one of the objects is too small
+        if (min_w_h / 8 < blur && blur > 1) blur = min_w_h / 8; // disable blur if one of the objects is too small
 
         image ai = image_data_augmentation(src, w, h, pleft, ptop, swidth, sheight, flip, dhue, dsat, dexp,
             blur, boxes, d.y.vals[i]);
diff --git a/src/image_opencv.cpp b/src/image_opencv.cpp
index a9d3b560cd8..912a0b1b6d2 100644
--- a/src/image_opencv.cpp
+++ b/src/image_opencv.cpp
@@ -1198,7 +1198,7 @@ image image_data_augmentation(mat_cv* mat, int w, int h,
         cv::Mat dst(sized.size(), sized.type());
         if (blur == 1) {
             //cv::GaussianBlur(sized, dst, cv::Size(31, 31), 0);
-            cv::bilateralFilter(sized, dst, 31, 75, 75);
+            cv::bilateralFilter(sized, dst, 17, 75, 75);
         }
         else {
             int ksize = (blur / 2) * 2 + 1;

From 4d9addedd719cfb5d68cb3b39d5c7f9193071c0d Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Tue, 29 Oct 2019 14:55:12 +0300
Subject: [PATCH 43/86] Fixed legacy compilation using MSVS (gaussian_yolo_layer)

---
 build/darknet/darknet_no_gpu.vcxproj      | 2 ++
 build/darknet/yolo_cpp_dll.vcxproj        | 2 ++
 build/darknet/yolo_cpp_dll_no_gpu.vcxproj | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/build/darknet/darknet_no_gpu.vcxproj b/build/darknet/darknet_no_gpu.vcxproj
index dcfce05d057..72d23af3d09 100644
--- a/build/darknet/darknet_no_gpu.vcxproj
+++ b/build/darknet/darknet_no_gpu.vcxproj
@@ -203,6 +203,7 @@
+    <ClCompile Include="..\..\src\gaussian_yolo_layer.c" />
@@ -267,6 +268,7 @@
+    <ClInclude Include="..\..\src\gaussian_yolo_layer.h" />
diff --git a/build/darknet/yolo_cpp_dll.vcxproj b/build/darknet/yolo_cpp_dll.vcxproj
index d3f60a0260f..813cae31e9d 100644
--- a/build/darknet/yolo_cpp_dll.vcxproj
+++ b/build/darknet/yolo_cpp_dll.vcxproj
@@ -201,6 +201,7 @@
+    <ClCompile Include="..\..\src\gaussian_yolo_layer.c" />
@@ -267,6 +268,7 @@
+    <ClInclude Include="..\..\src\gaussian_yolo_layer.h" />
diff --git a/build/darknet/yolo_cpp_dll_no_gpu.vcxproj b/build/darknet/yolo_cpp_dll_no_gpu.vcxproj
index f719d3f28b7..c9f3ddfa29c 100644
--- a/build/darknet/yolo_cpp_dll_no_gpu.vcxproj
+++ b/build/darknet/yolo_cpp_dll_no_gpu.vcxproj
@@ -187,6 +187,7 @@
+    <ClCompile Include="..\..\src\gaussian_yolo_layer.c" />
@@ -253,6 +254,7 @@
+    <ClInclude Include="..\..\src\gaussian_yolo_layer.h" />

From 52e3bb252fddadf73957000303fa1a2ca247e306 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Mon, 4 Nov 2019 14:39:20 +0300
Subject: [PATCH 44/86] Use non-blocking sockets for JSON_server and MJPEG_server

---
 include/yolo_v2_class.hpp |  1 +
 src/http_stream.cpp       | 28 ++++++++++++++++++++++++++++
 src/parser.c              |  3 +++
 src/yolo_v2_class.cpp     |  9 ++++++---
 4 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/include/yolo_v2_class.hpp b/include/yolo_v2_class.hpp
index f25610d0424..5543df7f8fc 100644
--- a/include/yolo_v2_class.hpp
+++ b/include/yolo_v2_class.hpp
@@ -70,6 +70,7 @@ extern "C" LIB_API void send_json_custom(char const* send_buf, int port, int tim
 class Detector {
     std::shared_ptr<void> detector_gpu_ptr;
     std::deque<std::vector<bbox_t>> prev_bbox_vec_deque;
+    std::string _cfg_filename, _weight_filename;
 public:
     const int cur_gpu_id;
     float nms = .4;
diff --git a/src/http_stream.cpp b/src/http_stream.cpp
index af2e6730c74..927a65bcf35 100644
--- a/src/http_stream.cpp
+++ b/src/http_stream.cpp
@@ -148,6 +148,20 @@ class JSON_sender
         if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof(reuse)) < 0)
             cerr << "setsockopt(SO_REUSEADDR) failed" << endl;
 
+        // Non-blocking sockets
+        // Windows: ioctlsocket() and FIONBIO
+        // Linux: fcntl() and O_NONBLOCK
+#ifdef WIN32
+        unsigned long i_mode = 1;
+        int result = ioctlsocket(sock, FIONBIO, &i_mode);
+        if (result != NO_ERROR) {
+            std::cerr << "ioctlsocket(FIONBIO) failed with error: " << result << std::endl;
+        }
+#else // WIN32
+        int flags = fcntl(sock, F_GETFL, 0);
+        fcntl(sock, F_SETFL, flags | O_NONBLOCK);
+#endif // WIN32
+
 #ifdef SO_REUSEPORT
         if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (const char*)&reuse, sizeof(reuse)) < 0)
             cerr << "setsockopt(SO_REUSEPORT) failed" << endl;
@@ -375,6 +389,20 @@ class MJPG_sender
         if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&reuse, sizeof(reuse)) < 0)
             cerr << "setsockopt(SO_REUSEADDR) failed" << endl;
 
+        // Non-blocking sockets
+        // Windows: ioctlsocket() and FIONBIO
+        // Linux: fcntl() and O_NONBLOCK
+#ifdef WIN32
+        unsigned long i_mode = 1;
+        int result = ioctlsocket(sock, FIONBIO, &i_mode);
+        if (result != NO_ERROR) {
+            std::cerr << "ioctlsocket(FIONBIO) failed with error: " << result << std::endl;
+        }
+#else // WIN32
+        int flags = fcntl(sock, F_GETFL, 0);
+        fcntl(sock, F_SETFL, flags | O_NONBLOCK);
+#endif // WIN32
+
 #ifdef SO_REUSEPORT
         if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, (const char*)&reuse, sizeof(reuse)) < 0)
             cerr << "setsockopt(SO_REUSEPORT) failed" << endl;
diff --git a/src/parser.c b/src/parser.c
index f425560f04a..2cf295f070e 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -168,6 +168,9 @@ convolutional_layer parse_convolutional(list *options, size_params params, netwo
         if (stride_x < 1) stride_x = stride;
         if (stride_y < 1) stride_y = stride;
     }
+    else {
+        stride = option_find_int_quiet(options, "stride", 1);
+    }
     int dilation = option_find_int_quiet(options, "dilation", 1);
     int antialiasing = option_find_int_quiet(options, "antialiasing", 0);
     if (size == 1) dilation = 1;
diff --git a/src/yolo_v2_class.cpp b/src/yolo_v2_class.cpp
index 4667794a839..e01dadbb306 100644
--- a/src/yolo_v2_class.cpp
+++ b/src/yolo_v2_class.cpp
@@ -147,8 +147,11 @@ LIB_API Detector::Detector(std::string cfg_filename, std::string weight_filename
     net.gpu_index = cur_gpu_id;
     //gpu_index = i;
 
-    char *cfgfile = const_cast<char *>(cfg_filename.data());
-    char *weightfile = const_cast<char *>(weight_filename.data());
+    _cfg_filename = cfg_filename;
+    _weight_filename = weight_filename;
+
+    char *cfgfile = const_cast<char *>(_cfg_filename.c_str());
+    char *weightfile = const_cast<char *>(_weight_filename.c_str());
 
     net = parse_network_cfg_custom(cfgfile, 1, 0);
     if (weightfile) {
@@ -243,7 +246,7 @@ static image load_image_stb(char *filename, int channels)
 
 LIB_API image_t Detector::load_image(std::string image_filename)
 {
-    char *input = const_cast<char *>(image_filename.data());
+    char *input = const_cast<char *>(image_filename.c_str());
     image im = load_image_stb(input, 3);
 
     image_t img;

From 7018be435f06a9ac26c5b7494ac96068dbdf497a Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Mon, 4 Nov 2019 14:45:34 +0300
Subject: [PATCH 45/86] Compile fix

---
 src/http_stream.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/http_stream.cpp b/src/http_stream.cpp
index 927a65bcf35..781139b3dff 100644
--- a/src/http_stream.cpp
+++ b/src/http_stream.cpp
@@ -50,6 +50,7 @@ static int close_socket(SOCKET s) {
 }
 #else // _WIN32 - else: nix
 #include "darkunistd.h"
+#include <fcntl.h>
 #include 
 #include 
 #include 

From e345b8793c4dd8c28e0854145ac15b912b771bc6 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Tue, 5 Nov 2019 16:06:24 +0300
Subject: [PATCH 46/86] Fixed ./darknet partial for Anti-aliasing

---
 src/convolutional_layer.c | 19 ++++---------------
 src/maxpool_layer.c       | 19 ++++---------------
 src/parser.c              |  2 ++
 3 files changed, 10 insertions(+), 30 deletions(-)

diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index 6818b603529..bf5beac7972 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -607,19 +607,6 @@ convolutional_layer make_convolutional_layer(int batch, int
steps, int h, int w, } else { for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) { - /* - l.input_layer->weights[i + 0] = 0; - l.input_layer->weights[i + 1] = 0; - l.input_layer->weights[i + 2] = 0; - - l.input_layer->weights[i + 3] = 0; - l.input_layer->weights[i + 4] = 1; - l.input_layer->weights[i + 5] = 0; - - l.input_layer->weights[i + 6] = 0; - l.input_layer->weights[i + 7] = 0; - l.input_layer->weights[i + 8] = 0; - */ l.input_layer->weights[i + 0] = 1 / 16.f; l.input_layer->weights[i + 1] = 2 / 16.f; l.input_layer->weights[i + 2] = 1 / 16.f; @@ -635,8 +622,10 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, } for (i = 0; i < n; ++i) l.input_layer->biases[i] = 0; #ifdef GPU - l.input_antialiasing_gpu = cuda_make_array(NULL, l.batch*l.outputs); - push_convolutional_layer(*(l.input_layer)); + if (gpu_index >= 0) { + l.input_antialiasing_gpu = cuda_make_array(NULL, l.batch*l.outputs); + push_convolutional_layer(*(l.input_layer)); + } #endif // GPU } diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c index dca9c1b15f7..a05074ecc74 100644 --- a/src/maxpool_layer.c +++ b/src/maxpool_layer.c @@ -127,19 +127,6 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s } else { for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) { - /* - l.input_layer->weights[i + 0] = 0; - l.input_layer->weights[i + 1] = 0; - l.input_layer->weights[i + 2] = 0; - - l.input_layer->weights[i + 3] = 0; - l.input_layer->weights[i + 4] = 1; - l.input_layer->weights[i + 5] = 0; - - l.input_layer->weights[i + 6] = 0; - l.input_layer->weights[i + 7] = 0; - l.input_layer->weights[i + 8] = 0; - */ l.input_layer->weights[i + 0] = 1 / 16.f; l.input_layer->weights[i + 1] = 2 / 16.f; l.input_layer->weights[i + 2] = 1 / 16.f; @@ -155,8 +142,10 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s } for (i = 0; i < l.out_c; ++i) l.input_layer->biases[i] = 0; #ifdef GPU - l.input_antialiasing_gpu = cuda_make_array(NULL, l.batch*l.outputs); - push_convolutional_layer(*(l.input_layer)); + if (gpu_index >= 0) { + l.input_antialiasing_gpu = cuda_make_array(NULL, l.batch*l.outputs); + push_convolutional_layer(*(l.input_layer)); + } #endif // GPU } diff --git a/src/parser.c b/src/parser.c index 2cf295f070e..edda54d1d92 100644 --- a/src/parser.c +++ b/src/parser.c @@ -951,6 +951,8 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps) params.inputs = net.inputs; if (batch > 0) net.batch = batch; if (time_steps > 0) net.time_steps = time_steps; + if (net.batch < 1) net.batch = 1; + if (net.time_steps < 1) net.time_steps = 1; if (net.batch < net.time_steps) net.batch = net.time_steps; params.batch = net.batch; params.time_steps = net.time_steps; From d628e8eab7371f136cc05ae090b904eede7f9c55 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Tue, 5 Nov 2019 16:57:03 +0300 Subject: [PATCH 47/86] Fixed darknet.py for Uncertainty (gaussian_yolo_layer) --- build/darknet/x64/darknet.py | 3 ++- darknet.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/build/darknet/x64/darknet.py b/build/darknet/x64/darknet.py index 10c9a456ebd..5cfb26ba16e 100644 --- a/build/darknet/x64/darknet.py +++ b/build/darknet/x64/darknet.py @@ -59,7 +59,8 @@ class DETECTION(Structure): ("prob", POINTER(c_float)), ("mask", POINTER(c_float)), ("objectness", c_float), - ("sort_class", c_int)] + ("sort_class", c_int), + ("uc", POINTER(c_float))] class IMAGE(Structure): diff --git a/darknet.py b/darknet.py index 
10c9a456ebd..5cfb26ba16e 100644 --- a/darknet.py +++ b/darknet.py @@ -59,7 +59,8 @@ class DETECTION(Structure): ("prob", POINTER(c_float)), ("mask", POINTER(c_float)), ("objectness", c_float), - ("sort_class", c_int)] + ("sort_class", c_int), + ("uc", POINTER(c_float))] class IMAGE(Structure): From bf8ea4183dc265ac17f7c9d939dc815269f0a213 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Thu, 7 Nov 2019 01:15:58 +0300 Subject: [PATCH 48/86] Added MISH activation, use activation=mish in [convolutional] layers --- include/darknet.h | 6 +++--- src/activation_kernels.cu | 36 ++++++++++++++++++++++++++++++++++++ src/activations.c | 25 +++++++++++++++++++++++++ src/activations.h | 6 +++++- src/convolutional_kernels.cu | 9 ++++++--- src/convolutional_layer.c | 13 ++++++++----- src/layer.c | 4 ++-- 7 files changed, 85 insertions(+), 14 deletions(-) diff --git a/include/darknet.h b/include/darknet.h index 00b49921f52..55f94ac5224 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -102,7 +102,7 @@ typedef struct tree { // activations.h typedef enum { - LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN, SELU, SWISH + LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN, SELU, SWISH, MISH }ACTIVATION; // parser.h @@ -347,7 +347,7 @@ struct layer { float *col_image; float * delta; float * output; - float * output_sigmoid; + float * activation_input; int delta_pinned; int output_pinned; float * loss; @@ -532,7 +532,7 @@ struct layer { float * input_antialiasing_gpu; float * output_gpu; - float * output_sigmoid_gpu; + float * activation_input_gpu; float * loss_gpu; float * delta_gpu; float * rand_gpu; diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu index 24563c69d6e..846c586fada 100644 --- a/src/activation_kernels.cu +++ b/src/activation_kernels.cu @@ -199,6 +199,16 @@ __global__ void activate_array_swish_kernel(float *x, int n, float *output_sigmo } } +__global__ void activate_array_mish_kernel(float *x, int n, float *activation_input, float *output_gpu) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if (i < n) { + float x_val = x[i]; + activation_input[i] = x_val; // store value before activation + output_gpu[i] = x_val * tanh_activate_kernel(log(1 + expf(x_val))); + } +} + __global__ void activate_array_leaky_kernel(float *x, int n) { int index = blockIdx.x*blockDim.x + threadIdx.x; @@ -263,6 +273,18 @@ __global__ void gradient_array_swish_kernel(float *x, int n, float *sigmoid_gpu, } } +__global__ void gradient_array_mish_kernel(int n, float *activation_input, float *delta) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if (i < n) { + float x = activation_input[i]; + float d = 2 * expf(x) + expf(2 * x) + 2; + float w = 4 * (x + 1) + 4 * expf(2 * x) + expf(3 * x) + expf(x)*(4 * x + 6); + float derivative = expf(x) * w / (d * d); + delta[i] *= derivative; + } +} + __global__ void gradient_array_leaky_kernel(float *x, int n, float *delta) { int index = blockIdx.x*blockDim.x + threadIdx.x; @@ -333,6 +355,13 @@ extern "C" void activate_array_swish_ongpu(float *x, int n, float *output_sigmoi CHECK_CUDA(cudaPeekAtLastError()); } +extern "C" void activate_array_mish_ongpu(float *x, int n, float *activation_input_gpu, float *output_gpu) +{ + const int num_blocks = get_number_of_blocks(n, BLOCK); + activate_array_mish_kernel << > >(x, n, activation_input_gpu, output_gpu); + CHECK_CUDA(cudaPeekAtLastError()); +} + extern "C" void 
 extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta)
 {
     const int num_blocks = get_number_of_blocks(n, BLOCK);
@@ -354,4 +383,11 @@ extern "C" void gradient_array_swish_ongpu(float *x, int n, float *sigmoid_gpu,
     const int num_blocks = get_number_of_blocks(n, BLOCK);
     gradient_array_swish_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (x, n, sigmoid_gpu, delta);
     CHECK_CUDA(cudaPeekAtLastError());
+}
+
+extern "C" void gradient_array_mish_ongpu(int n, float *activation_input_gpu, float *delta)
+{
+    const int num_blocks = get_number_of_blocks(n, BLOCK);
+    gradient_array_mish_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> > (n, activation_input_gpu, delta);
+    CHECK_CUDA(cudaPeekAtLastError());
 }
\ No newline at end of file
diff --git a/src/activations.c b/src/activations.c
index c3ea4818aeb..5311fb97a77 100644
--- a/src/activations.c
+++ b/src/activations.c
@@ -46,6 +46,7 @@ ACTIVATION get_activation(char *s)
 {
     if (strcmp(s, "logistic")==0) return LOGISTIC;
     if (strcmp(s, "swish") == 0) return SWISH;
+    if (strcmp(s, "mish") == 0) return MISH;
     if (strcmp(s, "loggy")==0) return LOGGY;
     if (strcmp(s, "relu")==0) return RELU;
     if (strcmp(s, "elu")==0) return ELU;
@@ -133,6 +134,17 @@ void activate_array_swish(float *x, const int n, float * output_sigmoid, float *
     }
 }
 
+void activate_array_mish(float *x, const int n, float * activation_input, float * output)
+{
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < n; ++i) {
+        float x_val = x[i];
+        activation_input[i] = x_val;    // store value before activation
+        output[i] = x_val * tanh_activate(log(1 + expf(x_val)));
+    }
+}
+
 float gradient(float x, ACTIVATION a)
 {
     switch(a){
@@ -187,3 +199,16 @@ void gradient_array_swish(const float *x, const int n, const float * sigmoid, fl
         delta[i] *= swish + sigmoid[i]*(1 - swish);
     }
 }
+
+void gradient_array_mish(const int n, const float * activation_input, float * delta)
+{
+    int i;
+    #pragma omp parallel for
+    for (i = 0; i < n; ++i) {
+        float x = activation_input[i];
+        float d = 2 * expf(x) + expf(2 * x) + 2;
+        float w = 4 * (x + 1) + 4 * expf(2 * x) + expf(3 * x) + expf(x)*(4 * x + 6);
+        float derivative = expf(x) * w / (d * d);
+        delta[i] *= derivative;
+    }
+}
diff --git a/src/activations.h b/src/activations.h
index 19f3822c8e3..bba5ca8d10a 100644
--- a/src/activations.h
+++ b/src/activations.h
@@ -5,7 +5,7 @@
 #include "math.h"
 
 //typedef enum{
-// LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN, SELU
+// LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN, SELU, SWISH, MISH
 //}ACTIVATION;
 
 #ifdef __cplusplus
@@ -18,13 +18,17 @@ float activate(float x, ACTIVATION a);
 float gradient(float x, ACTIVATION a);
 void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
 void gradient_array_swish(const float *x, const int n, const float * sigmoid, float * delta);
+void gradient_array_mish(const int n, const float * activation_input, float * delta);
 void activate_array(float *x, const int n, const ACTIVATION a);
 void activate_array_swish(float *x, const int n, float * output_sigmoid, float * output);
+void activate_array_mish(float *x, const int n, float * activation_input, float * output);
 
 #ifdef GPU
 void activate_array_ongpu(float *x, int n, ACTIVATION a);
 void activate_array_swish_ongpu(float *x, int n, float *output_sigmoid_gpu, float *output_gpu);
+void activate_array_mish_ongpu(float *x, int n, float *activation_input_gpu, float *output_gpu);
 void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta);
 void gradient_array_swish_ongpu(float *x, int n, float 
*sigmoid_gpu, float *delta); +void gradient_array_mish_ongpu(int n, float *activation_input_gpu, float *delta); #endif static inline float stair_activate(float x) diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 0b94dd29db4..a73f277ee92 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -392,7 +392,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) */ //add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); - if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.output_sigmoid_gpu, l.output_gpu); + if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); + else if (l.activation == MISH) activate_array_mish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); else if (l.activation != LINEAR && l.activation != LEAKY) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); //if(l.activation != LINEAR && l.activation != LEAKY) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); //if (l.binary || l.xnor) swap_binary(&l); @@ -596,7 +597,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) //#ifndef CUDNN_HALF //#endif // no CUDNN_HALF - if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.output_sigmoid_gpu, l.output_gpu); + if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); + else if (l.activation == MISH) activate_array_mish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); else if (l.activation != LINEAR) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); //if(l.dot > 0) dot_error_gpu(l); if(l.binary || l.xnor) swap_binary(&l); @@ -639,7 +641,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state if(state.net.try_fix_nan) constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); - if (l.activation == SWISH) gradient_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.output_sigmoid_gpu, l.delta_gpu); + if (l.activation == SWISH) gradient_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu); + else if (l.activation == MISH) gradient_array_mish_ongpu(l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu); else gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); if (!l.batch_normalize) diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index bf5beac7972..b76d7ee735f 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -473,10 +473,10 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.scale_v = (float*)calloc(n, sizeof(float)); } - if(l.activation == SWISH) l.output_sigmoid = (float*)calloc(total_batch*l.outputs, sizeof(float)); + if (l.activation == SWISH || l.activation == MISH) l.activation_input = (float*)calloc(total_batch*l.outputs, sizeof(float)); #ifdef GPU - if (l.activation == SWISH) l.output_sigmoid_gpu = cuda_make_array(l.output_sigmoid, total_batch*out_h*out_w*n); + if (l.activation == SWISH || l.activation == MISH) l.activation_input_gpu = cuda_make_array(l.activation_input, total_batch*out_h*out_w*n); l.forward_gpu = forward_convolutional_layer_gpu; l.backward_gpu = backward_convolutional_layer_gpu; @@ -1100,7 +1100,8 @@ void 
forward_convolutional_layer(convolutional_layer l, network_state state) add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); //activate_array(l.output, m*n*l.batch, l.activation); - if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.output_sigmoid, l.output); + if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output); + else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output); else activate_array_cpu_custom(l.output, m*n*l.batch, l.activation); return; @@ -1139,7 +1140,8 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); //activate_array(l.output, m*n*l.batch, l.activation); - if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.output_sigmoid, l.output); + if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output); + else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output); else activate_array_cpu_custom(l.output, l.outputs*l.batch, l.activation); if(l.binary || l.xnor) swap_binary(&l); @@ -1276,7 +1278,8 @@ void backward_convolutional_layer(convolutional_layer l, network_state state) int n = l.size*l.size*l.c / l.groups; int k = l.out_w*l.out_h; - if (l.activation == SWISH) gradient_array_swish(l.output, l.outputs*l.batch, l.output_sigmoid, l.delta); + if (l.activation == SWISH) gradient_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.delta); + else if (l.activation == MISH) gradient_array_mish(l.outputs*l.batch, l.activation_input, l.delta); else gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); if (l.batch_normalize) { diff --git a/src/layer.c b/src/layer.c index e9ae67b5ff5..9fe4a439364 100644 --- a/src/layer.c +++ b/src/layer.c @@ -90,7 +90,7 @@ void free_layer(layer l) #endif // GPU if (l.delta) free(l.delta), l.delta = NULL; if (l.output) free(l.output), l.output = NULL; - if (l.output_sigmoid) free(l.output_sigmoid), l.output_sigmoid = NULL; + if (l.activation_input) free(l.activation_input), l.activation_input = NULL; if (l.squared) free(l.squared); if (l.norms) free(l.norms); if (l.spatial_mean) free(l.spatial_mean); @@ -176,7 +176,7 @@ void free_layer(layer l) if (l.scale_updates_gpu) cuda_free(l.scale_updates_gpu), l.scale_updates_gpu = NULL; if (l.input_antialiasing_gpu) cuda_free(l.input_antialiasing_gpu), l.input_antialiasing_gpu = NULL; if (l.output_gpu) cuda_free(l.output_gpu), l.output_gpu = NULL; - if (l.output_sigmoid_gpu) cuda_free(l.output_sigmoid_gpu), l.output_sigmoid_gpu = NULL; + if (l.activation_input_gpu) cuda_free(l.activation_input_gpu), l.activation_input_gpu = NULL; if (l.delta_gpu) cuda_free(l.delta_gpu), l.delta_gpu = NULL; if (l.rand_gpu) cuda_free(l.rand_gpu); if (l.squared_gpu) cuda_free(l.squared_gpu); From c7c7078de7952fbdcfe7b4a7d60303e56e84846b Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Thu, 7 Nov 2019 01:18:47 +0300 Subject: [PATCH 49/86] Added URL to the description of activation MISH on GitHub --- src/activation_kernels.cu | 2 ++ src/activations.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu index 846c586fada..d8ff25f42e3 100644 --- a/src/activation_kernels.cu +++ b/src/activation_kernels.cu @@ -199,6 +199,7 @@ __global__ void activate_array_swish_kernel(float *x, int n, float *output_sigmo } } +// 
https://github.com/digantamisra98/Mish __global__ void activate_array_mish_kernel(float *x, int n, float *activation_input, float *output_gpu) { int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; @@ -273,6 +274,7 @@ __global__ void gradient_array_swish_kernel(float *x, int n, float *sigmoid_gpu, } } +// https://github.com/digantamisra98/Mish __global__ void gradient_array_mish_kernel(int n, float *activation_input, float *delta) { int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; diff --git a/src/activations.c b/src/activations.c index 5311fb97a77..da92af0af38 100644 --- a/src/activations.c +++ b/src/activations.c @@ -134,6 +134,7 @@ void activate_array_swish(float *x, const int n, float * output_sigmoid, float * } } +// https://github.com/digantamisra98/Mish void activate_array_mish(float *x, const int n, float * activation_input, float * output) { int i; @@ -200,6 +201,7 @@ void gradient_array_swish(const float *x, const int n, const float * sigmoid, fl } } +// https://github.com/digantamisra98/Mish void gradient_array_mish(const int n, const float * activation_input, float * delta) { int i; From 704bd1138e967bedf30ba4292369b3057aa5abeb Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Thu, 7 Nov 2019 23:49:10 +0300 Subject: [PATCH 50/86] Fixed params.net in parse_convolutional() and parse_route() --- src/parser.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/parser.c b/src/parser.c index edda54d1d92..52f3edceead 100644 --- a/src/parser.c +++ b/src/parser.c @@ -154,7 +154,7 @@ local_layer parse_local(list *options, size_params params) return layer; } -convolutional_layer parse_convolutional(list *options, size_params params, network net) +convolutional_layer parse_convolutional(list *options, size_params params) { int n = option_find_int(options, "filters",1); int groups = option_find_int_quiet(options, "groups", 1); @@ -185,8 +185,8 @@ convolutional_layer parse_convolutional(list *options, size_params params, netwo int share_index = option_find_int_quiet(options, "share_index", -1000000000); convolutional_layer *share_layer = NULL; - if(share_index >= 0) share_layer = &net.layers[share_index]; - else if(share_index != -1000000000) share_layer = &net.layers[params.index + share_index]; + if(share_index >= 0) share_layer = ¶ms.net.layers[share_index]; + else if(share_index != -1000000000) share_layer = ¶ms.net.layers[params.index + share_index]; int batch,h,w,c; h = params.h; @@ -754,7 +754,7 @@ layer parse_upsample(list *options, size_params params, network net) return l; } -route_layer parse_route(list *options, size_params params, network net) +route_layer parse_route(list *options, size_params params) { char *l = option_find(options, "layers"); int len = strlen(l); @@ -772,19 +772,19 @@ route_layer parse_route(list *options, size_params params, network net) l = strchr(l, ',')+1; if(index < 0) index = params.index + index; layers[i] = index; - sizes[i] = net.layers[index].outputs; + sizes[i] = params.net.layers[index].outputs; } int batch = params.batch; route_layer layer = make_route_layer(batch, n, layers, sizes); - convolutional_layer first = net.layers[layers[0]]; + convolutional_layer first = params.net.layers[layers[0]]; layer.out_w = first.out_w; layer.out_h = first.out_h; layer.out_c = first.out_c; for(i = 1; i < n; ++i){ int index = layers[i]; - convolutional_layer next = net.layers[index]; + convolutional_layer next = params.net.layers[index]; if(next.out_w == first.out_w && next.out_h == 
first.out_h){
             layer.out_c += next.out_c;
         }else{
@@ -974,7 +974,7 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
         layer l = { (LAYER_TYPE)0 };
         LAYER_TYPE lt = string_to_layer_type(s->type);
         if(lt == CONVOLUTIONAL){
-            l = parse_convolutional(options, params, net);
+            l = parse_convolutional(options, params);
         }else if(lt == LOCAL){
             l = parse_local(options, params);
         }else if(lt == ACTIVE){
@@ -1019,7 +1019,7 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
         }else if(lt == AVGPOOL){
             l = parse_avgpool(options, params);
         }else if(lt == ROUTE){
-            l = parse_route(options, params, net);
+            l = parse_route(options, params);
             int k;
             for (k = 0; k < l.n; ++k) net.layers[l.input_layers[k]].use_bin_output = 0;
         }else if (lt == UPSAMPLE) {

From 0fa9c8f10588cd5db54b742bc2bcbefcddd87000 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Fri, 8 Nov 2019 01:06:49 +0300
Subject: [PATCH 51/86] Added groups= and group_id= params to the [route] layer

---
 include/darknet.h |  1 +
 src/parser.c      |  6 +++++-
 src/route_layer.c | 35 ++++++++++++++++++++++++++---------
 src/route_layer.h |  2 +-
 4 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/include/darknet.h b/include/darknet.h
index 55f94ac5224..f29cb15689a 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -206,6 +206,7 @@ struct layer {
     int n;
     int max_boxes;
     int groups;
+    int group_id;
     int size;
     int side;
     int stride;
diff --git a/src/parser.c b/src/parser.c
index 52f3edceead..e0b65f3bf3b 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -776,7 +776,10 @@ route_layer parse_route(list *options, size_params params)
     }
     int batch = params.batch;
-    route_layer layer = make_route_layer(batch, n, layers, sizes);
+    int groups = option_find_int_quiet(options, "groups", 1);
+    int group_id = option_find_int_quiet(options, "group_id", 0);
+
+    route_layer layer = make_route_layer(batch, n, layers, sizes, groups, group_id);
 
     convolutional_layer first = params.net.layers[layers[0]];
     layer.out_w = first.out_w;
@@ -791,6 +794,7 @@ route_layer parse_route(list *options, size_params params)
             layer.out_h = layer.out_w = layer.out_c = 0;
         }
     }
+    layer.out_c = layer.out_c / layer.groups;
     return layer;
 }
diff --git a/src/route_layer.c b/src/route_layer.c
index b502fbe72b0..b636d4824e8 100644
--- a/src/route_layer.c
+++ b/src/route_layer.c
@@ -3,7 +3,7 @@
 #include "blas.h"
 #include <stdio.h>
 
-route_layer make_route_layer(int batch, int n, int *input_layers, int *input_sizes)
+route_layer make_route_layer(int batch, int n, int *input_layers, int *input_sizes, int groups, int group_id)
 {
     fprintf(stderr,"route ");
     route_layer l = { (LAYER_TYPE)0 };
@@ -12,6 +12,8 @@ route_layer make_route_layer(int batch, int n, int *input_layers, int *input_siz
     l.n = n;
     l.input_layers = input_layers;
     l.input_sizes = input_sizes;
+    l.groups = groups;
+    l.group_id = group_id;
     int i;
     int outputs = 0;
     for(i = 0; i < n; ++i){
@@ -19,6 +21,7 @@ route_layer make_route_layer(int batch, int n, int *input_layers, int *input_siz
         outputs += input_sizes[i];
     }
     fprintf(stderr, "\n");
+    outputs = outputs / groups;
     l.outputs = outputs;
     l.inputs = outputs;
     l.delta = (float*)calloc(outputs * batch, sizeof(float));
     l.output = (float*)calloc(outputs * batch, sizeof(float));
@@ -57,6 +60,8 @@ void resize_route_layer(route_layer *l, network *net)
             l->out_h = l->out_w = l->out_c = 0;
         }
     }
+    l->out_c = l->out_c / l->groups;
+    l->outputs = l->outputs / l->groups;
     l->inputs = l->outputs;
     l->delta = (float*)realloc(l->delta, l->outputs * l->batch * sizeof(float));
     l->output = (float*)realloc(l->output, l->outputs * l->batch * sizeof(float));
@@ 
-78,10 +83,13 @@ void forward_route_layer(const route_layer l, network_state state) int index = l.input_layers[i]; float *input = state.net.layers[index].output; int input_size = l.input_sizes[i]; + int part_input_size = input_size / l.groups; for(j = 0; j < l.batch; ++j){ - copy_cpu(input_size, input + j*input_size, 1, l.output + offset + j*l.outputs, 1); + //copy_cpu(input_size, input + j*input_size, 1, l.output + offset + j*l.outputs, 1); + copy_cpu(part_input_size, input + j*input_size + part_input_size*l.group_id, 1, l.output + offset + j*l.outputs, 1); } - offset += input_size; + //offset += input_size; + offset += part_input_size; } } @@ -93,10 +101,13 @@ void backward_route_layer(const route_layer l, network_state state) int index = l.input_layers[i]; float *delta = state.net.layers[index].delta; int input_size = l.input_sizes[i]; + int part_input_size = input_size / l.groups; for(j = 0; j < l.batch; ++j){ - axpy_cpu(input_size, 1, l.delta + offset + j*l.outputs, 1, delta + j*input_size, 1); + //axpy_cpu(input_size, 1, l.delta + offset + j*l.outputs, 1, delta + j*input_size, 1); + axpy_cpu(part_input_size, 1, l.delta + offset + j*l.outputs, 1, delta + j*input_size + part_input_size*l.group_id, 1); } - offset += input_size; + //offset += input_size; + offset += part_input_size; } } @@ -109,11 +120,14 @@ void forward_route_layer_gpu(const route_layer l, network_state state) int index = l.input_layers[i]; float *input = state.net.layers[index].output_gpu; int input_size = l.input_sizes[i]; + int part_input_size = input_size / l.groups; for(j = 0; j < l.batch; ++j){ //copy_ongpu(input_size, input + j*input_size, 1, l.output_gpu + offset + j*l.outputs, 1); - simple_copy_ongpu(input_size, input + j*input_size, l.output_gpu + offset + j*l.outputs); + //simple_copy_ongpu(input_size, input + j*input_size, l.output_gpu + offset + j*l.outputs); + simple_copy_ongpu(part_input_size, input + j*input_size + part_input_size*l.group_id, l.output_gpu + offset + j*l.outputs); } - offset += input_size; + //offset += input_size; + offset += part_input_size; } } @@ -125,10 +139,13 @@ void backward_route_layer_gpu(const route_layer l, network_state state) int index = l.input_layers[i]; float *delta = state.net.layers[index].delta_gpu; int input_size = l.input_sizes[i]; + int part_input_size = input_size / l.groups; for(j = 0; j < l.batch; ++j){ - axpy_ongpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1); + //axpy_ongpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1); + axpy_ongpu(part_input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size + part_input_size*l.group_id, 1); } - offset += input_size; + //offset += input_size; + offset += part_input_size; } } #endif diff --git a/src/route_layer.h b/src/route_layer.h index 3ee9019385f..2ebe396039c 100644 --- a/src/route_layer.h +++ b/src/route_layer.h @@ -8,7 +8,7 @@ typedef layer route_layer; #ifdef __cplusplus extern "C" { #endif -route_layer make_route_layer(int batch, int n, int *input_layers, int *input_size); +route_layer make_route_layer(int batch, int n, int *input_layers, int *input_size, int groups, int group_id); void forward_route_layer(const route_layer l, network_state state); void backward_route_layer(const route_layer l, network_state state); void resize_route_layer(route_layer *l, network *net); From 0eee8404bf271256f3455491fe3c55830f14f31d Mon Sep 17 00:00:00 2001 From: Alexey Date: Sat, 9 Nov 2019 14:03:29 +0300 Subject: [PATCH 52/86] Update Readme.md --- 
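Before the README tweaks below, a usage sketch for the `groups=`/`group_id=` route parameters added in the previous patch: with `groups=2` and `group_id=1` the layer forwards only the second half of each source layer's channels, so `l.outputs` becomes `1/groups` of the concatenated size (the kind of channel split used by CSP-style networks). `make_route_layer` is the real constructor from the patch; the wrapper and its inputs are assumptions:

```c
#include <stdlib.h>
#include "route_layer.h"

/* Route only the second half of the channels of one source layer.
   Equivalent cfg: [route] with layers=<prev>, groups=2, group_id=1. */
route_layer half_channel_route(int batch, int prev_index, int prev_outputs)
{
    int *layers = (int*)calloc(1, sizeof(int));
    int *sizes  = (int*)calloc(1, sizeof(int));
    layers[0] = prev_index;    /* which layer to read from */
    sizes[0]  = prev_outputs;  /* its full output size; the layer keeps 1/groups of it */
    return make_route_layer(batch, 1, layers, sizes, 2, 1);
}
```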
README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 9042a6f99e7..5c0efdeff1a 100644
--- a/README.md
+++ b/README.md
@@ -607,6 +607,8 @@ Example of custom object detection: `darknet.exe detector test data/obj.data yol
   * each: `model of object, side, illumination, scale, each 30 grad` of the turn and inclination angles - these are *different objects* from an internal perspective of the neural network. So the more *different objects* you want to detect, the more complex network model should be used.
 
+  * to make the detected bounding boxes more accurate, you can add 3 parameters `ignore_thresh = .9 iou_normalizer=0.5 iou_loss=giou` to each `[yolo]` layer and train, it will increase mAP@0.9, but decrease mAP@0.5.
+
   * Only if you are an **expert** in neural detection networks - recalculate anchors for your dataset for `width` and `height` from cfg-file: `darknet.exe detector calc_anchors data/obj.data -num_of_clusters 9 -width 416 -height 416` then set the same 9 `anchors` in each of 3 `[yolo]`-layers in your cfg-file. But you should change indexes of anchors `masks=` for each [yolo]-layer, so that 1st-[yolo]-layer has anchors larger than 60x60, 2nd larger than 30x30, 3rd remaining. Also you should change the `filters=(classes + 5)*` before each [yolo]-layer. If many of the calculated anchors do not fit under the appropriate layers - then just try using all the default anchors.

From d91d59a22fea9c266f06c2ae5edb23d38fd83c20 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Sat, 9 Nov 2019 22:48:18 +0300
Subject: [PATCH 53/86] Optimized memory allocation for Detection (inference
 only), without allocating memory for training

---
 include/darknet.h            |   1 +
 src/conv_lstm_layer.c        |  25 +++----
 src/conv_lstm_layer.h        |   2 +-
 src/convolutional_kernels.cu |  10 ++-
 src/convolutional_layer.c    | 122 ++++++++++++++++++++++++-----------
 src/convolutional_layer.h    |   3 +-
 src/crnn_layer.c             |   9 +--
 src/crnn_layer.h             |   2 +-
 src/gemm.c                   |   4 +-
 src/maxpool_layer.c          |  44 ++++++++-----
 src/maxpool_layer.h          |   2 +-
 src/maxpool_layer_kernels.cu |   4 +-
 src/network.c                |   1 +
 src/parser.c                 |  14 ++--
 src/shortcut_layer.c         |  16 +++--
 src/shortcut_layer.h         |   2 +-
 src/yolo_layer.c             |   2 +-
 17 files changed, 169 insertions(+), 94 deletions(-)

diff --git a/include/darknet.h b/include/darknet.h
index f29cb15689a..8be704061b1 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -190,6 +190,7 @@ struct layer {
     void(*backward_gpu) (struct layer, struct network_state);
     void(*update_gpu) (struct layer, int, float, float, float);
     layer *share_layer;
+    int train;
     int batch_normalize;
     int shortcut;
     int batch;
diff --git a/src/conv_lstm_layer.c b/src/conv_lstm_layer.c
index 4ae67b44a83..b4059ed1ec8 100644
--- a/src/conv_lstm_layer.c
+++ b/src/conv_lstm_layer.c
@@ -32,7 +32,7 @@ static void increment_layer(layer *l, int steps)
 }
 
-layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int groups, int steps, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor)
+layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int groups, int steps, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor, int train)
 {
     fprintf(stderr, "CONV_LSTM Layer: %d x %d x %d image, %d filters\n", h, w, c, output_filters);
     /*
 */
     batch = batch / steps;
     layer l = { (LAYER_TYPE)0 };
+    l.train 
= train; l.batch = batch; l.type = CONV_LSTM; l.steps = steps; @@ -66,44 +67,44 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i // U l.uf = (layer*)calloc(1, sizeof(layer)); - *(l.uf) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); + *(l.uf) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.uf->batch = batch; if (l.workspace_size < l.uf->workspace_size) l.workspace_size = l.uf->workspace_size; l.ui = (layer*)calloc(1, sizeof(layer)); - *(l.ui) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); + *(l.ui) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.ui->batch = batch; if (l.workspace_size < l.ui->workspace_size) l.workspace_size = l.ui->workspace_size; l.ug = (layer*)calloc(1, sizeof(layer)); - *(l.ug) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); + *(l.ug) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.ug->batch = batch; if (l.workspace_size < l.ug->workspace_size) l.workspace_size = l.ug->workspace_size; l.uo = (layer*)calloc(1, sizeof(layer)); - *(l.uo) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); + *(l.uo) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.uo->batch = batch; if (l.workspace_size < l.uo->workspace_size) l.workspace_size = l.uo->workspace_size; // W l.wf = (layer*)calloc(1, sizeof(layer)); - *(l.wf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); + *(l.wf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.wf->batch = batch; if (l.workspace_size < l.wf->workspace_size) l.workspace_size = l.wf->workspace_size; l.wi = (layer*)calloc(1, sizeof(layer)); - *(l.wi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); + *(l.wi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.wi->batch = batch; if (l.workspace_size < l.wi->workspace_size) l.workspace_size = l.wi->workspace_size; l.wg = (layer*)calloc(1, sizeof(layer)); - *(l.wg) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, 
NULL, 0); + *(l.wg) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.wg->batch = batch; if (l.workspace_size < l.wg->workspace_size) l.workspace_size = l.wg->workspace_size; l.wo = (layer*)calloc(1, sizeof(layer)); - *(l.wo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); + *(l.wo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.wo->batch = batch; if (l.workspace_size < l.wo->workspace_size) l.workspace_size = l.wo->workspace_size; @@ -111,21 +112,21 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i // V l.vf = (layer*)calloc(1, sizeof(layer)); if (l.peephole) { - *(l.vf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); + *(l.vf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.vf->batch = batch; if (l.workspace_size < l.vf->workspace_size) l.workspace_size = l.vf->workspace_size; } l.vi = (layer*)calloc(1, sizeof(layer)); if (l.peephole) { - *(l.vi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); + *(l.vi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.vi->batch = batch; if (l.workspace_size < l.vi->workspace_size) l.workspace_size = l.vi->workspace_size; } l.vo = (layer*)calloc(1, sizeof(layer)); if (l.peephole) { - *(l.vo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); + *(l.vo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.vo->batch = batch; if (l.workspace_size < l.vo->workspace_size) l.workspace_size = l.vo->workspace_size; } diff --git a/src/conv_lstm_layer.h b/src/conv_lstm_layer.h index 17e4fdc3a66..0a0438b04f5 100644 --- a/src/conv_lstm_layer.h +++ b/src/conv_lstm_layer.h @@ -9,7 +9,7 @@ #ifdef __cplusplus extern "C" { #endif -layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int groups, int steps, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor); +layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int groups, int steps, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor, int train); void resize_conv_lstm_layer(layer *l, int w, int h); void free_state_conv_lstm(layer l); void randomize_state_conv_lstm(layer l); diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index a73f277ee92..1a6b5f8affb 100644 
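All of the constructors above now receive the `train` flag that `parse_network_cfg_custom()` derives from its `batch` argument (see the parser.c hunk later in this patch): `batch > 0` means inference-only, so `delta`, `x`/`x_norm`, weight-update buffers and maxpool `indexes` are never allocated. A sketch of an inference-only load under that behavior; the cfg/weights paths are placeholders and the declarations are assumed to come from this fork's `darknet.h`:

```c
#include "darknet.h"

int main(void)
{
    /* batch=1, time_steps=0  ->  params.train = 0 inside the parser,
       so only Detection (forward-pass) buffers are allocated. */
    network net = parse_network_cfg_custom("cfg/yolov3.cfg", 1, 0);
    load_weights(&net, "yolov3.weights");
    fuse_conv_batchnorm(net);  /* with this patch, also releases the now-unused
                                  batch-norm buffers via free_convolutional_batchnorm() */
    /* ... run network_predict()/get_network_boxes() here ... */
    free_network(net);
    return 0;
}
```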
--- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -986,7 +986,8 @@ void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) } else { if (iteration_num < state.net.burn_in) return; - else if (iteration_num > l.assisted_excitation) return; + else + if (iteration_num > l.assisted_excitation) return; else alpha = (1 + cos(3.141592 * iteration_num / (state.net.burn_in + l.assisted_excitation))) / 2; // from 1 to 0 } @@ -1018,6 +1019,7 @@ void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) for (t = 0; t < state.net.num_boxes; ++t) { box truth = float_to_box_stride(truth_cpu + t*(4 + 1) + b*l.truths, 1); if (!truth.x) break; // continue; + //float beta = 0; float beta = 1 - alpha; // from 0 to 1 float dw = (1 - truth.w) * beta; float dh = (1 - truth.h) * beta; @@ -1162,8 +1164,10 @@ void push_convolutional_layer(convolutional_layer l) cuda_convert_f32_to_f16(l.weights_gpu, l.nweights, l.weights_gpu16); #endif cuda_push_array(l.biases_gpu, l.biases, l.n); - cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights); - cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n); + if (l.train) { + cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights); + cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n); + } if (l.batch_normalize){ cuda_push_array(l.scales_gpu, l.scales, l.n); cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n); diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index b76d7ee735f..7f2bac9690b 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -123,7 +123,7 @@ size_t get_workspace_size32(layer l){ l.dweightDesc, l.bf_algo, &s)); - if (s > most) most = s; + if (s > most && l.train) most = s; CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(), l.weightDesc, l.ddstTensorDesc, @@ -131,7 +131,7 @@ size_t get_workspace_size32(layer l){ l.dsrcTensorDesc, l.bd_algo, &s)); - if (s > most) most = s; + if (s > most && l.train) most = s; return most; } #endif @@ -164,7 +164,7 @@ size_t get_workspace_size16(layer l) { l.dweightDesc16, l.bf_algo16, &s)); - if (s > most) most = s; + if (s > most && l.train) most = s; CHECK_CUDNN(cudnnGetConvolutionBackwardDataWorkspaceSize(cudnn_handle(), l.weightDesc16, l.ddstTensorDesc16, @@ -172,7 +172,7 @@ size_t get_workspace_size16(layer l) { l.dsrcTensorDesc16, l.bd_algo16, &s)); - if (s > most) most = s; + if (s > most && l.train) most = s; return most; } #endif @@ -333,12 +333,43 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) #endif #endif -convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer, int assisted_excitation) + +void free_convolutional_batchnorm(convolutional_layer *l) +{ + if (!l->share_layer) { + free(l->scales); + free(l->scale_updates); + free(l->mean); + free(l->variance); + free(l->mean_delta); + free(l->variance_delta); + free(l->rolling_mean); + free(l->rolling_variance); + free(l->x); + free(l->x_norm); + +#ifdef GPU + cuda_free(l->scales_gpu); + cuda_free(l->scale_updates_gpu); + cuda_free(l->mean_gpu); + cuda_free(l->variance_gpu); + cuda_free(l->mean_delta_gpu); + cuda_free(l->variance_delta_gpu); + cuda_free(l->rolling_mean_gpu); + cuda_free(l->rolling_variance_gpu); + 
cuda_free(l->x_gpu); + cuda_free(l->x_norm_gpu); +#endif + } +} + +convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer, int assisted_excitation, int train) { int total_batch = batch*steps; int i; convolutional_layer l = { (LAYER_TYPE)0 }; l.type = CONVOLUTIONAL; + l.train = train; if (xnor) groups = 1; // disable groups for XNOR-net if (groups < 1) groups = 1; @@ -382,10 +413,12 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, } else { l.weights = (float*)calloc(l.nweights, sizeof(float)); - l.weight_updates = (float*)calloc(l.nweights, sizeof(float)); - l.biases = (float*)calloc(n, sizeof(float)); - l.bias_updates = (float*)calloc(n, sizeof(float)); + + if (train) { + l.weight_updates = (float*)calloc(l.nweights, sizeof(float)); + l.bias_updates = (float*)calloc(n, sizeof(float)); + } } // float scale = 1./sqrt(size*size*c); @@ -401,7 +434,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.activation = activation; l.output = (float*)calloc(total_batch*l.outputs, sizeof(float)); - l.delta = (float*)calloc(total_batch*l.outputs, sizeof(float)); + if (train) l.delta = (float*)calloc(total_batch*l.outputs, sizeof(float)); l.forward = forward_convolutional_layer; l.backward = backward_convolutional_layer; @@ -445,23 +478,27 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, } else { l.scales = (float*)calloc(n, sizeof(float)); - l.scale_updates = (float*)calloc(n, sizeof(float)); for (i = 0; i < n; ++i) { l.scales[i] = 1; } + if (train) { + l.scale_updates = (float*)calloc(n, sizeof(float)); - l.mean = (float*)calloc(n, sizeof(float)); - l.variance = (float*)calloc(n, sizeof(float)); + l.mean = (float*)calloc(n, sizeof(float)); + l.variance = (float*)calloc(n, sizeof(float)); - l.mean_delta = (float*)calloc(n, sizeof(float)); - l.variance_delta = (float*)calloc(n, sizeof(float)); + l.mean_delta = (float*)calloc(n, sizeof(float)); + l.variance_delta = (float*)calloc(n, sizeof(float)); + } l.rolling_mean = (float*)calloc(n, sizeof(float)); l.rolling_variance = (float*)calloc(n, sizeof(float)); } - l.x = (float*)calloc(total_batch * l.outputs, sizeof(float)); - l.x_norm = (float*)calloc(total_batch * l.outputs, sizeof(float)); + if (train) { + l.x = (float*)calloc(total_batch * l.outputs, sizeof(float)); + l.x_norm = (float*)calloc(total_batch * l.outputs, sizeof(float)); + } } if(adam){ l.adam = 1; @@ -501,17 +538,17 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, } else { l.weights_gpu = cuda_make_array(l.weights, l.nweights); - l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights); + if (train) l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights); #ifdef CUDNN_HALF l.weights_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1); - l.weight_updates_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1); + if (train) l.weight_updates_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1); #endif // CUDNN_HALF l.biases_gpu = cuda_make_array(l.biases, n); - l.bias_updates_gpu = cuda_make_array(l.bias_updates, n); + if (train) l.bias_updates_gpu = cuda_make_array(l.bias_updates, n); } l.output_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n); - l.delta_gpu = 
cuda_make_array(l.delta, total_batch*out_h*out_w*n); + if (train) l.delta_gpu = cuda_make_array(l.delta, total_batch*out_h*out_w*n); if(binary){ l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights); @@ -535,19 +572,25 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, } else { l.scales_gpu = cuda_make_array(l.scales, n); - l.scale_updates_gpu = cuda_make_array(l.scale_updates, n); - l.mean_gpu = cuda_make_array(l.mean, n); - l.variance_gpu = cuda_make_array(l.variance, n); + if (train) { + l.scale_updates_gpu = cuda_make_array(l.scale_updates, n); + + l.mean_gpu = cuda_make_array(l.mean, n); + l.variance_gpu = cuda_make_array(l.variance, n); + + l.mean_delta_gpu = cuda_make_array(l.mean, n); + l.variance_delta_gpu = cuda_make_array(l.variance, n); + } l.rolling_mean_gpu = cuda_make_array(l.mean, n); l.rolling_variance_gpu = cuda_make_array(l.variance, n); + } - l.mean_delta_gpu = cuda_make_array(l.mean, n); - l.variance_delta_gpu = cuda_make_array(l.variance, n); + if (train) { + l.x_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n); + l.x_norm_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n); } - l.x_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n); - l.x_norm_gpu = cuda_make_array(l.output, total_batch*out_h*out_w*n); } if (l.assisted_excitation) @@ -594,7 +637,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, blur_size = 2; blur_pad = 0; } - *(l.input_layer) = make_convolutional_layer(batch, steps, out_h, out_w, n, n, n, blur_size, blur_stride_x, blur_stride_y, 1, blur_pad, LINEAR, 0, 0, 0, 0, 0, index, 0, NULL, 0); + *(l.input_layer) = make_convolutional_layer(batch, steps, out_h, out_w, n, n, n, blur_size, blur_stride_x, blur_stride_y, 1, blur_pad, LINEAR, 0, 0, 0, 0, 0, index, 0, NULL, 0, train); const int blur_nweights = n * blur_size * blur_size; // (n / n) * n * blur_size * blur_size; int i; if (blur_size == 2) { @@ -649,7 +692,7 @@ void denormalize_convolutional_layer(convolutional_layer l) void test_convolutional_layer() { - convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 1, 5, 2, 2, 1, 1, LEAKY, 1, 0, 0, 0, 0, 0, 0, NULL, 0); + convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 1, 5, 2, 2, 1, 1, LEAKY, 1, 0, 0, 0, 0, 0, 0, NULL, 0, 0); l.batch_normalize = 1; float data[] = {1,1,1,1,1, 1,1,1,1,1, @@ -688,10 +731,13 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h) l->inputs = l->w * l->h * l->c; l->output = (float*)realloc(l->output, total_batch * l->outputs * sizeof(float)); - l->delta = (float*)realloc(l->delta, total_batch * l->outputs * sizeof(float)); - if(l->batch_normalize){ - l->x = (float*)realloc(l->x, total_batch * l->outputs * sizeof(float)); - l->x_norm = (float*)realloc(l->x_norm, total_batch * l->outputs * sizeof(float)); + if (l->train) { + l->delta = (float*)realloc(l->delta, total_batch * l->outputs * sizeof(float)); + + if (l->batch_normalize) { + l->x = (float*)realloc(l->x, total_batch * l->outputs * sizeof(float)); + l->x_norm = (float*)realloc(l->x_norm, total_batch * l->outputs * sizeof(float)); + } } if (l->xnor) { @@ -700,10 +746,12 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h) #ifdef GPU if (old_w < w || old_h < h) { - cuda_free(l->delta_gpu); - cuda_free(l->output_gpu); + if (l->train) { + cuda_free(l->delta_gpu); + l->delta_gpu = cuda_make_array(l->delta, total_batch*l->outputs); + } - l->delta_gpu = cuda_make_array(l->delta, total_batch*l->outputs); + 
cuda_free(l->output_gpu); l->output_gpu = cuda_make_array(l->output, total_batch*l->outputs); if (l->batch_normalize) { @@ -1246,7 +1294,7 @@ void assisted_excitation_forward(convolutional_layer l, network_state state) } } - if(1) // visualize ground truth + if(0) // visualize ground truth { #ifdef OPENCV for (b = 0; b < l.batch; ++b) diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index 0072ce549c3..817f900b783 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -28,9 +28,10 @@ void create_convolutional_cudnn_tensors(layer *l); void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16); #endif #endif +void free_convolutional_batchnorm(convolutional_layer *l); size_t get_convolutional_workspace_size(layer l); -convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer, int assisted_excitation); +convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride_x, int stride_y, int dilation, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index, int antialiasing, convolutional_layer *share_layer, int assisted_excitation, int train); void denormalize_convolutional_layer(convolutional_layer l); void resize_convolutional_layer(convolutional_layer *layer, int w, int h); void forward_convolutional_layer(const convolutional_layer layer, network_state state); diff --git a/src/crnn_layer.c b/src/crnn_layer.c index 588db7411a0..cbeaa5087bc 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -26,11 +26,12 @@ static void increment_layer(layer *l, int steps) #endif } -layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int groups, int steps, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int xnor) +layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int groups, int steps, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int xnor, int train) { fprintf(stderr, "CRNN Layer: %d x %d x %d image, %d filters\n", h,w,c,output_filters); batch = batch / steps; layer l = { (LAYER_TYPE)0 }; + l.train = train; l.batch = batch; l.type = CRNN; l.steps = steps; @@ -50,17 +51,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou l.state = (float*)calloc(l.hidden * l.batch * (l.steps + 1), sizeof(float)); l.input_layer = (layer*)calloc(1, sizeof(layer)); - *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); + *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.input_layer->batch = batch; if (l.workspace_size < l.input_layer->workspace_size) l.workspace_size = l.input_layer->workspace_size; l.self_layer = (layer*)calloc(1, sizeof(layer)); - *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, groups, size, stride, stride, dilation, pad, activation, 
batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); + *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.self_layer->batch = batch; if (l.workspace_size < l.self_layer->workspace_size) l.workspace_size = l.self_layer->workspace_size; l.output_layer = (layer*)calloc(1, sizeof(layer)); - *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0); + *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, groups, size, stride, stride, dilation, pad, activation, batch_normalize, 0, xnor, 0, 0, 0, 0, NULL, 0, train); l.output_layer->batch = batch; if (l.workspace_size < l.output_layer->workspace_size) l.workspace_size = l.output_layer->workspace_size; diff --git a/src/crnn_layer.h b/src/crnn_layer.h index 33560aae462..c2dc7a25258 100644 --- a/src/crnn_layer.h +++ b/src/crnn_layer.h @@ -9,7 +9,7 @@ #ifdef __cplusplus extern "C" { #endif -layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int groups, int steps, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int xnor); +layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int groups, int steps, int size, int stride, int dilation, int pad, ACTIVATION activation, int batch_normalize, int xnor, int train); void resize_crnn_layer(layer *l, int w, int h); void free_state_crnn(layer l); diff --git a/src/gemm.c b/src/gemm.c index 151388284b1..9f5cb882c61 100644 --- a/src/gemm.c +++ b/src/gemm.c @@ -1949,7 +1949,7 @@ void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, i } } dst[out_index] = max; - indexes[out_index] = max_i; + if (indexes) indexes[out_index] = max_i; } } } @@ -2452,7 +2452,7 @@ void forward_maxpool_layer_avx(float *src, float *dst, int *indexes, int size, i } } dst[out_index] = max; - indexes[out_index] = max_i; + if (indexes) indexes[out_index] = max_i; } } } diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c index a05074ecc74..54aa9c0ed91 100644 --- a/src/maxpool_layer.c +++ b/src/maxpool_layer.c @@ -46,10 +46,11 @@ void cudnn_maxpool_setup(layer *l) } -maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels, int antialiasing) +maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels, int antialiasing, int train) { maxpool_layer l = { (LAYER_TYPE)0 }; l.type = MAXPOOL; + l.train = train; const int blur_stride_x = stride_x; const int blur_stride_y = stride_y; @@ -82,21 +83,25 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s l.stride_x = stride_x; l.stride_y = stride_y; int output_size = l.out_h * l.out_w * l.out_c * batch; - l.indexes = (int*)calloc(output_size, sizeof(int)); + + if (train) { + l.indexes = (int*)calloc(output_size, sizeof(int)); + l.delta = (float*)calloc(output_size, sizeof(float)); + } l.output = (float*)calloc(output_size, sizeof(float)); - l.delta = (float*)calloc(output_size, sizeof(float)); l.forward = forward_maxpool_layer; l.backward = backward_maxpool_layer; - #ifdef GPU +#ifdef GPU l.forward_gpu = 
forward_maxpool_layer_gpu; l.backward_gpu = backward_maxpool_layer_gpu; - l.indexes_gpu = cuda_make_int_array(output_size); + if (train) { + l.indexes_gpu = cuda_make_int_array(output_size); + l.delta_gpu = cuda_make_array(l.delta, output_size); + } l.output_gpu = cuda_make_array(l.output, output_size); - l.delta_gpu = cuda_make_array(l.delta, output_size); - cudnn_maxpool_setup(&l); - #endif // GPU +#endif // GPU l.bflops = (l.size*l.size*l.c * l.out_h*l.out_w) / 1000000000.; if (maxpool_depth) fprintf(stderr, "max-depth %2dx%2d/%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); @@ -114,7 +119,7 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s blur_size = 2; blur_pad = 0; } - *(l.input_layer) = make_convolutional_layer(batch, 1, l.out_h, l.out_w, l.out_c, l.out_c, l.out_c, blur_size, blur_stride_x, blur_stride_y, 1, blur_pad, LINEAR, 0, 0, 0, 0, 0, 1, 0, NULL, 0); + *(l.input_layer) = make_convolutional_layer(batch, 1, l.out_h, l.out_w, l.out_c, l.out_c, l.out_c, blur_size, blur_stride_x, blur_stride_y, 1, blur_pad, LINEAR, 0, 0, 0, 0, 0, 1, 0, NULL, 0, train); const int blur_nweights = l.out_c * blur_size * blur_size; // (n / n) * n * blur_size * blur_size; int i; if (blur_size == 2) { @@ -163,17 +168,22 @@ void resize_maxpool_layer(maxpool_layer *l, int w, int h) l->outputs = l->out_w * l->out_h * l->out_c; int output_size = l->outputs * l->batch; - l->indexes = (int*)realloc(l->indexes, output_size * sizeof(int)); + if (l->train) { + l->indexes = (int*)realloc(l->indexes, output_size * sizeof(int)); + l->delta = (float*)realloc(l->delta, output_size * sizeof(float)); + } l->output = (float*)realloc(l->output, output_size * sizeof(float)); - l->delta = (float*)realloc(l->delta, output_size * sizeof(float)); #ifdef GPU - CHECK_CUDA(cudaFree((float *)l->indexes_gpu)); CHECK_CUDA(cudaFree(l->output_gpu)); - CHECK_CUDA(cudaFree(l->delta_gpu)); - l->indexes_gpu = cuda_make_int_array(output_size); l->output_gpu = cuda_make_array(l->output, output_size); - l->delta_gpu = cuda_make_array(l->delta, output_size); + + if (l->train) { + CHECK_CUDA(cudaFree((float *)l->indexes_gpu)); + CHECK_CUDA(cudaFree(l->delta_gpu)); + l->indexes_gpu = cuda_make_int_array(output_size); + l->delta_gpu = cuda_make_array(l->delta, output_size); + } cudnn_maxpool_setup(l); #endif @@ -203,7 +213,7 @@ void forward_maxpool_layer(const maxpool_layer l, network_state state) max = (val > max) ? 
val : max; } l.output[out_index] = max; - l.indexes[out_index] = max_i; + if (l.indexes) l.indexes[out_index] = max_i; } } } @@ -245,7 +255,7 @@ void forward_maxpool_layer(const maxpool_layer l, network_state state) } } l.output[out_index] = max; - l.indexes[out_index] = max_i; + if (l.indexes) l.indexes[out_index] = max_i; } } } diff --git a/src/maxpool_layer.h b/src/maxpool_layer.h index cfedf9d9ee6..3986a4c6360 100644 --- a/src/maxpool_layer.h +++ b/src/maxpool_layer.h @@ -12,7 +12,7 @@ typedef layer maxpool_layer; extern "C" { #endif image get_maxpool_image(maxpool_layer l); -maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels, int antialiasing); +maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels, int antialiasing, int train); void resize_maxpool_layer(maxpool_layer *l, int w, int h); void forward_maxpool_layer(const maxpool_layer l, network_state state); void backward_maxpool_layer(const maxpool_layer l, network_state state); diff --git a/src/maxpool_layer_kernels.cu b/src/maxpool_layer_kernels.cu index 8677b4d713d..c15143c556b 100644 --- a/src/maxpool_layer_kernels.cu +++ b/src/maxpool_layer_kernels.cu @@ -36,7 +36,7 @@ __global__ void forward_maxpool_depth_layer_kernel(int n, int w, int h, int c, i max = (val > max) ? val : max; } output[out_index] = max; - indexes[out_index] = max_i; + if (indexes) indexes[out_index] = max_i; } } @@ -88,7 +88,7 @@ __global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c } } output[out_index] = max; - indexes[out_index] = max_i; + if (indexes) indexes[out_index] = max_i; } __global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride_x, int stride_y, int size, int pad, float *delta, float *prev_delta, int *indexes) diff --git a/src/network.c b/src/network.c index cfe994343e8..c2249a54df6 100644 --- a/src/network.c +++ b/src/network.c @@ -1071,6 +1071,7 @@ void fuse_conv_batchnorm(network net) } } + free_convolutional_batchnorm(l); l->batch_normalize = 0; #ifdef GPU if (gpu_index >= 0) { diff --git a/src/parser.c b/src/parser.c index e0b65f3bf3b..1e5ad78c781 100644 --- a/src/parser.c +++ b/src/parser.c @@ -130,6 +130,7 @@ typedef struct size_params{ int c; int index; int time_steps; + int train; network net; } size_params; @@ -199,7 +200,7 @@ convolutional_layer parse_convolutional(list *options, size_params params) int xnor = option_find_int_quiet(options, "xnor", 0); int use_bin_output = option_find_int_quiet(options, "bin_output", 0); - convolutional_layer layer = make_convolutional_layer(batch,1,h,w,c,n,groups,size,stride_x,stride_y,dilation,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index, antialiasing, share_layer, assisted_excitation); + convolutional_layer layer = make_convolutional_layer(batch,1,h,w,c,n,groups,size,stride_x,stride_y,dilation,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index, antialiasing, share_layer, assisted_excitation, params.train); layer.flipped = option_find_int_quiet(options, "flipped", 0); layer.dot = option_find_float_quiet(options, "dot", 0); @@ -230,7 +231,7 @@ layer parse_crnn(list *options, size_params params) int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); int xnor = option_find_int_quiet(options, "xnor", 0); - layer l = 
make_crnn_layer(params.batch, params.h, params.w, params.c, hidden_filters, output_filters, groups, params.time_steps, size, stride, dilation, padding, activation, batch_normalize, xnor); + layer l = make_crnn_layer(params.batch, params.h, params.w, params.c, hidden_filters, output_filters, groups, params.time_steps, size, stride, dilation, padding, activation, batch_normalize, xnor, params.train); l.shortcut = option_find_int_quiet(options, "shortcut", 0); @@ -291,7 +292,7 @@ layer parse_conv_lstm(list *options, size_params params) int xnor = option_find_int_quiet(options, "xnor", 0); int peephole = option_find_int_quiet(options, "peephole", 0); - layer l = make_conv_lstm_layer(params.batch, params.h, params.w, params.c, output_filters, groups, params.time_steps, size, stride, dilation, padding, activation, batch_normalize, peephole, xnor); + layer l = make_conv_lstm_layer(params.batch, params.h, params.w, params.c, output_filters, groups, params.time_steps, size, stride, dilation, padding, activation, batch_normalize, peephole, xnor, params.train); l.state_constrain = option_find_int_quiet(options, "state_constrain", params.time_steps * 32); l.shortcut = option_find_int_quiet(options, "shortcut", 0); @@ -630,7 +631,7 @@ maxpool_layer parse_maxpool(list *options, size_params params) batch=params.batch; if(!(h && w && c)) error("Layer before maxpool layer must output image."); - maxpool_layer layer = make_maxpool_layer(batch, h, w, c, size, stride_x, stride_y, padding, maxpool_depth, out_channels, antialiasing); + maxpool_layer layer = make_maxpool_layer(batch, h, w, c, size, stride_x, stride_y, padding, maxpool_depth, out_channels, antialiasing, params.train); return layer; } @@ -684,7 +685,7 @@ layer parse_shortcut(list *options, size_params params, network net) layer from = net.layers[index]; if (from.antialiasing) from = *from.input_layer; - layer s = make_shortcut_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c, assisted_excitation); + layer s = make_shortcut_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c, assisted_excitation, params.train); char *activation_s = option_find_str(options, "activation", "linear"); ACTIVATION activation = get_activation(activation_s); @@ -944,6 +945,9 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps) net.gpu_index = gpu_index; size_params params; + if (batch > 0) params.train = 0; // allocates memory for Detection only + else params.train = 1; // allocates memory for Detection & Training + section *s = (section *)n->val; list *options = s->options; if(!is_network(s)) error("First section must be [net] or [network]"); diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c index 8345858e47d..615a5ea384b 100644 --- a/src/shortcut_layer.c +++ b/src/shortcut_layer.c @@ -5,11 +5,12 @@ #include #include -layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2, int assisted_excitation) +layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2, int assisted_excitation, int train) { if(assisted_excitation) fprintf(stderr, "Shortcut Layer - AE: %d\n", index); else fprintf(stderr,"Shortcut Layer: %d\n", index); layer l = { (LAYER_TYPE)0 }; + l.train = train; l.type = SHORTCUT; l.batch = batch; l.w = w2; @@ -27,7 +28,7 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int l.index = index; - l.delta = (float*)calloc(l.outputs * batch, sizeof(float)); + if (train) 
l.delta = (float*)calloc(l.outputs * batch, sizeof(float)); l.output = (float*)calloc(l.outputs * batch, sizeof(float)); l.forward = forward_shortcut_layer; @@ -36,7 +37,7 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int l.forward_gpu = forward_shortcut_layer_gpu; l.backward_gpu = backward_shortcut_layer_gpu; - l.delta_gpu = cuda_make_array(l.delta, l.outputs*batch); + if (train) l.delta_gpu = cuda_make_array(l.delta, l.outputs*batch); l.output_gpu = cuda_make_array(l.output, l.outputs*batch); if (l.assisted_excitation) { @@ -56,14 +57,17 @@ void resize_shortcut_layer(layer *l, int w, int h) l->h = l->out_h = h; l->outputs = w*h*l->out_c; l->inputs = l->outputs; - l->delta = (float*)realloc(l->delta, l->outputs * l->batch * sizeof(float)); + if (l->train) l->delta = (float*)realloc(l->delta, l->outputs * l->batch * sizeof(float)); l->output = (float*)realloc(l->output, l->outputs * l->batch * sizeof(float)); #ifdef GPU cuda_free(l->output_gpu); - cuda_free(l->delta_gpu); l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch); - l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch); + + if (l->train) { + cuda_free(l->delta_gpu); + l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch); + } #endif } diff --git a/src/shortcut_layer.h b/src/shortcut_layer.h index ad8d45f3e28..89f22ceb3be 100644 --- a/src/shortcut_layer.h +++ b/src/shortcut_layer.h @@ -7,7 +7,7 @@ #ifdef __cplusplus extern "C" { #endif -layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2, int assisted_excitation); +layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2, int assisted_excitation, int train); void forward_shortcut_layer(const layer l, network_state state); void backward_shortcut_layer(const layer l, network_state state); void resize_shortcut_layer(layer *l, int w, int h); diff --git a/src/yolo_layer.c b/src/yolo_layer.c index 906ed427f01..06d2b513624 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -282,7 +282,6 @@ void forward_yolo_layer(const layer l, network_state state) box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w*l.h); float best_iou = 0; int best_t = 0; - int class_id_match = 0; for (t = 0; t < l.max_boxes; ++t) { box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1); int class_id = state.truth[t*(4 + 1) + b*l.truths + 4]; @@ -298,6 +297,7 @@ void forward_yolo_layer(const layer l, network_state state) int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4); float objectness = l.output[obj_index]; int pred_class_id = get_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness); + int class_id_match = 0; if (class_id == pred_class_id) class_id_match = 1; else class_id_match = 0; From 20659fc1a8ec858d24ad2b70f55add810dc5caff Mon Sep 17 00:00:00 2001 From: 7FM <41307817+7FM@users.noreply.github.com> Date: Mon, 11 Nov 2019 22:08:06 +0100 Subject: [PATCH 54/86] Fix undefined behavior caused by delete Avoid multiple deletes of the same address and memory leaks --- src/image_opencv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/image_opencv.cpp b/src/image_opencv.cpp index 912a0b1b6d2..6951fb9a80e 100644 --- a/src/image_opencv.cpp +++ b/src/image_opencv.cpp @@ -734,7 +734,7 @@ int wait_for_stream(cap_cv *cap, cv::Mat* src, int dont_close) delete src;// cvReleaseImage(&src); int z = 0; for (z = 0; z < 20; ++z) { - get_capture_frame_cv(cap); + src = 
get_capture_frame_cv(cap); delete src;// cvReleaseImage(&src); } src = new cv::Mat(416, 416, CV_8UC(3)); // cvCreateImage(cvSize(416, 416), IPL_DEPTH_8U, 3); From 9fe201807e244398e8aeaeed43c8823491fd421b Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Tue, 12 Nov 2019 15:05:34 +0300 Subject: [PATCH 55/86] Revert to old Assisted Excitation version. --- src/convolutional_kernels.cu | 8 ++++---- src/parser.c | 10 ++++++++++ src/route_layer.c | 2 +- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 1a6b5f8affb..e5d57eabde9 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -1019,8 +1019,8 @@ void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) for (t = 0; t < state.net.num_boxes; ++t) { box truth = float_to_box_stride(truth_cpu + t*(4 + 1) + b*l.truths, 1); if (!truth.x) break; // continue; - //float beta = 0; - float beta = 1 - alpha; // from 0 to 1 + float beta = 0; + //float beta = 1 - alpha; // from 0 to 1 float dw = (1 - truth.w) * beta; float dh = (1 - truth.h) * beta; //printf(" alpha = %f, beta = %f, truth.w = %f, dw = %f, tw+dw = %f, l.out_w = %d \n", alpha, beta, truth.w, dw, truth.w+dw, l.out_w); @@ -1052,9 +1052,9 @@ void assisted_excitation_forward_gpu(convolutional_layer l, network_state state) //CHECK_CUDA(cudaPeekAtLastError()); // calc new output - assisted_activation2_gpu(1, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); // AE3: gt increases (beta = 1 - alpha = 0) + //assisted_activation2_gpu(1, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); // AE3: gt increases (beta = 1 - alpha = 0) //assisted_activation2_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); - //assisted_activation_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); + assisted_activation_gpu(alpha, l.output_gpu, l.gt_gpu, l.a_avg_gpu, l.out_w * l.out_h, l.out_c, l.batch); //cudaStreamSynchronize(get_cuda_stream()); //CHECK_CUDA(cudaPeekAtLastError()); diff --git a/src/parser.c b/src/parser.c index 1e5ad78c781..4d259f43a96 100644 --- a/src/parser.c +++ b/src/parser.c @@ -797,6 +797,16 @@ route_layer parse_route(list *options, size_params params) } layer.out_c = layer.out_c / layer.groups; + layer.w = first.w; + layer.h = first.h; + layer.c = layer.out_c; + + if (n > 3) fprintf(stderr, " \t "); + else if (n > 1) fprintf(stderr, " \t "); + else fprintf(stderr, " \t\t "); + + fprintf(stderr, " -> %4d x%4d x%4d \n", layer.w, layer.h, layer.c, layer.out_w, layer.out_h, layer.out_c); + return layer; } diff --git a/src/route_layer.c b/src/route_layer.c index b636d4824e8..8e3f15f8a9d 100644 --- a/src/route_layer.c +++ b/src/route_layer.c @@ -20,10 +20,10 @@ route_layer make_route_layer(int batch, int n, int *input_layers, int *input_siz fprintf(stderr," %d", input_layers[i]); outputs += input_sizes[i]; } - fprintf(stderr, "\n"); outputs = outputs / groups; l.outputs = outputs; l.inputs = outputs; + //fprintf(stderr, " inputs = %d \t outputs = %d, groups = %d, group_id = %d \n", l.inputs, l.outputs, l.groups, l.group_id); l.delta = (float*)calloc(outputs * batch, sizeof(float)); l.output = (float*)calloc(outputs * batch, sizeof(float)); From ded620ac6b23443b102bd58897877b0ce9100fa4 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Tue, 12 Nov 2019 15:11:33 +0300 Subject: [PATCH 56/86] fixed memory deallocation --- include/darknet.h | 1 + src/convolutional_layer.c | 40 
+++++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/include/darknet.h b/include/darknet.h index 8be704061b1..0acac8a60b7 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -873,6 +873,7 @@ LIB_API void free_layer(layer); LIB_API void free_data(data d); LIB_API pthread_t load_data(load_args args); LIB_API pthread_t load_data_in_thread(load_args args); +LIB_API void *load_thread(void *ptr); // dark_cuda.h LIB_API void cuda_pull_array(float *x_gpu, float *x, size_t n); diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 7f2bac9690b..4dd2cf96f9b 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -337,28 +337,28 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) void free_convolutional_batchnorm(convolutional_layer *l) { if (!l->share_layer) { - free(l->scales); - free(l->scale_updates); - free(l->mean); - free(l->variance); - free(l->mean_delta); - free(l->variance_delta); - free(l->rolling_mean); - free(l->rolling_variance); - free(l->x); - free(l->x_norm); + free(l->scales); l->scales = NULL; + free(l->scale_updates); l->scale_updates = NULL; + free(l->mean); l->mean = NULL; + free(l->variance); l->variance = NULL; + free(l->mean_delta); l->mean_delta = NULL; + free(l->variance_delta); l->variance_delta = NULL; + free(l->rolling_mean); l->rolling_mean = NULL; + free(l->rolling_variance); l->rolling_variance = NULL; + free(l->x); l->x = NULL; + free(l->x_norm); l->x_norm = NULL; #ifdef GPU - cuda_free(l->scales_gpu); - cuda_free(l->scale_updates_gpu); - cuda_free(l->mean_gpu); - cuda_free(l->variance_gpu); - cuda_free(l->mean_delta_gpu); - cuda_free(l->variance_delta_gpu); - cuda_free(l->rolling_mean_gpu); - cuda_free(l->rolling_variance_gpu); - cuda_free(l->x_gpu); - cuda_free(l->x_norm_gpu); + cuda_free(l->scales_gpu); l->scales_gpu = NULL; + cuda_free(l->scale_updates_gpu); l->scale_updates_gpu = NULL; + cuda_free(l->mean_gpu); l->mean_gpu = NULL; + cuda_free(l->variance_gpu); l->variance_gpu = NULL; + cuda_free(l->mean_delta_gpu); l->mean_delta_gpu = NULL; + cuda_free(l->variance_delta_gpu); l->variance_delta_gpu = NULL; + cuda_free(l->rolling_mean_gpu); l->rolling_mean_gpu = NULL; + cuda_free(l->rolling_variance_gpu); l->rolling_variance_gpu = NULL; + cuda_free(l->x_gpu); l->x_gpu = NULL; + cuda_free(l->x_norm_gpu); l->x_norm_gpu = NULL; #endif } } From 70bf88ce01950e93f8bd54b53c585b98bae1146a Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Tue, 12 Nov 2019 15:21:21 +0300 Subject: [PATCH 57/86] Fixed swish-activation for [shortcut_layer] --- src/convolutional_layer.c | 2 +- src/shortcut_layer.c | 26 ++++++++++++++++++++++---- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 4dd2cf96f9b..92e72732c1c 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -513,7 +513,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, if (l.activation == SWISH || l.activation == MISH) l.activation_input = (float*)calloc(total_batch*l.outputs, sizeof(float)); #ifdef GPU - if (l.activation == SWISH || l.activation == MISH) l.activation_input_gpu = cuda_make_array(l.activation_input, total_batch*out_h*out_w*n); + if (l.activation == SWISH || l.activation == MISH) l.activation_input_gpu = cuda_make_array(l.activation_input, total_batch*l.outputs); l.forward_gpu = forward_convolutional_layer_gpu; l.backward_gpu = backward_convolutional_layer_gpu; diff --git 
a/src/shortcut_layer.c b/src/shortcut_layer.c index 615a5ea384b..a1dd3724e47 100644 --- a/src/shortcut_layer.c +++ b/src/shortcut_layer.c @@ -2,6 +2,7 @@ #include "convolutional_layer.h" #include "dark_cuda.h" #include "blas.h" +#include "gemm.h" #include #include @@ -33,7 +34,11 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int l.forward = forward_shortcut_layer; l.backward = backward_shortcut_layer; + + if (l.activation == SWISH || l.activation == MISH) l.activation_input = (float*)calloc(l.batch*l.outputs, sizeof(float)); #ifdef GPU + if (l.activation == SWISH || l.activation == MISH) l.activation_input_gpu = cuda_make_array(l.activation_input, l.batch*l.outputs); + l.forward_gpu = forward_shortcut_layer_gpu; l.backward_gpu = backward_shortcut_layer_gpu; @@ -85,14 +90,21 @@ void forward_shortcut_layer(const layer l, network_state state) copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1); shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output); } - activate_array(l.output, l.outputs*l.batch, l.activation); + + //activate_array(l.output, l.outputs*l.batch, l.activation); + if (l.activation == SWISH) activate_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.output); + else if (l.activation == MISH) activate_array_mish(l.output, l.outputs*l.batch, l.activation_input, l.output); + else activate_array_cpu_custom(l.output, l.outputs*l.batch, l.activation); if (l.assisted_excitation && state.train) assisted_excitation_forward(l, state); } void backward_shortcut_layer(const layer l, network_state state) { - gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); + if (l.activation == SWISH) gradient_array_swish(l.output, l.outputs*l.batch, l.activation_input, l.delta); + else if (l.activation == MISH) gradient_array_mish(l.outputs*l.batch, l.activation_input, l.delta); + else gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); + axpy_cpu(l.outputs*l.batch, 1, l.delta, 1, state.delta, 1); shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, state.net.layers[l.index].delta); } @@ -104,14 +116,20 @@ void forward_shortcut_layer_gpu(const layer l, network_state state) //simple_copy_ongpu(l.outputs*l.batch, state.input, l.output_gpu); //shortcut_gpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu); input_shortcut_gpu(state.input, l.batch, l.w, l.h, l.c, state.net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu); - activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); + + if (l.activation == SWISH) activate_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); + else if (l.activation == MISH) activate_array_mish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.output_gpu); + else activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); if (l.assisted_excitation && state.train) assisted_excitation_forward_gpu(l, state); } void backward_shortcut_layer_gpu(const layer l, network_state state) { - gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); + if (l.activation == SWISH) gradient_array_swish_ongpu(l.output_gpu, l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu); + else if (l.activation == MISH) gradient_array_mish_ongpu(l.outputs*l.batch, l.activation_input_gpu, l.delta_gpu); + else gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); + 
axpy_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1, state.delta, 1); shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, state.net.layers[l.index].delta_gpu); } From 3652d7d3745a29ab161cf3a77492f1ac8b04b090 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Tue, 12 Nov 2019 15:31:07 +0300 Subject: [PATCH 58/86] Fixed MISH instability --- src/activation_kernels.cu | 24 ++++++++++++++++++------ src/activations.c | 23 ++++++++++++++++++----- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu index d8ff25f42e3..67504e71611 100644 --- a/src/activation_kernels.cu +++ b/src/activation_kernels.cu @@ -275,15 +275,27 @@ __global__ void gradient_array_swish_kernel(float *x, int n, float *sigmoid_gpu, } // https://github.com/digantamisra98/Mish -__global__ void gradient_array_mish_kernel(int n, float *activation_input, float *delta) +__global__ void gradient_array_mish_kernel(int n, float *activation_input_gpu, float *delta) { int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if (i < n) { - float x = activation_input[i]; - float d = 2 * expf(x) + expf(2 * x) + 2; - float w = 4 * (x + 1) + 4 * expf(2 * x) + expf(3 * x) + expf(x)*(4 * x + 6); - float derivative = expf(x) * w / (d * d); - delta[i] *= derivative; + const float THRESHOLD = 20.0f; + + // implementation from TensorFlow: https://github.com/tensorflow/addons/commit/093cdfa85d334cbe19a37624c33198f3140109ed + // implementation from Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L26-L31 + float inp = activation_input_gpu[i]; + const float sp = (inp < THRESHOLD) ? log1p(exp(inp)) : inp; + const float grad_sp = 1 - exp(-sp); + const float tsp = tanh(sp); + const float grad_tsp = (1 - tsp*tsp) * grad_sp; + const float grad = inp * grad_tsp + tsp; + delta[i] *= grad; + + //float x = activation_input[i]; + //float d = 2 * expf(x) + expf(2 * x) + 2; + //float w = 4 * (x + 1) + 4 * expf(2 * x) + expf(3 * x) + expf(x)*(4 * x + 6); + //float derivative = expf(x) * w / (d * d); + //delta[i] *= derivative; } } diff --git a/src/activations.c b/src/activations.c index da92af0af38..55b060bd94c 100644 --- a/src/activations.c +++ b/src/activations.c @@ -207,10 +207,23 @@ void gradient_array_mish(const int n, const float * activation_input, float * de int i; #pragma omp parallel for for (i = 0; i < n; ++i) { - float x = activation_input[i]; - float d = 2 * expf(x) + expf(2 * x) + 2; - float w = 4 * (x + 1) + 4 * expf(2 * x) + expf(3 * x) + expf(x)*(4 * x + 6); - float derivative = expf(x) * w / (d * d); - delta[i] *= derivative; + const float THRESHOLD = 20.0f; + + // implementation from TensorFlow: https://github.com/tensorflow/addons/commit/093cdfa85d334cbe19a37624c33198f3140109ed + // implementation from Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L26-L31 + float inp = activation_input[i]; + const float sp = (inp < THRESHOLD) ? 
log1p(exp(inp)) : inp; + const float grad_sp = 1 - exp(-sp); + const float tsp = tanh(sp); + const float grad_tsp = (1 - tsp*tsp) * grad_sp; + const float grad = inp * grad_tsp + tsp; + delta[i] *= grad; + + + //float x = activation_input[i]; + //float d = 2 * expf(x) + expf(2 * x) + 2; + //float w = 4 * (x + 1) + 4 * expf(2 * x) + expf(3 * x) + expf(x)*(4 * x + 6); + //float derivative = expf(x) * w / (d * d); + //delta[i] *= derivative; } } From c516b6cb0a08f82023067a649d10238ff18cf1e1 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Tue, 12 Nov 2019 20:20:28 +0300 Subject: [PATCH 59/86] Take TopK from obj.data file for Classifier --- src/classifier.c | 9 ++++++--- src/parser.c | 13 ++++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/classifier.c b/src/classifier.c index 11fa92df913..86549ed8f40 100644 --- a/src/classifier.c +++ b/src/classifier.c @@ -63,6 +63,9 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, char *label_list = option_find_str(options, "labels", "data/labels.list"); char *train_list = option_find_str(options, "train", "data/train.list"); int classes = option_find_int(options, "classes", 2); + int topk_data = option_find_int(options, "top", 5); + char topk_buff[10]; + sprintf(topk_buff, "top%d", topk_data); char **labels = get_labels(label_list); list *plist = get_paths(train_list); @@ -157,14 +160,14 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int draw_precision = 0; if (calc_topk && (i >= calc_topk_for_each || i == net.max_batches)) { iter_topk = i; - topk = validate_classifier_single(datacfg, cfgfile, weightfile, &net, 5); // calc TOP5 - printf("\n accuracy TOP5 = %f \n", topk); + topk = validate_classifier_single(datacfg, cfgfile, weightfile, &net, topk_data); // calc TOP5 + printf("\n accuracy %s = %f \n", topk_buff, topk); draw_precision = 1; } printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/ train_images_num, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); #ifdef OPENCV - draw_train_loss(img, img_size, avg_loss, max_img_loss, i, net.max_batches, topk, draw_precision, "top5", dont_show, mjpeg_port); + draw_train_loss(img, img_size, avg_loss, max_img_loss, i, net.max_batches, topk, draw_precision, topk_buff, dont_show, mjpeg_port); #endif // OPENCV if (i >= (iter_save + 1000)) { diff --git a/src/parser.c b/src/parser.c index 4d259f43a96..e47c2097526 100644 --- a/src/parser.c +++ b/src/parser.c @@ -801,11 +801,14 @@ route_layer parse_route(list *options, size_params params) layer.h = first.h; layer.c = layer.out_c; - if (n > 3) fprintf(stderr, " \t "); - else if (n > 1) fprintf(stderr, " \t "); - else fprintf(stderr, " \t\t "); - - fprintf(stderr, " -> %4d x%4d x%4d \n", layer.w, layer.h, layer.c, layer.out_w, layer.out_h, layer.out_c); + if (n > 3) fprintf(stderr, " \t "); + else if (n > 1) fprintf(stderr, " \t "); + else fprintf(stderr, " \t\t "); + + fprintf(stderr, " "); + if (layer.groups > 1) fprintf(stderr, "%d/%d", layer.group_id, layer.groups); + else fprintf(stderr, " "); + fprintf(stderr, " -> %4d x%4d x%4d \n", layer.out_w, layer.out_h, layer.out_c); return layer; } From 11142d00bedbafb015991fb20a05a5eb048200d6 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Wed, 13 Nov 2019 20:58:53 +0300 Subject: [PATCH 60/86] Fixed non-square network for Training Classifier --- src/classifier.c | 1 + src/convolutional_layer.c | 1 + src/data.c | 36 ++++++++++++++++++++---------------- 
src/data.h | 6 +++--- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/src/classifier.c b/src/classifier.c index 86549ed8f40..c5f8e2f18ca 100644 --- a/src/classifier.c +++ b/src/classifier.c @@ -77,6 +77,7 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, load_args args = {0}; args.w = net.w; args.h = net.h; + args.c = net.c; args.threads = 32; args.hierarchy = net.hierarchy; diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 92e72732c1c..f9d66ebf3b6 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -609,6 +609,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, //fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c); l.bflops = (2.0 * l.nweights * l.out_h*l.out_w) / 1000000000.; + if (l.xnor) l.bflops = l.bflops / 32; if (l.xnor && l.use_bin_output) fprintf(stderr, "convXB"); else if (l.xnor) fprintf(stderr, "convX "); else if (l.share_layer) fprintf(stderr, "convS "); diff --git a/src/data.c b/src/data.c index 622e401ef75..f4b64b998e3 100644 --- a/src/data.c +++ b/src/data.c @@ -142,7 +142,7 @@ matrix load_image_paths(char **paths, int n, int w, int h) return X; } -matrix load_image_augment_paths(char **paths, int n, int use_flip, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure) +matrix load_image_augment_paths(char **paths, int n, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure) { int i; matrix X; @@ -151,6 +151,7 @@ matrix load_image_augment_paths(char **paths, int n, int use_flip, int min, int X.cols = 0; for(i = 0; i < n; ++i){ + int size = w > h ? w : h; image im = load_image_color(paths[i], 0, 0); image crop = random_augment_image(im, angle, aspect, min, max, size); int flip = use_flip ? 
random_gen() % 2 : 0; @@ -158,14 +159,17 @@ matrix load_image_augment_paths(char **paths, int n, int use_flip, int min, int flip_image(crop); random_distort_image(crop, hue, saturation, exposure); - /* - show_image(im, "orig"); - show_image(crop, "crop"); - cvWaitKey(0); - */ + image sized = resize_image(crop, w, h); + + //show_image(im, "orig"); + //show_image(sized, "sized"); + //show_image(sized, paths[i]); + //wait_until_press_key_cv(); + //printf("w = %d, h = %d \n", sized.w, sized.h); + free_image(im); - X.vals[i] = crop.data; - X.cols = crop.h*crop.w*crop.c; + X.vals[i] = sized.data; + X.cols = sized.h*sized.w*sized.c; } return X; } @@ -1165,7 +1169,7 @@ void *load_thread(void *ptr) if (a.type == OLD_CLASSIFICATION_DATA){ *a.d = load_data_old(a.paths, a.n, a.m, a.labels, a.classes, a.w, a.h); } else if (a.type == CLASSIFICATION_DATA){ - *a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.hierarchy, a.flip, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure); + *a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.hierarchy, a.flip, a.min, a.max, a.w, a.h, a.angle, a.aspect, a.hue, a.saturation, a.exposure); } else if (a.type == SUPER_DATA){ *a.d = load_data_super(a.paths, a.n, a.m, a.w, a.h, a.scale); } else if (a.type == WRITING_DATA){ @@ -1186,7 +1190,7 @@ void *load_thread(void *ptr) *(a.im) = load_image(a.path, 0, 0, a.c); *(a.resized) = letterbox_image(*(a.im), a.w, a.h); } else if (a.type == TAG_DATA){ - *a.d = load_data_tag(a.paths, a.n, a.m, a.classes, a.flip, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure); + *a.d = load_data_tag(a.paths, a.n, a.m, a.classes, a.flip, a.min, a.max, a.w, a.h, a.angle, a.aspect, a.hue, a.saturation, a.exposure); } free(ptr); return 0; @@ -1310,25 +1314,25 @@ data load_data_super(char **paths, int n, int m, int w, int h, int scale) return d; } -data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int use_flip, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure) +data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure) { if(m) paths = get_random_paths(paths, n, m); data d = {0}; d.shallow = 0; - d.X = load_image_augment_paths(paths, n, use_flip, min, max, size, angle, aspect, hue, saturation, exposure); + d.X = load_image_augment_paths(paths, n, use_flip, min, max, w, h, angle, aspect, hue, saturation, exposure); d.y = load_labels_paths(paths, n, labels, k, hierarchy); if(m) free(paths); return d; } -data load_data_tag(char **paths, int n, int m, int k, int use_flip, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure) +data load_data_tag(char **paths, int n, int m, int k, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure) { if(m) paths = get_random_paths(paths, n, m); data d = {0}; - d.w = size; - d.h = size; + d.w = w; + d.h = h; d.shallow = 0; - d.X = load_image_augment_paths(paths, n, use_flip, min, max, size, angle, aspect, hue, saturation, exposure); + d.X = load_image_augment_paths(paths, n, use_flip, min, max, w, h, angle, aspect, hue, saturation, exposure); d.y = load_tags_paths(paths, n, k); if(m) free(paths); return d; diff --git a/src/data.h b/src/data.h index 17edc5cf468..250e655a9b5 100644 --- a/src/data.h +++ 
b/src/data.h
@@ -88,10 +88,10 @@ data load_data_captcha_encode(char **paths, int n, int m, int w, int h);
 data load_data_old(char **paths, int n, int m, char **labels, int k, int w, int h);
 data load_data_detection(int n, char **paths, int m, int w, int h, int c, int boxes, int classes, int use_flip, int use_blur, int use_mixup, float jitter, float hue, float saturation, float exposure, int mini_batch, int track, int augment_speed, int letter_box, int show_imgs);
-data load_data_tag(char **paths, int n, int m, int k, int use_flip, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
-matrix load_image_augment_paths(char **paths, int n, int use_flip, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
+data load_data_tag(char **paths, int n, int m, int k, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure);
+matrix load_image_augment_paths(char **paths, int n, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure);
 data load_data_super(char **paths, int n, int m, int w, int h, int scale);
-data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int use_flip, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
+data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int use_flip, int min, int max, int w, int h, float angle, float aspect, float hue, float saturation, float exposure);
 data load_go(char *filename);
 box_label *read_boxes(char *filename, int *n);

From a4012895c16aacf82416b3383b567fc143532c51 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Wed, 13 Nov 2019 21:12:30 +0300
Subject: [PATCH 61/86] Accelerated the C++ example of DLL/SO library usage for images when compiled with OpenCV

---
 src/yolo_console_dll.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/yolo_console_dll.cpp b/src/yolo_console_dll.cpp
index 60da53f6516..f942fe13e3d 100644
--- a/src/yolo_console_dll.cpp
+++ b/src/yolo_console_dll.cpp
@@ -650,10 +650,12 @@ int main(int argc, char *argv[])
             }
             else {    // image file
+                // to achieve high performance for multiple images do these 2 lines in another thread
                 cv::Mat mat_img = cv::imread(filename);
+                auto det_image = detector.mat_to_image_resize(mat_img);

                 auto start = std::chrono::steady_clock::now();
-                std::vector<bbox_t> result_vec = detector.detect(mat_img);
+                std::vector<bbox_t> result_vec = detector.detect_resized(*det_image, mat_img.size().width, mat_img.size().height);
                 auto end = std::chrono::steady_clock::now();
                 std::chrono::duration<double> spent = end - start;
                 std::cout << " Time: " << spent.count() << " sec \n";

From f42923350b9e9eeba372ec9792a2d162d31b0016 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Thu, 14 Nov 2019 03:08:35 +0300
Subject: [PATCH 62/86] Minor fix in batchnorm_layer (side effect in python samples)

---
 src/batchnorm_layer.c | 4 ++--
 src/parser.c | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/batchnorm_layer.c b/src/batchnorm_layer.c
index 4f9536a99ce..cbcc14d85e4 100644
--- a/src/batchnorm_layer.c
+++ b/src/batchnorm_layer.c
@@ -182,9 +182,9 @@ void forward_batchnorm_layer_gpu(layer l, network_state state)
     if (l.type == BATCHNORM) simple_copy_ongpu(l.outputs*l.batch, state.input, l.output_gpu);
     //copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
- 
simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_gpu); - //copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1); if (state.train) { + simple_copy_ongpu(l.outputs*l.batch, l.output_gpu, l.x_gpu); + //copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1); #ifdef CUDNN float one = 1; float zero = 0; diff --git a/src/parser.c b/src/parser.c index e47c2097526..b05a293f786 100644 --- a/src/parser.c +++ b/src/parser.c @@ -978,6 +978,7 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps) params.batch = net.batch; params.time_steps = net.time_steps; params.net = net; + printf("batch = %d, time_steps = %d, train = %d \n", net.batch, net.time_steps, params.train); float bflops = 0; size_t workspace_size = 0; From 509ba13acf9b6738d9838eb1553a2f40e36c0c2e Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Thu, 14 Nov 2019 14:14:54 +0300 Subject: [PATCH 63/86] Fixed recent memory leak for Classifier --- src/data.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/data.c b/src/data.c index f4b64b998e3..d15aca09b8c 100644 --- a/src/data.c +++ b/src/data.c @@ -168,6 +168,7 @@ matrix load_image_augment_paths(char **paths, int n, int use_flip, int min, int //printf("w = %d, h = %d \n", sized.w, sized.h); free_image(im); + free_image(crop); X.vals[i] = sized.data; X.cols = sized.h*sized.w*sized.c; } From ee370e765d3de505df5657f33dde605625214d0b Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Thu, 14 Nov 2019 23:18:21 +0300 Subject: [PATCH 64/86] Fixed ignore_thresh --- src/yolo_layer.c | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/src/yolo_layer.c b/src/yolo_layer.c index 06d2b513624..42e595c1fcf 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -129,23 +129,18 @@ box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw } -int get_yolo_class(float *output, int classes, int class_index, int stride, float objectness) +int compare_yolo_class(float *output, int classes, int class_index, int stride, float objectness, int class_id) { - int class_id = 0; - float max_prob = FLT_MIN; + const float conf_thresh = 0.25; int j; for (j = 0; j < classes; ++j) { float prob = objectness * output[class_index + stride*j]; - if (prob > max_prob) { - max_prob = prob; - class_id = j; + if (prob > conf_thresh) { + return 1; } - //int class_index = entry_index(l, 0, n*l.w*l.h + i, 4 + 1 + j); - //float prob = objectness*predictions[class_index]; - //dets[count].prob[j] = (prob > thresh) ? 
prob : 0; } - return class_id; + return 0; } ious delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss) @@ -280,6 +275,8 @@ void forward_yolo_layer(const layer l, network_state state) for (n = 0; n < l.n; ++n) { int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w*l.h); + float best_match_iou = 0; + int best_match_t = 0; float best_iou = 0; int best_t = 0; for (t = 0; t < l.max_boxes; ++t) { @@ -296,14 +293,14 @@ void forward_yolo_layer(const layer l, network_state state) int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4); float objectness = l.output[obj_index]; - int pred_class_id = get_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness); - int class_id_match = 0; - if (class_id == pred_class_id) class_id_match = 1; - else class_id_match = 0; + int class_id_match = compare_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness, class_id); float iou = box_iou(pred, truth); - //if (iou > best_iou) { - if (iou > best_iou && class_id_match == 1) { + if (iou > best_match_iou && class_id_match == 1) { + best_match_iou = iou; + best_match_t = t; + } + if (iou > best_iou) { best_iou = iou; best_t = t; } @@ -311,7 +308,7 @@ void forward_yolo_layer(const layer l, network_state state) int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4); avg_anyobj += l.output[obj_index]; l.delta[obj_index] = l.cls_normalizer * (0 - l.output[obj_index]); - if (best_iou > l.ignore_thresh) { + if (best_match_iou > l.ignore_thresh) { l.delta[obj_index] = 0; } if (best_iou > l.truth_thresh) { @@ -376,9 +373,6 @@ void forward_yolo_layer(const layer l, network_state state) ++count; ++class_count; - //if(iou > .5) recall += 1; - //if(iou > .75) recall75 += 1; - //avg_iou += iou; if (all_ious.iou > .5) recall += 1; if (all_ious.iou > .75) recall75 += 1; } From dd34fe156a1a03709b3444bd84d6ccf55f03c835 Mon Sep 17 00:00:00 2001 From: dccho Date: Thu, 14 Nov 2019 15:36:22 -0500 Subject: [PATCH 65/86] enable random resize training for efficient net --- src/convolutional_layer.c | 4 ++++ src/dropout_layer.c | 3 ++- src/network.c | 21 ++++++++++++++++----- src/scale_channels_layer.c | 7 ++++--- src/scale_channels_layer.h | 2 +- 5 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index f9d66ebf3b6..e784f5e4b89 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -745,6 +745,7 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h) //l->binary_input = realloc(l->inputs*l->batch, sizeof(float)); } + if (l->activation == SWISH || l->activation == MISH) l->activation_input = (float*)realloc(l->activation_input, total_batch*l->outputs * sizeof(float)); #ifdef GPU if (old_w < w || old_h < h) { if (l->train) { @@ -767,6 +768,9 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h) cuda_free(l->binary_input_gpu); l->binary_input_gpu = cuda_make_array(0, l->inputs*l->batch); } + + cuda_free(l->activation_input_gpu); + if (l->activation == SWISH || l->activation == MISH) l->activation_input_gpu = cuda_make_array(l->activation_input, total_batch*l->outputs); } #ifdef CUDNN cudnn_convolutional_setup(l, cudnn_fastest); diff --git a/src/dropout_layer.c 
b/src/dropout_layer.c index 0d34ed24b2c..c32c5c616bc 100644 --- a/src/dropout_layer.c +++ b/src/dropout_layer.c @@ -27,11 +27,12 @@ dropout_layer make_dropout_layer(int batch, int inputs, float probability) void resize_dropout_layer(dropout_layer *l, int inputs) { + l->inputs = l->outputs = inputs; l->rand = (float*)realloc(l->rand, l->inputs * l->batch * sizeof(float)); #ifdef GPU cuda_free(l->rand_gpu); - l->rand_gpu = cuda_make_array(l->rand, inputs*l->batch); + l->rand_gpu = cuda_make_array(l->rand, l->inputs*l->batch); #endif } diff --git a/src/network.c b/src/network.c index c2249a54df6..96c935d9ce2 100644 --- a/src/network.c +++ b/src/network.c @@ -535,8 +535,16 @@ int resize_network(network *net, int w, int h) resize_route_layer(&l, net); }else if (l.type == SHORTCUT) { resize_shortcut_layer(&l, w, h); - //}else if (l.type == SCALE_CHANNELS) { - // resize_scale_channels_layer(&l, w, h); + }else if (l.type == SCALE_CHANNELS) { + resize_scale_channels_layer(&l, net); + }else if (l.type == DROPOUT) { + resize_dropout_layer(&l, inputs); + l.output = net->layers[i - 1].output; + l.delta = net->layers[i - 1].delta; +#ifdef GPU + l.output_gpu = net->layers[i-1].output_gpu; + l.delta_gpu = net->layers[i-1].delta_gpu; +#endif }else if (l.type == UPSAMPLE) { resize_upsample_layer(&l, w, h); }else if(l.type == REORG){ @@ -556,9 +564,12 @@ int resize_network(network *net, int w, int h) if(l.workspace_size > workspace_size) workspace_size = l.workspace_size; inputs = l.outputs; net->layers[i] = l; - w = l.out_w; - h = l.out_h; - if(l.type == AVGPOOL) break; + if(l.type != DROPOUT) + { + w = l.out_w; + h = l.out_h; + } + //if(l.type == AVGPOOL) break; } #ifdef GPU const int size = get_network_input_size(*net) * net->batch; diff --git a/src/scale_channels_layer.c b/src/scale_channels_layer.c index 80be5361126..bcb54c1b497 100644 --- a/src/scale_channels_layer.c +++ b/src/scale_channels_layer.c @@ -39,10 +39,11 @@ layer make_scale_channels_layer(int batch, int index, int w, int h, int c, int w return l; } -void resize_scale_channels_layer(layer *l, int w, int h) +void resize_scale_channels_layer(layer *l, network *net) { - l->out_w = w; - l->out_h = h; + layer first = net->layers[l->index]; + l->out_w = first.out_w; + l->out_h = first.out_h; l->outputs = l->out_w*l->out_h*l->out_c; l->inputs = l->outputs; l->delta = (float*)realloc(l->delta, l->outputs * l->batch * sizeof(float)); diff --git a/src/scale_channels_layer.h b/src/scale_channels_layer.h index a20c070397b..fdaa4b924fe 100644 --- a/src/scale_channels_layer.h +++ b/src/scale_channels_layer.h @@ -10,7 +10,7 @@ extern "C" { layer make_scale_channels_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2); void forward_scale_channels_layer(const layer l, network_state state); void backward_scale_channels_layer(const layer l, network_state state); -void resize_scale_channels_layer(layer *l, int w, int h); +void resize_scale_channels_layer(layer *l, network *net); #ifdef GPU void forward_scale_channels_layer_gpu(const layer l, network_state state); From e7e85b358cb0531f7154fdd68306c4c4dc96b5d2 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Fri, 15 Nov 2019 01:14:55 +0300 Subject: [PATCH 66/86] Added delta_yolo_box_accumulate(). Added iou_thresh=0.213 parameter to [yolo] layer IoU(anchor,truth). 
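This relaxes anchor assignment during training. Previously each ground-truth box produced deltas only for the single best-matching anchor, and delta_yolo_box() overwrote whatever delta was already stored at that location; delta_yolo_box_accumulate() sums contributions (+=) instead, so one cell/anchor can receive gradient from several truth boxes. In addition, every anchor of the layer whose shape-IoU with the truth exceeds iou_thresh is now trained as well. In outline, condensed from the yolo_layer.c hunk below (truth_shift is the truth box with x = y = 0, so only width/height overlap counts):

    // after the single best_n anchor has been handled for this truth box:
    for (n = 0; n < l.total; ++n) {
        int mask_n = int_index(l.mask, n, l.n);         // is anchor n used by this layer?
        if (mask_n >= 0 && n != best_n) {
            box pred = { 0 };
            pred.w = l.biases[2 * n] / state.net.w;     // compare anchor shape only,
            pred.h = l.biases[2 * n + 1] / state.net.h; // position is ignored
            if (box_iou(pred, truth_shift) > l.iou_thresh) {
                // accumulate box, objectness and class deltas for this anchor too
            }
        }
    }

With the recommended iou_thresh=0.213 a ground-truth box can therefore train several anchors per cell instead of exactly one; with the default iou_thresh=1 the old single-anchor behaviour is kept.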
--- include/darknet.h | 1 + src/parser.c | 1 + src/yolo_layer.c | 100 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 100 insertions(+), 2 deletions(-) diff --git a/include/darknet.h b/include/darknet.h index 0acac8a60b7..2bd70d1cde3 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -280,6 +280,7 @@ struct layer { int random; float ignore_thresh; float truth_thresh; + float iou_thresh; float thresh; float focus; int classfix; diff --git a/src/parser.c b/src/parser.c index b05a293f786..472b179d20d 100644 --- a/src/parser.c +++ b/src/parser.c @@ -380,6 +380,7 @@ layer parse_yolo(list *options, size_params params) l.ignore_thresh = option_find_float(options, "ignore_thresh", .5); l.truth_thresh = option_find_float(options, "truth_thresh", 1); + l.iou_thresh = option_find_float_quiet(options, "iou_thresh", 1); // recommended to use iou_thresh=0.213 in [yolo] l.random = option_find_int_quiet(options, "random", 0); char *map_file = option_find_str(options, "map", 0); diff --git a/src/yolo_layer.c b/src/yolo_layer.c index 42e595c1fcf..6c02310617a 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -193,6 +193,63 @@ ious delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, return all_ious; } +ious delta_yolo_box_accumulate(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss) +{ + ious all_ious = { 0 }; + // i - step in layer width + // j - step in layer height + // Returns a box in absolute coordinates + box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride); + all_ious.iou = box_iou(pred, truth); + all_ious.giou = box_giou(pred, truth); + // avoid nan in dx_box_iou + if (pred.w == 0) { pred.w = 1.0; } + if (pred.h == 0) { pred.h = 1.0; } + if (iou_loss == MSE) // old loss + { + float tx = (truth.x*lw - i); + float ty = (truth.y*lh - j); + float tw = log(truth.w*w / biases[2 * n]); + float th = log(truth.h*h / biases[2 * n + 1]); + + // accumulate delta + delta[index + 0 * stride] += scale * (tx - x[index + 0 * stride]); + delta[index + 1 * stride] += scale * (ty - x[index + 1 * stride]); + delta[index + 2 * stride] += scale * (tw - x[index + 2 * stride]); + delta[index + 3 * stride] += scale * (th - x[index + 3 * stride]); + } + else { + // https://github.com/generalized-iou/g-darknet + // https://arxiv.org/abs/1902.09630v2 + // https://giou.stanford.edu/ + all_ious.dx_iou = dx_box_iou(pred, truth, iou_loss); + + // jacobian^t (transpose) + float dx = (all_ious.dx_iou.dl + all_ious.dx_iou.dr); + float dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db); + float dw = ((-0.5 * all_ious.dx_iou.dl) + (0.5 * all_ious.dx_iou.dr)); + float dh = ((-0.5 * all_ious.dx_iou.dt) + (0.5 * all_ious.dx_iou.db)); + + // predict exponential, apply gradient of e^delta_t ONLY for w,h + dw *= exp(x[index + 2 * stride]); + dh *= exp(x[index + 3 * stride]); + + // normalize iou weight + dx *= iou_normalizer; + dy *= iou_normalizer; + dw *= iou_normalizer; + dh *= iou_normalizer; + + // accumulate delta + delta[index + 0 * stride] += dx; + delta[index + 1 * stride] += dy; + delta[index + 2 * stride] += dw; + delta[index + 3 * stride] += dh; + } + + return all_ious; +} + void delta_yolo_class(float *output, float *delta, int index, int class_id, int classes, int stride, float *avg_cat, int focal_loss) { int n; @@ -254,6 +311,7 @@ void forward_yolo_layer(const layer l, network_state state) } #endif + // delta is zeroed memset(l.delta, 
0, l.outputs * l.batch * sizeof(float)); if (!state.train) return; //float avg_iou = 0; @@ -319,7 +377,7 @@ void forward_yolo_layer(const layer l, network_state state) int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, 0, l.focal_loss); box truth = float_to_box_stride(state.truth + best_t*(4 + 1) + b*l.truths, 1); - delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss); + delta_yolo_box_accumulate(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss); } } } @@ -353,7 +411,7 @@ void forward_yolo_layer(const layer l, network_state state) int mask_n = int_index(l.mask, best_n, l.n); if (mask_n >= 0) { int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); - ious all_ious = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss); + ious all_ious = delta_yolo_box_accumulate(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss); // range is 0 <= 1 tot_iou += all_ious.iou; @@ -376,6 +434,44 @@ void forward_yolo_layer(const layer l, network_state state) if (all_ious.iou > .5) recall += 1; if (all_ious.iou > .75) recall75 += 1; } + + // iou_thresh + for (n = 0; n < l.total; ++n) { + int mask_n = int_index(l.mask, n, l.n); + if (mask_n >= 0 && n != best_n) { + box pred = { 0 }; + pred.w = l.biases[2 * n] / state.net.w; + pred.h = l.biases[2 * n + 1] / state.net.h; + float iou = box_iou(pred, truth_shift); + // iou, n + + if (iou > l.iou_thresh) { + int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); + ious all_ious = delta_yolo_box_accumulate(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss); + + // range is 0 <= 1 + tot_iou += all_ious.iou; + tot_iou_loss += 1 - all_ious.iou; + // range is -1 <= giou <= 1 + tot_giou += all_ious.giou; + tot_giou_loss += 1 - all_ious.giou; + + int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4); + avg_obj += l.output[obj_index]; + l.delta[obj_index] = l.cls_normalizer * (1 - l.output[obj_index]); + + int class_id = state.truth[t*(4 + 1) + b*l.truths + 4]; + if (l.map) class_id = l.map[class_id]; + int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1); + delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, &avg_cat, l.focal_loss); + + ++count; + ++class_count; + if (all_ious.iou > .5) recall += 1; + if (all_ious.iou > .75) recall75 += 1; + } + } + } } } //*(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); From 3555beb91401cf746fd357b4029231143d00f58f Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Fri, 15 Nov 2019 22:51:06 +0300 Subject: [PATCH 67/86] Fixed some memory leaks in secondary functions --- src/classifier.c | 2 ++ src/coco.c | 10 ++++++- src/detector.c | 17 ++++++++++- src/gemm.c | 68 ++++++++++++++++++++++---------------------- src/go.c | 1 + src/image_opencv.cpp | 4 ++- src/network.c | 7 ++++- src/utils.c | 4 ++- src/yolo.c | 8 ++++++ 9 files changed, 82 insertions(+), 39 deletions(-) diff --git a/src/classifier.c 
b/src/classifier.c index c5f8e2f18ca..c077f61ae65 100644 --- a/src/classifier.c +++ b/src/classifier.c @@ -1288,4 +1288,6 @@ void run_classifier(int argc, char **argv) else if(0==strcmp(argv[2], "valid10")) validate_classifier_10(data, cfg, weights); else if(0==strcmp(argv[2], "validcrop")) validate_classifier_crop(data, cfg, weights); else if(0==strcmp(argv[2], "validfull")) validate_classifier_full(data, cfg, weights); + + if (gpus && gpu_list && ngpus > 1) free(gpus); } diff --git a/src/coco.c b/src/coco.c index cdfd3dff391..03dd3a61415 100644 --- a/src/coco.c +++ b/src/coco.c @@ -226,6 +226,12 @@ void validate_coco(char *cfgfile, char *weightfile) fprintf(fp, "\n]\n"); fclose(fp); + if (val) free(val); + if (val_resized) free(val_resized); + if (buf) free(buf); + if (buf_resized) free(buf_resized); + if (thr) free(thr); + fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start)); } @@ -307,7 +313,9 @@ void validate_coco_recall(char *cfgfile, char *weightfile) } fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total); - free(id); + + if (fps) free(fps); + if (id) free(id); free_image(orig); free_image(sized); } diff --git a/src/detector.c b/src/detector.c index efe5571aa8d..7f32c4fa7a6 100644 --- a/src/detector.c +++ b/src/detector.c @@ -556,6 +556,7 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out for (j = 0; j < classes; ++j) { if (fps) fclose(fps[j]); } + if (fps) free(fps); if (coco) { #ifdef WIN32 fseek(fp, -3, SEEK_CUR); @@ -563,8 +564,15 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out fseek(fp, -2, SEEK_CUR); #endif fprintf(fp, "\n]\n"); - fclose(fp); } + if (fp) fclose(fp); + + if (val) free(val); + if (val_resized) free(val_resized); + if (thr) free(thr); + if (buf) free(buf); + if (buf_resized) free(buf_resized); + fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)time(0) - start); } @@ -1099,6 +1107,11 @@ float validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, floa else { free_network(net); } + if (val) free(val); + if (val_resized) free(val_resized); + if (thr) free(thr); + if (buf) free(buf); + if (buf_resized) free(buf_resized); return mean_average_precision; } @@ -1505,4 +1518,6 @@ void run_detector(int argc, char **argv) free_list(options); } else printf(" There isn't such command: %s", argv[2]); + + if (gpus && gpu_list && ngpus > 1) free(gpus); } diff --git a/src/gemm.c b/src/gemm.c index 9f5cb882c61..51f77cac7a9 100644 --- a/src/gemm.c +++ b/src/gemm.c @@ -324,7 +324,7 @@ void transpose_32x32_bits_my(uint32_t *A, uint32_t *B, int lda, int ldb) unsigned int x, y; for (y = 0; y < 32; ++y) { for (x = 0; x < 32; ++x) { - if (A[y * lda] & (1 << x)) B[x * ldb] |= (uint32_t)1 << y; + if (A[y * lda] & ((uint32_t)1 << x)) B[x * ldb] |= (uint32_t)1 << y; } } } @@ -636,48 +636,48 @@ void check_cpu_features(void) { // Detect Features if (nIds >= 0x00000001) { cpuid(info, 0x00000001); - HW_MMX = (info[3] & ((int)1 << 23)) != 0; - HW_SSE = (info[3] & ((int)1 << 25)) != 0; - HW_SSE2 = (info[3] & ((int)1 << 26)) != 0; - HW_SSE3 = (info[2] & ((int)1 << 0)) != 0; + HW_MMX = (info[3] & ((uint32_t)1 << 23)) != 0; + HW_SSE = (info[3] & ((uint32_t)1 << 25)) != 0; + HW_SSE2 = (info[3] & ((uint32_t)1 << 26)) != 0; + HW_SSE3 = (info[2] & ((uint32_t)1 << 0)) != 0; - HW_SSSE3 = (info[2] & ((int)1 << 9)) != 0; - HW_SSE41 = (info[2] & ((int)1 << 19)) != 0; - HW_SSE42 = 
(info[2] & ((int)1 << 20)) != 0; - HW_AES = (info[2] & ((int)1 << 25)) != 0; + HW_SSSE3 = (info[2] & ((uint32_t)1 << 9)) != 0; + HW_SSE41 = (info[2] & ((uint32_t)1 << 19)) != 0; + HW_SSE42 = (info[2] & ((uint32_t)1 << 20)) != 0; + HW_AES = (info[2] & ((uint32_t)1 << 25)) != 0; - HW_AVX = (info[2] & ((int)1 << 28)) != 0; - HW_FMA3 = (info[2] & ((int)1 << 12)) != 0; + HW_AVX = (info[2] & ((uint32_t)1 << 28)) != 0; + HW_FMA3 = (info[2] & ((uint32_t)1 << 12)) != 0; - HW_RDRAND = (info[2] & ((int)1 << 30)) != 0; + HW_RDRAND = (info[2] & ((uint32_t)1 << 30)) != 0; } if (nIds >= 0x00000007) { cpuid(info, 0x00000007); - HW_AVX2 = (info[1] & ((int)1 << 5)) != 0; - - HW_BMI1 = (info[1] & ((int)1 << 3)) != 0; - HW_BMI2 = (info[1] & ((int)1 << 8)) != 0; - HW_ADX = (info[1] & ((int)1 << 19)) != 0; - HW_SHA = (info[1] & ((int)1 << 29)) != 0; - HW_PREFETCHWT1 = (info[2] & ((int)1 << 0)) != 0; - - HW_AVX512F = (info[1] & ((int)1 << 16)) != 0; - HW_AVX512CD = (info[1] & ((int)1 << 28)) != 0; - HW_AVX512PF = (info[1] & ((int)1 << 26)) != 0; - HW_AVX512ER = (info[1] & ((int)1 << 27)) != 0; - HW_AVX512VL = (info[1] & ((int)1 << 31)) != 0; - HW_AVX512BW = (info[1] & ((int)1 << 30)) != 0; - HW_AVX512DQ = (info[1] & ((int)1 << 17)) != 0; - HW_AVX512IFMA = (info[1] & ((int)1 << 21)) != 0; - HW_AVX512VBMI = (info[2] & ((int)1 << 1)) != 0; + HW_AVX2 = (info[1] & ((uint32_t)1 << 5)) != 0; + + HW_BMI1 = (info[1] & ((uint32_t)1 << 3)) != 0; + HW_BMI2 = (info[1] & ((uint32_t)1 << 8)) != 0; + HW_ADX = (info[1] & ((uint32_t)1 << 19)) != 0; + HW_SHA = (info[1] & ((uint32_t)1 << 29)) != 0; + HW_PREFETCHWT1 = (info[2] & ((uint32_t)1 << 0)) != 0; + + HW_AVX512F = (info[1] & ((uint32_t)1 << 16)) != 0; + HW_AVX512CD = (info[1] & ((uint32_t)1 << 28)) != 0; + HW_AVX512PF = (info[1] & ((uint32_t)1 << 26)) != 0; + HW_AVX512ER = (info[1] & ((uint32_t)1 << 27)) != 0; + HW_AVX512VL = (info[1] & ((uint32_t)1 << 31)) != 0; + HW_AVX512BW = (info[1] & ((uint32_t)1 << 30)) != 0; + HW_AVX512DQ = (info[1] & ((uint32_t)1 << 17)) != 0; + HW_AVX512IFMA = (info[1] & ((uint32_t)1 << 21)) != 0; + HW_AVX512VBMI = (info[2] & ((uint32_t)1 << 1)) != 0; } if (nExIds >= 0x80000001) { cpuid(info, 0x80000001); - HW_x64 = (info[3] & ((int)1 << 29)) != 0; - HW_ABM = (info[2] & ((int)1 << 5)) != 0; - HW_SSE4a = (info[2] & ((int)1 << 6)) != 0; - HW_FMA4 = (info[2] & ((int)1 << 16)) != 0; - HW_XOP = (info[2] & ((int)1 << 11)) != 0; + HW_x64 = (info[3] & ((uint32_t)1 << 29)) != 0; + HW_ABM = (info[2] & ((uint32_t)1 << 5)) != 0; + HW_SSE4a = (info[2] & ((uint32_t)1 << 6)) != 0; + HW_FMA4 = (info[2] & ((uint32_t)1 << 16)) != 0; + HW_XOP = (info[2] & ((uint32_t)1 << 11)) != 0; } } diff --git a/src/go.c b/src/go.c index 5d507768e50..88da6c0d14d 100644 --- a/src/go.c +++ b/src/go.c @@ -47,6 +47,7 @@ moves load_go_moves(char *filename) printf("%d\n", count); m.n = count; m.data = (char**)realloc(m.data, count * sizeof(char*)); + fclose(fp); return m; } diff --git a/src/image_opencv.cpp b/src/image_opencv.cpp index 6951fb9a80e..0a8ccd9841e 100644 --- a/src/image_opencv.cpp +++ b/src/image_opencv.cpp @@ -703,11 +703,12 @@ int set_capture_position_frame_cv(cap_cv *cap, int index) image get_image_from_stream_cpp(cap_cv *cap) { - cv::Mat *src = new cv::Mat(); + cv::Mat *src = NULL; static int once = 1; if (once) { once = 0; do { + if (src) delete src; src = get_capture_frame_cv(cap); if (!src) return make_empty_image(0, 0, 0); } while (src->cols < 1 || src->rows < 1 || src->channels() < 1); @@ -719,6 +720,7 @@ image get_image_from_stream_cpp(cap_cv *cap) if (!src) return 
make_empty_image(0, 0, 0); image im = mat_to_image(*src); rgbgr_image(im); + if (src) delete src; return im; } // ---------------------------------------- diff --git a/src/network.c b/src/network.c index c2249a54df6..15c9a9bca80 100644 --- a/src/network.c +++ b/src/network.c @@ -809,6 +809,7 @@ char *detection_to_json(detection *dets, int nboxes, int classes, char **names, const float thresh = 0.005; // function get_network_boxes() has already filtred dets by actual threshold char *send_buf = (char *)calloc(1024, sizeof(char)); + if (!send_buf) return 0; if (filename) { sprintf(send_buf, "{\n \"frame_id\":%lld, \n \"filename\":\"%s\", \n \"objects\": [ \n", frame_id, filename); } @@ -826,6 +827,7 @@ char *detection_to_json(detection *dets, int nboxes, int classes, char **names, if (class_id != -1) strcat(send_buf, ", \n"); class_id = j; char *buf = (char *)calloc(2048, sizeof(char)); + if (!buf) return 0; //sprintf(buf, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f}", // image_id, j, dets[i].bbox.x, dets[i].bbox.y, dets[i].bbox.w, dets[i].bbox.h, dets[i].prob[j]); @@ -836,7 +838,10 @@ char *detection_to_json(detection *dets, int nboxes, int classes, char **names, int buf_len = strlen(buf); int total_len = send_buf_len + buf_len + 100; send_buf = (char *)realloc(send_buf, total_len * sizeof(char)); - if (!send_buf) return 0;// exit(-1); + if (!send_buf) { + if (buf) free(buf); + return 0;// exit(-1); + } strcat(send_buf, buf); free(buf); } diff --git a/src/utils.c b/src/utils.c index bee427ed743..f18769ce76b 100644 --- a/src/utils.c +++ b/src/utils.c @@ -41,6 +41,7 @@ int *read_map(char *filename) map = (int*)realloc(map, n * sizeof(int)); map[n-1] = atoi(str); } + if (file) fclose(file); return map; } @@ -65,6 +66,7 @@ void shuffle(void *arr, size_t n, size_t size) memcpy((char*)arr+(j*size), (char*)arr+(i*size), size); memcpy((char*)arr+(i*size), swp, size); } + free(swp); } void del_arg(int argc, char **argv, int index) @@ -685,9 +687,9 @@ int max_index(float *a, int n) int top_max_index(float *a, int n, int k) { + if (n <= 0) return -1; float *values = (float*)calloc(k, sizeof(float)); int *indexes = (int*)calloc(k, sizeof(int)); - if (n <= 0) return -1; int i, j; for (i = 0; i < n; ++i) { for (j = 0; j < k; ++j) { diff --git a/src/yolo.c b/src/yolo.c index 711470eade2..339d49cd5ce 100644 --- a/src/yolo.c +++ b/src/yolo.c @@ -189,6 +189,14 @@ void validate_yolo(char *cfgfile, char *weightfile) free_image(val_resized[t]); } } + + if (fps) free(fps); + if (val) free(val); + if (val_resized) free(val_resized); + if (buf) free(buf); + if (buf_resized) free(buf_resized); + if (thr) free(thr); + fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start)); } From 71e835458904f782a905a06d28b4558d9e9830b4 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Fri, 15 Nov 2019 23:33:16 +0300 Subject: [PATCH 68/86] Fixed source and destination overlap in sprintf() (i.e. in strcpy() inside) --- src/utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils.c b/src/utils.c index f18769ce76b..4651cc0a1e8 100644 --- a/src/utils.c +++ b/src/utils.c @@ -218,7 +218,7 @@ void find_replace_extension(char *str, char *orig, char *rep, char *output) int offset = (p - buffer); int chars_from_end = strlen(buffer) - offset; if (!p || chars_from_end != strlen(orig)) { // Is 'orig' even in 'str' AND is 'orig' found at the end of 'str'? 
- sprintf(output, "%s", str); + sprintf(output, "%s", buffer); free(buffer); return; } From 0cf4c16c9fca93502333fef383e1287b38ca38f8 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 16 Nov 2019 16:50:01 +0300 Subject: [PATCH 69/86] Added GIoU to [Gaussian_yolo]. Added iou_thresh=0.213 to [Gaussian_yolo]. --- src/gaussian_yolo_layer.c | 188 +++++++++++++++++++++++++++++++++----- src/parser.c | 18 +++- src/yolo_layer.c | 117 ++++++++++-------------- 3 files changed, 225 insertions(+), 98 deletions(-) diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c index 3b58cc5a404..b834c0b13ce 100644 --- a/src/gaussian_yolo_layer.c +++ b/src/gaussian_yolo_layer.c @@ -81,7 +81,7 @@ layer make_gaussian_yolo_layer(int batch, int w, int h, int n, int total, int *m */ #endif - fprintf(stderr, "Gaussian_yolo\n"); + //fprintf(stderr, "Gaussian_yolo\n"); srand(time(0)); return l; @@ -140,32 +140,70 @@ box get_gaussian_yolo_box(float *x, float *biases, int n, int index, int i, int return b; } -float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride) +float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss, int accumulate) { box pred = get_gaussian_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride); - float iou = box_iou(pred, truth); - float tx = (truth.x*lw - i); - float ty = (truth.y*lh - j); - float tw = log(truth.w*w / biases[2*n]); - float th = log(truth.h*h / biases[2*n + 1]); + float iou; + ious all_ious = { 0 }; + all_ious.iou = box_iou(pred, truth); + all_ious.giou = box_giou(pred, truth); + if (pred.w == 0) { pred.w = 1.0; } + if (pred.h == 0) { pred.h = 1.0; } float sigma_const = 0.3; float epsi = pow(10,-9); - float in_exp_x = (tx - x[index + 0*stride])/x[index+1*stride]; + float dx, dy, dw, dh; + + if (iou_loss == MSE) { + iou = all_ious.iou; + + float tx = (truth.x*lw - i); + float ty = (truth.y*lh - j); + float tw = log(truth.w*w / biases[2 * n]); + float th = log(truth.h*h / biases[2 * n + 1]); + + dx = (tx - x[index + 0 * stride]); + dy = (ty - x[index + 2 * stride]); + dw = (tw - x[index + 4 * stride]); + dh = (th - x[index + 6 * stride]); + } + else + { + iou = all_ious.giou; + + // https://github.com/generalized-iou/g-darknet + // https://arxiv.org/abs/1902.09630v2 + // https://giou.stanford.edu/ + all_ious.dx_iou = dx_box_iou(pred, truth, iou_loss); + + // jacobian^t (transpose) + dx = (all_ious.dx_iou.dl + all_ious.dx_iou.dr); + dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db); + dw = ((-0.5 * all_ious.dx_iou.dl) + (0.5 * all_ious.dx_iou.dr)); + dh = ((-0.5 * all_ious.dx_iou.dt) + (0.5 * all_ious.dx_iou.db)); + + // normalize iou weight + dx *= iou_normalizer; + dy *= iou_normalizer; + dw *= iou_normalizer; + dh *= iou_normalizer; + } + + float in_exp_x = dx / x[index+1*stride]; float in_exp_x_2 = pow(in_exp_x, 2); float normal_dist_x = exp(in_exp_x_2*(-1./2.))/(sqrt(M_PI * 2.0)*(x[index+1*stride]+sigma_const)); - float in_exp_y = (ty - x[index + 2*stride])/x[index+3*stride]; + float in_exp_y = dy / x[index+3*stride]; float in_exp_y_2 = pow(in_exp_y, 2); float normal_dist_y = exp(in_exp_y_2*(-1./2.))/(sqrt(M_PI * 2.0)*(x[index+3*stride]+sigma_const)); - float in_exp_w = (tw - x[index + 4*stride])/x[index+5*stride]; + float in_exp_w = dw / x[index+5*stride]; float in_exp_w_2 = pow(in_exp_w, 2); float normal_dist_w 
= exp(in_exp_w_2*(-1./2.))/(sqrt(M_PI * 2.0)*(x[index+5*stride]+sigma_const)); - float in_exp_h = (th - x[index + 6*stride])/x[index+7*stride]; + float in_exp_h = dh / x[index+7*stride]; float in_exp_h_2 = pow(in_exp_h, 2); float normal_dist_h = exp(in_exp_h_2*(-1./2.))/(sqrt(M_PI * 2.0)*(x[index+7*stride]+sigma_const)); @@ -174,15 +212,26 @@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind float temp_w = (1./2.) * 1./(normal_dist_w+epsi) * normal_dist_w * scale; float temp_h = (1./2.) * 1./(normal_dist_h+epsi) * normal_dist_h * scale; - delta[index + 0*stride] = temp_x * in_exp_x * (1./x[index+1*stride]); - delta[index + 2*stride] = temp_y * in_exp_y * (1./x[index+3*stride]); - delta[index + 4*stride] = temp_w * in_exp_w * (1./x[index+5*stride]); - delta[index + 6*stride] = temp_h * in_exp_h * (1./x[index+7*stride]); + if (!accumulate) { + delta[index + 0 * stride] = 0; + delta[index + 1 * stride] = 0; + delta[index + 2 * stride] = 0; + delta[index + 3 * stride] = 0; + delta[index + 4 * stride] = 0; + delta[index + 5 * stride] = 0; + delta[index + 6 * stride] = 0; + delta[index + 7 * stride] = 0; + } - delta[index + 1*stride] = temp_x * (in_exp_x_2/x[index+1*stride] - 1./(x[index+1*stride]+sigma_const)); - delta[index + 3*stride] = temp_y * (in_exp_y_2/x[index+3*stride] - 1./(x[index+3*stride]+sigma_const)); - delta[index + 5*stride] = temp_w * (in_exp_w_2/x[index+5*stride] - 1./(x[index+5*stride]+sigma_const)); - delta[index + 7*stride] = temp_h * (in_exp_h_2/x[index+7*stride] - 1./(x[index+7*stride]+sigma_const)); + delta[index + 0*stride] += temp_x * in_exp_x * (1./x[index+1*stride]); + delta[index + 2*stride] += temp_y * in_exp_y * (1./x[index+3*stride]); + delta[index + 4*stride] += temp_w * in_exp_w * (1./x[index+5*stride]); + delta[index + 6*stride] += temp_h * in_exp_h * (1./x[index+7*stride]); + + delta[index + 1*stride] += temp_x * (in_exp_x_2/x[index+1*stride] - 1./(x[index+1*stride]+sigma_const)); + delta[index + 3*stride] += temp_y * (in_exp_y_2/x[index+3*stride] - 1./(x[index+3*stride]+sigma_const)); + delta[index + 5*stride] += temp_w * (in_exp_w_2/x[index+5*stride] - 1./(x[index+5*stride]+sigma_const)); + delta[index + 7*stride] += temp_h * (in_exp_h_2/x[index+7*stride] - 1./(x[index+7*stride]+sigma_const)); return iou; } @@ -201,6 +250,18 @@ void delta_gaussian_yolo_class(float *output, float *delta, int index, int class } } +int compare_gaussian_yolo_class(float *output, int classes, int class_index, int stride, float objectness, int class_id, float conf_thresh) +{ + int j; + for (j = 0; j < classes; ++j) { + float prob = objectness * output[class_index + stride*j]; + if (prob > conf_thresh) { + return 1; + } + } + return 0; +} + static int entry_gaussian_index(layer l, int batch, int location, int entry) { int n = location / (l.w*l.h); @@ -254,12 +315,31 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) for (n = 0; n < l.n; ++n) { int box_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 0); box pred = get_gaussian_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w*l.h); + float best_match_iou = 0; + int best_match_t = 0; float best_iou = 0; int best_t = 0; for(t = 0; t < l.max_boxes; ++t){ box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1); + int class_id = state.truth[t*(4 + 1) + b*l.truths + 4]; + if (class_id >= l.classes) { + printf(" Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. 
In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes - 1); + printf(" truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f, class_id = %d \n", truth.x, truth.y, truth.w, truth.h, class_id); + getchar(); + continue; // if label contains class_id more than number of classes in the cfg-file + } if(!truth.x) break; + + int class_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 9); + int obj_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 8); + float objectness = l.output[obj_index]; + int class_id_match = compare_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness, class_id, 0.25f); + float iou = box_iou(pred, truth); + if (iou > best_match_iou && class_id_match == 1) { + best_match_iou = iou; + best_match_t = t; + } if (iou > best_iou) { best_iou = iou; best_t = t; @@ -267,19 +347,19 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) } int obj_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 8); avg_anyobj += l.output[obj_index]; - l.delta[obj_index] = 0 - l.output[obj_index]; - if (best_iou > l.ignore_thresh) { + l.delta[obj_index] = l.cls_normalizer * (0 - l.output[obj_index]); + if (best_match_iou > l.ignore_thresh) { l.delta[obj_index] = 0; } if (best_iou > l.truth_thresh) { - l.delta[obj_index] = 1 - l.output[obj_index]; + l.delta[obj_index] = l.cls_normalizer * (1 - l.output[obj_index]); int class_id = state.truth[best_t*(4 + 1) + b*l.truths + 4]; if (l.map) class_id = l.map[class_id]; int class_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 9); delta_gaussian_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, 0); box truth = float_to_box_stride(state.truth + best_t*(4 + 1) + b*l.truths, 1); - delta_gaussian_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); + delta_gaussian_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, 1); } } } @@ -308,11 +388,11 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) int mask_n = int_index(l.mask, best_n, l.n); if(mask_n >= 0){ int box_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); - float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); + float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, 1); int obj_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 8); avg_obj += l.output[obj_index]; - l.delta[obj_index] = 1 - l.output[obj_index]; + l.delta[obj_index] = l.cls_normalizer * (1 - l.output[obj_index]); int class_id = state.truth[t*(4 + 1) + b*l.truths + 4]; if (l.map) class_id = l.map[class_id]; @@ -325,6 +405,64 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) if(iou > .75) recall75 += 1; avg_iou += iou; } + + + // iou_thresh + for (n = 0; n < l.total; ++n) { + int mask_n = int_index(l.mask, n, l.n); + if (mask_n >= 0 && n != best_n && l.iou_thresh < 1.0f) { + box pred = { 0 }; + pred.w = l.biases[2 * n] / state.net.w; + pred.h = l.biases[2 * n + 1] / state.net.h; + float iou = box_iou(pred, truth_shift); + // iou, n + + if (iou > l.iou_thresh) { + int box_index = entry_gaussian_index(l, b, 
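// iou_thresh multi-anchor matching: in addition to best_n, any other anchor
// in this layer's mask whose plain w/h IoU with the truth exceeds iou_thresh
// (0.213 per this commit's message; the default of 1 disables the block via
// the l.iou_thresh < 1.0f guard) also receives box, objectness and class
// deltas for the same ground truth.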
mask_n*l.w*l.h + j*l.w + i, 0); + float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, 1); + + int obj_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 8); + avg_obj += l.output[obj_index]; + l.delta[obj_index] = l.cls_normalizer * (1 - l.output[obj_index]); + + int class_id = state.truth[t*(4 + 1) + b*l.truths + 4]; + if (l.map) class_id = l.map[class_id]; + int class_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 9); + delta_gaussian_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, &avg_cat); + + ++count; + ++class_count; + if (iou > .5) recall += 1; + if (iou > .75) recall75 += 1; + avg_iou += iou; + } + } + } + } + + // averages the deltas obtained by the function: delta_yolo_box()_accumulate + for (j = 0; j < l.h; ++j) { + for (i = 0; i < l.w; ++i) { + for (n = 0; n < l.n; ++n) { + int box_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 0); + int class_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 9); + const int stride = l.w*l.h; + + int classes_in_one_box = 0; + for (n = 0; n < l.classes; ++n) { + if (l.delta[class_index + stride*n] > 0) classes_in_one_box++; + } + + l.delta[box_index + 0 * stride] /= classes_in_one_box; + l.delta[box_index + 1 * stride] /= classes_in_one_box; + l.delta[box_index + 2 * stride] /= classes_in_one_box; + l.delta[box_index + 3 * stride] /= classes_in_one_box; + l.delta[box_index + 4 * stride] /= classes_in_one_box; + l.delta[box_index + 5 * stride] /= classes_in_one_box; + l.delta[box_index + 6 * stride] /= classes_in_one_box; + l.delta[box_index + 7 * stride] /= classes_in_one_box; + } + } } } *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); diff --git a/src/parser.c b/src/parser.c index 472b179d20d..f0533f36e12 100644 --- a/src/parser.c +++ b/src/parser.c @@ -436,14 +436,28 @@ layer parse_gaussian_yolo(list *options, size_params params) // Gaussian_YOLOv3 char *a = option_find_str(options, "mask", 0); int *mask = parse_gaussian_yolo_mask(a, &num); layer l = make_gaussian_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes, max_boxes); - assert(l.outputs == params.inputs); + if (l.outputs != params.inputs) { + printf("Error: l.outputs == params.inputs \n"); + printf("filters= in the [convolutional]-layer doesn't correspond to classes= or mask= in [Gaussian_yolo]-layer \n"); + exit(EXIT_FAILURE); + } + //assert(l.outputs == params.inputs); l.scale_x_y = option_find_float_quiet(options, "scale_x_y", 1); - l.max_boxes = option_find_int_quiet(options, "max", 90); + l.iou_normalizer = option_find_float_quiet(options, "iou_normalizer", 0.75); + l.cls_normalizer = option_find_float_quiet(options, "cls_normalizer", 1); + char *iou_loss = option_find_str_quiet(options, "iou_loss", "mse"); // "iou"); + + if (strcmp(iou_loss, "mse") == 0) l.iou_loss = MSE; + else if (strcmp(iou_loss, "giou") == 0) l.iou_loss = GIOU; + else l.iou_loss = IOU; + fprintf(stderr, "[Gaussian_yolo] iou loss: %s, iou_norm: %2.2f, cls_norm: %2.2f, scale: %2.2f\n", (l.iou_loss == MSE ? "mse" : (l.iou_loss == GIOU ? 
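/* These keys come straight from the [Gaussian_yolo] section of the cfg; the
 * Gaussian_yolov3_BDD.cfg in this series sets, for example:
 *   iou_loss=giou  iou_normalizer=0.5  cls_normalizer=1.0  iou_thresh=0.213
 * Any iou_loss string other than "mse" or "giou" falls back to plain IOU. */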
"giou" : "iou")), l.iou_normalizer, l.cls_normalizer, l.scale_x_y); + l.jitter = option_find_float(options, "jitter", .2); l.ignore_thresh = option_find_float(options, "ignore_thresh", .5); l.truth_thresh = option_find_float(options, "truth_thresh", 1); + l.iou_thresh = option_find_float_quiet(options, "iou_thresh", 1); // recommended to use iou_thresh=0.213 in [yolo] l.random = option_find_int_quiet(options, "random", 0); char *map_file = option_find_str(options, "map", 0); diff --git a/src/yolo_layer.c b/src/yolo_layer.c index 6c02310617a..b663b73da74 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -128,72 +128,7 @@ box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw return b; } - -int compare_yolo_class(float *output, int classes, int class_index, int stride, float objectness, int class_id) -{ - const float conf_thresh = 0.25; - - int j; - for (j = 0; j < classes; ++j) { - float prob = objectness * output[class_index + stride*j]; - if (prob > conf_thresh) { - return 1; - } - } - return 0; -} - -ious delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss) -{ - ious all_ious = { 0 }; - // i - step in layer width - // j - step in layer height - // Returns a box in absolute coordinates - box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride); - all_ious.iou = box_iou(pred, truth); - all_ious.giou = box_giou(pred, truth); - // avoid nan in dx_box_iou - if (pred.w == 0) { pred.w = 1.0; } - if (pred.h == 0) { pred.h = 1.0; } - if (iou_loss == MSE) // old loss - { - float tx = (truth.x*lw - i); - float ty = (truth.y*lh - j); - float tw = log(truth.w*w / biases[2 * n]); - float th = log(truth.h*h / biases[2 * n + 1]); - - delta[index + 0 * stride] = scale * (tx - x[index + 0 * stride]); - delta[index + 1 * stride] = scale * (ty - x[index + 1 * stride]); - delta[index + 2 * stride] = scale * (tw - x[index + 2 * stride]); - delta[index + 3 * stride] = scale * (th - x[index + 3 * stride]); - } - else { - // https://github.com/generalized-iou/g-darknet - // https://arxiv.org/abs/1902.09630v2 - // https://giou.stanford.edu/ - all_ious.dx_iou = dx_box_iou(pred, truth, iou_loss); - - // jacobian^t (transpose) - delta[index + 0 * stride] = (all_ious.dx_iou.dl + all_ious.dx_iou.dr); - delta[index + 1 * stride] = (all_ious.dx_iou.dt + all_ious.dx_iou.db); - delta[index + 2 * stride] = ((-0.5 * all_ious.dx_iou.dl) + (0.5 * all_ious.dx_iou.dr)); - delta[index + 3 * stride] = ((-0.5 * all_ious.dx_iou.dt) + (0.5 * all_ious.dx_iou.db)); - - // predict exponential, apply gradient of e^delta_t ONLY for w,h - delta[index + 2 * stride] *= exp(x[index + 2 * stride]); - delta[index + 3 * stride] *= exp(x[index + 3 * stride]); - - // normalize iou weight - delta[index + 0 * stride] *= iou_normalizer; - delta[index + 1 * stride] *= iou_normalizer; - delta[index + 2 * stride] *= iou_normalizer; - delta[index + 3 * stride] *= iou_normalizer; - } - - return all_ious; -} - -ious delta_yolo_box_accumulate(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss) +ious delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss, int accumulate) { ious all_ious = { 0 }; // i - step in 
layer width @@ -240,6 +175,13 @@ ious delta_yolo_box_accumulate(box truth, float *x, float *biases, int n, int in dw *= iou_normalizer; dh *= iou_normalizer; + if (!accumulate) { + delta[index + 0 * stride] = 0; + delta[index + 1 * stride] = 0; + delta[index + 2 * stride] = 0; + delta[index + 3 * stride] = 0; + } + // accumulate delta delta[index + 0 * stride] += dx; delta[index + 1 * stride] += dy; @@ -287,6 +229,18 @@ void delta_yolo_class(float *output, float *delta, int index, int class_id, int } } +int compare_yolo_class(float *output, int classes, int class_index, int stride, float objectness, int class_id, float conf_thresh) +{ + int j; + for (j = 0; j < classes; ++j) { + float prob = objectness * output[class_index + stride*j]; + if (prob > conf_thresh) { + return 1; + } + } + return 0; +} + static int entry_index(layer l, int batch, int location, int entry) { int n = location / (l.w*l.h); @@ -351,7 +305,7 @@ void forward_yolo_layer(const layer l, network_state state) int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4); float objectness = l.output[obj_index]; - int class_id_match = compare_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness, class_id); + int class_id_match = compare_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness, class_id, 0.25f); float iou = box_iou(pred, truth); if (iou > best_match_iou && class_id_match == 1) { @@ -377,7 +331,7 @@ void forward_yolo_layer(const layer l, network_state state) int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); delta_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, 0, l.focal_loss); box truth = float_to_box_stride(state.truth + best_t*(4 + 1) + b*l.truths, 1); - delta_yolo_box_accumulate(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss); + delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, 1); } } } @@ -411,7 +365,7 @@ void forward_yolo_layer(const layer l, network_state state) int mask_n = int_index(l.mask, best_n, l.n); if (mask_n >= 0) { int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); - ious all_ious = delta_yolo_box_accumulate(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss); + ious all_ious = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, 1); // range is 0 <= 1 tot_iou += all_ious.iou; @@ -438,7 +392,7 @@ void forward_yolo_layer(const layer l, network_state state) // iou_thresh for (n = 0; n < l.total; ++n) { int mask_n = int_index(l.mask, n, l.n); - if (mask_n >= 0 && n != best_n) { + if (mask_n >= 0 && n != best_n && l.iou_thresh < 1.0f) { box pred = { 0 }; pred.w = l.biases[2 * n] / state.net.w; pred.h = l.biases[2 * n + 1] / state.net.h; @@ -447,7 +401,7 @@ void forward_yolo_layer(const layer l, network_state state) if (iou > l.iou_thresh) { int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); - ious all_ious = delta_yolo_box_accumulate(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, 
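// This series folds delta_yolo_box_accumulate() into delta_yolo_box() via a
// trailing 'accumulate' flag; every forward-pass call site now passes 1, so
// deltas from multiple matched truths sum up and are rescaled afterwards by
// the per-box averaging loop at the end of the layer.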
l.iou_loss); + ious all_ious = delta_yolo_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, 1); // range is 0 <= 1 tot_iou += all_ious.iou; @@ -473,6 +427,27 @@ void forward_yolo_layer(const layer l, network_state state) } } } + + // averages the deltas obtained by the function: delta_yolo_box()_accumulate + for (j = 0; j < l.h; ++j) { + for (i = 0; i < l.w; ++i) { + for (n = 0; n < l.n; ++n) { + int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); + int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); + const int stride = l.w*l.h; + + int classes_in_one_box = 0; + for (n = 0; n < l.classes; ++n) { + if (l.delta[class_index + stride*n] > 0) classes_in_one_box++; + } + + l.delta[box_index + 0 * stride] /= classes_in_one_box; + l.delta[box_index + 1 * stride] /= classes_in_one_box; + l.delta[box_index + 2 * stride] /= classes_in_one_box; + l.delta[box_index + 3 * stride] /= classes_in_one_box; + } + } + } } //*(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); //printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", state.index, avg_iou / count, avg_cat / class_count, avg_obj / count, avg_anyobj / (l.w*l.h*l.n*l.batch), recall / count, recall75 / count, count); From 0d30db35fc2d3d8edf52ef2870c7f18b1a6eabe3 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 16 Nov 2019 17:14:50 +0300 Subject: [PATCH 70/86] gaussian_yolo: added uc_normalizer and minor fix for iou_normalizer for GIoU. --- src/gaussian_yolo_layer.c | 59 ++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c index b834c0b13ce..7200fecd61b 100644 --- a/src/gaussian_yolo_layer.c +++ b/src/gaussian_yolo_layer.c @@ -140,7 +140,7 @@ box get_gaussian_yolo_box(float *x, float *biases, int n, int index, int i, int return b; } -float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss, int accumulate) +float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss, float uc_normalizer, int accumulate) { box pred = get_gaussian_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride); @@ -157,6 +157,7 @@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind float dx, dy, dw, dh; if (iou_loss == MSE) { + // MSE iou = all_ious.iou; float tx = (truth.x*lw - i); @@ -171,6 +172,7 @@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind } else { + // GIoU iou = all_ious.giou; // https://github.com/generalized-iou/g-darknet @@ -183,14 +185,9 @@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db); dw = ((-0.5 * all_ious.dx_iou.dl) + (0.5 * all_ious.dx_iou.dr)); dh = ((-0.5 * all_ious.dx_iou.dt) + (0.5 * all_ious.dx_iou.db)); - - // normalize iou weight - dx *= iou_normalizer; - dy *= iou_normalizer; - dw *= iou_normalizer; - dh *= iou_normalizer; } + // Gaussian float in_exp_x = dx / x[index+1*stride]; float in_exp_x_2 = pow(in_exp_x, 2); float normal_dist_x = exp(in_exp_x_2*(-1./2.))/(sqrt(M_PI * 2.0)*(x[index+1*stride]+sigma_const)); @@ -223,15 +220,39 
@@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind delta[index + 7 * stride] = 0; } - delta[index + 0*stride] += temp_x * in_exp_x * (1./x[index+1*stride]); - delta[index + 2*stride] += temp_y * in_exp_y * (1./x[index+3*stride]); - delta[index + 4*stride] += temp_w * in_exp_w * (1./x[index+5*stride]); - delta[index + 6*stride] += temp_h * in_exp_h * (1./x[index+7*stride]); - - delta[index + 1*stride] += temp_x * (in_exp_x_2/x[index+1*stride] - 1./(x[index+1*stride]+sigma_const)); - delta[index + 3*stride] += temp_y * (in_exp_y_2/x[index+3*stride] - 1./(x[index+3*stride]+sigma_const)); - delta[index + 5*stride] += temp_w * (in_exp_w_2/x[index+5*stride] - 1./(x[index+5*stride]+sigma_const)); - delta[index + 7*stride] += temp_h * (in_exp_h_2/x[index+7*stride] - 1./(x[index+7*stride]+sigma_const)); + float delta_x = temp_x * in_exp_x * (1. / x[index + 1 * stride]); + float delta_y = temp_y * in_exp_y * (1. / x[index + 3 * stride]); + float delta_w = temp_w * in_exp_w * (1. / x[index + 5 * stride]); + float delta_h = temp_h * in_exp_h * (1. / x[index + 7 * stride]); + + float delta_ux = temp_x * (in_exp_x_2 / x[index + 1 * stride] - 1. / (x[index + 1 * stride] + sigma_const)); + float delta_uy = temp_y * (in_exp_y_2 / x[index + 3 * stride] - 1. / (x[index + 3 * stride] + sigma_const)); + float delta_uw = temp_w * (in_exp_w_2 / x[index + 5 * stride] - 1. / (x[index + 5 * stride] + sigma_const)); + float delta_uh = temp_h * (in_exp_h_2 / x[index + 7 * stride] - 1. / (x[index + 7 * stride] + sigma_const)); + + if (iou_loss != MSE) { + // normalize iou weight, for GIoU + delta_x *= iou_normalizer; + delta_y *= iou_normalizer; + delta_w *= iou_normalizer; + delta_h *= iou_normalizer; + } + // normalize Uncertainty weight + delta_ux *= uc_normalizer; + delta_uy *= uc_normalizer; + delta_uw *= uc_normalizer; + delta_uh *= uc_normalizer; + + + delta[index + 0 * stride] += delta_x; + delta[index + 2 * stride] += delta_y; + delta[index + 4 * stride] += delta_w; + delta[index + 6 * stride] += delta_h; + + delta[index + 1 * stride] += delta_ux; + delta[index + 3 * stride] += delta_uy; + delta[index + 5 * stride] += delta_uw; + delta[index + 7 * stride] += delta_uh; return iou; } @@ -359,7 +380,7 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) int class_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 9); delta_gaussian_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, 0); box truth = float_to_box_stride(state.truth + best_t*(4 + 1) + b*l.truths, 1); - delta_gaussian_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, 1); + delta_gaussian_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, l.uc_normalizer, 1); } } } @@ -388,7 +409,7 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) int mask_n = int_index(l.mask, best_n, l.n); if(mask_n >= 0){ int box_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); - float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, 1); + float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), 
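// uc_normalizer scales only the four sigma (uncertainty) deltas, while
// iou_normalizer is now applied only to the mu deltas and only when
// iou_loss != MSE; the trailing literal 1 is the accumulate flag.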
l.w*l.h, l.iou_normalizer, l.iou_loss, l.uc_normalizer, 1); int obj_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 8); avg_obj += l.output[obj_index]; @@ -419,7 +440,7 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) if (iou > l.iou_thresh) { int box_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); - float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, 1); + float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, l.uc_normalizer, 1); int obj_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 8); avg_obj += l.output[obj_index]; From 920de66bfa85a9e694d78b6138114c723c854d7c Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 16 Nov 2019 17:23:21 +0300 Subject: [PATCH 71/86] gaussian_yolo: added uc_normalizer and minor fix for iou_normalizer for GIoU. --- src/parser.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/parser.c b/src/parser.c index f0533f36e12..3728c4422f9 100644 --- a/src/parser.c +++ b/src/parser.c @@ -444,8 +444,9 @@ layer parse_gaussian_yolo(list *options, size_params params) // Gaussian_YOLOv3 //assert(l.outputs == params.inputs); l.scale_x_y = option_find_float_quiet(options, "scale_x_y", 1); + l.uc_normalizer = option_find_float_quiet(options, "uc_normalizer", 1.0); l.iou_normalizer = option_find_float_quiet(options, "iou_normalizer", 0.75); - l.cls_normalizer = option_find_float_quiet(options, "cls_normalizer", 1); + l.cls_normalizer = option_find_float_quiet(options, "cls_normalizer", 1.0); char *iou_loss = option_find_str_quiet(options, "iou_loss", "mse"); // "iou"); if (strcmp(iou_loss, "mse") == 0) l.iou_loss = MSE; From 6e5bdf1282ad6b06ed0e962c3f5be67cf63d96dc Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 16 Nov 2019 18:16:37 +0300 Subject: [PATCH 72/86] Bug fix --- build/darknet/x64/cfg/Gaussian_yolov3_BDD.cfg | 807 ++++++++++++++++++ cfg/Gaussian_yolov3_BDD.cfg | 807 ++++++++++++++++++ include/darknet.h | 1 + src/gaussian_yolo_layer.c | 94 +- src/yolo_layer.c | 28 +- 5 files changed, 1715 insertions(+), 22 deletions(-) create mode 100644 build/darknet/x64/cfg/Gaussian_yolov3_BDD.cfg create mode 100644 cfg/Gaussian_yolov3_BDD.cfg diff --git a/build/darknet/x64/cfg/Gaussian_yolov3_BDD.cfg b/build/darknet/x64/cfg/Gaussian_yolov3_BDD.cfg new file mode 100644 index 00000000000..2ca7ec600e3 --- /dev/null +++ b/build/darknet/x64/cfg/Gaussian_yolov3_BDD.cfg @@ -0,0 +1,807 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=64 +subdivisions=16 +width=512 +height=512 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.0001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 +max_epochs = 300 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 
+activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 
+activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=57 +activation=linear + + +[Gaussian_yolo] +mask = 6,7,8 +anchors = 7,10, 14,24, 27,43, 32,97, 57,64, 92,109, 73,175, 141,178, 144,291 +classes=10 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +iou_thresh=0.213 +uc_normalizer=1.0 +cls_normalizer=1.0 +iou_normalizer=0.5 +iou_loss=giou +scale_x_y=1.0 +random=1 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 
+pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=57 +activation=linear + + +[Gaussian_yolo] +mask = 3,4,5 +anchors = 7,10, 14,24, 27,43, 32,97, 57,64, 92,109, 73,175, 141,178, 144,291 +classes=10 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +iou_thresh=0.213 +uc_normalizer=1.0 +cls_normalizer=1.0 +iou_normalizer=0.5 +iou_loss=giou +scale_x_y=1.0 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=57 +activation=linear + + +[Gaussian_yolo] +mask = 0,1,2 +anchors = 7,10, 14,24, 27,43, 32,97, 57,64, 92,109, 73,175, 141,178, 144,291 +classes=10 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +iou_thresh=0.213 +uc_normalizer=1.0 +cls_normalizer=1.0 +iou_normalizer=0.5 +iou_loss=giou +scale_x_y=1.0 +random=1 diff --git a/cfg/Gaussian_yolov3_BDD.cfg b/cfg/Gaussian_yolov3_BDD.cfg new file mode 100644 index 00000000000..2ca7ec600e3 --- /dev/null +++ b/cfg/Gaussian_yolov3_BDD.cfg @@ -0,0 +1,807 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=64 +subdivisions=16 +width=512 +height=512 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.0001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 +max_epochs = 300 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 
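# The three [Gaussian_yolo] heads later in this file differ only in mask=
# (which anchors each scale owns); their loss settings are identical:
#   iou_loss=giou       box regression by GIoU instead of MSE
#   iou_normalizer=0.5  weight on the GIoU coordinate deltas
#   uc_normalizer=1.0   weight on the uncertainty (sigma) deltas
#   iou_thresh=0.213    extra anchors above this IoU also match the truth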
+activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 
+filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=57 +activation=linear + + +[Gaussian_yolo] +mask = 6,7,8 +anchors = 7,10, 14,24, 27,43, 32,97, 57,64, 92,109, 73,175, 141,178, 144,291 +classes=10 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +iou_thresh=0.213 +uc_normalizer=1.0 +cls_normalizer=1.0 +iou_normalizer=0.5 +iou_loss=giou +scale_x_y=1.0 +random=1 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=57 +activation=linear + + +[Gaussian_yolo] +mask = 3,4,5 +anchors = 7,10, 14,24, 27,43, 32,97, 57,64, 92,109, 73,175, 141,178, 144,291 +classes=10 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +iou_thresh=0.213 +uc_normalizer=1.0 +cls_normalizer=1.0 +iou_normalizer=0.5 +iou_loss=giou +scale_x_y=1.0 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] 
+batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=57 +activation=linear + + +[Gaussian_yolo] +mask = 0,1,2 +anchors = 7,10, 14,24, 27,43, 32,97, 57,64, 92,109, 73,175, 141,178, 144,291 +classes=10 +num=9 +jitter=.3 +ignore_thresh = .5 +truth_thresh = 1 +iou_thresh=0.213 +uc_normalizer=1.0 +cls_normalizer=1.0 +iou_normalizer=0.5 +iou_loss=giou +scale_x_y=1.0 +random=1 diff --git a/include/darknet.h b/include/darknet.h index 2bd70d1cde3..7a906780c61 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -330,6 +330,7 @@ struct layer { float *weight_updates; float scale_x_y; + float uc_normalizer; float iou_normalizer; float cls_normalizer; IOU_LOSS iou_loss; diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c index 7200fecd61b..16971437872 100644 --- a/src/gaussian_yolo_layer.c +++ b/src/gaussian_yolo_layer.c @@ -256,6 +256,26 @@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind return iou; } +void averages_gaussian_yolo_deltas(int class_index, int box_index, int stride, int classes, float *delta) +{ + + int classes_in_one_box = 0; + int c; + for (c = 0; c < classes; ++c) { + if (delta[class_index + stride*c] > 0) classes_in_one_box++; + } + + if (classes_in_one_box > 0) { + delta[box_index + 0 * stride] /= classes_in_one_box; + delta[box_index + 1 * stride] /= classes_in_one_box; + delta[box_index + 2 * stride] /= classes_in_one_box; + delta[box_index + 3 * stride] /= classes_in_one_box; + delta[box_index + 4 * stride] /= classes_in_one_box; + delta[box_index + 5 * stride] /= classes_in_one_box; + delta[box_index + 6 * stride] /= classes_in_one_box; + delta[box_index + 7 * stride] /= classes_in_one_box; + } +} void delta_gaussian_yolo_class(float *output, float *delta, int index, int class_id, int classes, int stride, float *avg_cat) { @@ -469,25 +489,73 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) int class_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 9); const int stride = l.w*l.h; - int classes_in_one_box = 0; - for (n = 0; n < l.classes; ++n) { - if (l.delta[class_index + stride*n] > 0) classes_in_one_box++; - } + averages_gaussian_yolo_deltas(class_index, box_index, stride, l.classes, l.delta); + } + } + } + } + + + // calculate: Classification-loss, IoU-loss and Uncertainty-loss + const int stride = l.w*l.h; + float* classification_lost = (float *)calloc(l.batch * l.outputs, sizeof(float)); + memcpy(classification_lost, l.delta, l.batch * l.outputs * sizeof(float)); + - l.delta[box_index + 0 * stride] /= classes_in_one_box; - l.delta[box_index + 1 * stride] /= classes_in_one_box; - l.delta[box_index + 2 * stride] /= classes_in_one_box; - l.delta[box_index + 3 * stride] /= classes_in_one_box; - l.delta[box_index + 4 * stride] /= classes_in_one_box; - l.delta[box_index + 5 * stride] /= classes_in_one_box; - l.delta[box_index + 6 * stride] /= classes_in_one_box; - l.delta[box_index + 7 * stride] /= classes_in_one_box; + for (b = 0; b < l.batch; ++b) { + for (j = 0; j < l.h; ++j) { + for (i = 0; i < l.w; ++i) { + for (n = 0; n < l.n; ++n) { + int box_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 0); + + classification_lost[box_index + 0 * stride] = 0; + classification_lost[box_index + 1 * stride] = 0; + classification_lost[box_index + 2 * stride] = 0; + classification_lost[box_index + 3 * stride] = 0; + 
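// Loss decomposition for the printf below: this copy zeroes all eight box
// entries, so class_loss measures only the class + objectness deltas. The
// second copy (except_uncertainty_lost) zeroes offsets 4..7; note the sigma
// entries actually sit at odd offsets 1,3,5,7, so that copy appears to drop
// the w/h terms rather than the uncertainties -- a possible off-by-one.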
classification_lost[box_index + 4 * stride] = 0; + classification_lost[box_index + 5 * stride] = 0; + classification_lost[box_index + 6 * stride] = 0; + classification_lost[box_index + 7 * stride] = 0; } } } } + float class_loss = pow(mag_array(classification_lost, l.outputs * l.batch), 2); + free(classification_lost); + + + float* except_uncertainty_lost = (float *)calloc(l.batch * l.outputs, sizeof(float)); + memcpy(except_uncertainty_lost, l.delta, l.batch * l.outputs * sizeof(float)); + for (b = 0; b < l.batch; ++b) { + for (j = 0; j < l.h; ++j) { + for (i = 0; i < l.w; ++i) { + for (n = 0; n < l.n; ++n) { + int box_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 0); + except_uncertainty_lost[box_index + 4 * stride] = 0; + except_uncertainty_lost[box_index + 5 * stride] = 0; + except_uncertainty_lost[box_index + 6 * stride] = 0; + except_uncertainty_lost[box_index + 7 * stride] = 0; + } + } + } + } + float except_uc_loss = pow(mag_array(except_uncertainty_lost, l.outputs * l.batch), 2); + free(except_uncertainty_lost); + *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); - printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", state.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count); + + float loss = pow(mag_array(l.delta, l.outputs * l.batch), 2); + float uc_loss = loss - except_uc_loss; + float iou_loss = except_uc_loss - class_loss; + + loss /= l.batch; + class_loss /= l.batch; + uc_loss /= l.batch; + iou_loss /= l.batch; + + printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d, loss = %.2f, class_loss = %.2f, iou_loss = %.2f, uc_loss = %.2f \n", + state.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count, + loss, class_loss, iou_loss, uc_loss); } void backward_gaussian_yolo_layer(const layer l, network_state state) diff --git a/src/yolo_layer.c b/src/yolo_layer.c index b663b73da74..34185937459 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -192,6 +192,23 @@ ious delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, return all_ious; } +void averages_yolo_deltas(int class_index, int box_index, int stride, int classes, float *delta) +{ + + int classes_in_one_box = 0; + int c; + for (c = 0; c < classes; ++c) { + if (delta[class_index + stride*c] > 0) classes_in_one_box++; + } + + if (classes_in_one_box > 0) { + delta[box_index + 0 * stride] /= classes_in_one_box; + delta[box_index + 1 * stride] /= classes_in_one_box; + delta[box_index + 2 * stride] /= classes_in_one_box; + delta[box_index + 3 * stride] /= classes_in_one_box; + } +} + void delta_yolo_class(float *output, float *delta, int index, int class_id, int classes, int stride, float *avg_cat, int focal_loss) { int n; @@ -436,19 +453,12 @@ void forward_yolo_layer(const layer l, network_state state) int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); const int stride = l.w*l.h; - int classes_in_one_box = 0; - for (n = 0; n < l.classes; ++n) { - if (l.delta[class_index + stride*n] > 0) classes_in_one_box++; - } - - l.delta[box_index + 0 * stride] /= classes_in_one_box; - l.delta[box_index + 1 * stride] /= classes_in_one_box; - l.delta[box_index + 2 * stride] /= classes_in_one_box; - l.delta[box_index + 3 * stride] /= classes_in_one_box; + averages_yolo_deltas(class_index, box_index, stride, l.classes, l.delta); } } } } + //*(l.cost) = 
pow(mag_array(l.delta, l.outputs * l.batch), 2); //printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", state.index, avg_iou / count, avg_cat / class_count, avg_obj / count, avg_anyobj / (l.w*l.h*l.n*l.batch), recall / count, recall75 / count, count); From 5d0aa6ec522afd5d859c068f40eded77d6704fa0 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sun, 17 Nov 2019 01:18:28 +0300 Subject: [PATCH 73/86] Compile fix --- src/gaussian_yolo_layer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c index 16971437872..93569cf0eac 100644 --- a/src/gaussian_yolo_layer.c +++ b/src/gaussian_yolo_layer.c @@ -374,7 +374,7 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) int class_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 9); int obj_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 8); float objectness = l.output[obj_index]; - int class_id_match = compare_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness, class_id, 0.25f); + int class_id_match = compare_gaussian_yolo_class(l.output, l.classes, class_index, l.w*l.h, objectness, class_id, 0.25f); float iou = box_iou(pred, truth); if (iou > best_match_iou && class_id_match == 1) { From 77e60aecfe7f92de24fb1b96b5554fb16bc0ac06 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sun, 17 Nov 2019 20:42:21 +0300 Subject: [PATCH 74/86] Minor fix --- src/box.c | 1 + src/detector.c | 1 + src/gaussian_yolo_layer.c | 70 +++++++++++++++++++++------------------ 3 files changed, 39 insertions(+), 33 deletions(-) diff --git a/src/box.c b/src/box.c index c6a27ed587f..22f85884009 100644 --- a/src/box.c +++ b/src/box.c @@ -425,6 +425,7 @@ int nms_comparator_v3(const void *pa, const void *pb) float diff = 0; if (b.sort_class >= 0) { diff = a.prob[b.sort_class] - b.prob[b.sort_class]; + //diff = a.objectness*a.prob[b.sort_class] - b.objectness*b.prob[b.sort_class]; } else { diff = a.objectness - b.objectness; diff --git a/src/detector.c b/src/detector.c index 7f32c4fa7a6..8177343ef3d 100644 --- a/src/detector.c +++ b/src/detector.c @@ -801,6 +801,7 @@ float validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, floa } //detection *dets = get_network_boxes(&net, val[t].w, val[t].h, thresh, hier_thresh, 0, 1, &nboxes, letter_box); // for letter_box=1 if (nms) do_nms_sort(dets, nboxes, l.classes, nms); + //if (nms) do_nms_obj(dets, nboxes, l.classes, nms); char labelpath[4096]; replace_image_to_label(path, labelpath); diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c index 93569cf0eac..236e28e7d4e 100644 --- a/src/gaussian_yolo_layer.c +++ b/src/gaussian_yolo_layer.c @@ -156,36 +156,17 @@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind float dx, dy, dw, dh; - if (iou_loss == MSE) { - // MSE - iou = all_ious.iou; - - float tx = (truth.x*lw - i); - float ty = (truth.y*lh - j); - float tw = log(truth.w*w / biases[2 * n]); - float th = log(truth.h*h / biases[2 * n + 1]); - - dx = (tx - x[index + 0 * stride]); - dy = (ty - x[index + 2 * stride]); - dw = (tw - x[index + 4 * stride]); - dh = (th - x[index + 6 * stride]); - } - else - { - // GIoU - iou = all_ious.giou; + iou = all_ious.iou; - // https://github.com/generalized-iou/g-darknet - // https://arxiv.org/abs/1902.09630v2 - // https://giou.stanford.edu/ - all_ious.dx_iou = dx_box_iou(pred, truth, iou_loss); + float tx = (truth.x*lw - i); + float ty = (truth.y*lh - j); + float tw = log(truth.w*w / 
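// After this restructuring the MSE-style residuals dx..dh always drive the
// Gaussian density terms, so sigma is trained against the plain residual in
// both modes; when iou_loss != MSE, the GIoU branch below recomputes the mu
// deltas from dx_box_iou() and folds them in.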
biases[2 * n]); + float th = log(truth.h*h / biases[2 * n + 1]); - // jacobian^t (transpose) - dx = (all_ious.dx_iou.dl + all_ious.dx_iou.dr); - dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db); - dw = ((-0.5 * all_ious.dx_iou.dl) + (0.5 * all_ious.dx_iou.dr)); - dh = ((-0.5 * all_ious.dx_iou.dt) + (0.5 * all_ious.dx_iou.db)); - } + dx = (tx - x[index + 0 * stride]); + dy = (ty - x[index + 2 * stride]); + dw = (tw - x[index + 4 * stride]); + dh = (th - x[index + 6 * stride]); // Gaussian float in_exp_x = dx / x[index+1*stride]; @@ -231,19 +212,42 @@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind float delta_uh = temp_h * (in_exp_h_2 / x[index + 7 * stride] - 1. / (x[index + 7 * stride] + sigma_const)); if (iou_loss != MSE) { + // GIoU + iou = all_ious.giou; + + // https://github.com/generalized-iou/g-darknet + // https://arxiv.org/abs/1902.09630v2 + // https://giou.stanford.edu/ + all_ious.dx_iou = dx_box_iou(pred, truth, iou_loss); + + // jacobian^t (transpose) + float dx = (all_ious.dx_iou.dl + all_ious.dx_iou.dr); + float dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db); + float dw = ((-0.5 * all_ious.dx_iou.dl) + (0.5 * all_ious.dx_iou.dr)); + float dh = ((-0.5 * all_ious.dx_iou.dt) + (0.5 * all_ious.dx_iou.db)); + + // predict exponential, apply gradient of e^delta_t ONLY for w,h + dw *= exp(x[index + 4 * stride]); + dh *= exp(x[index + 6 * stride]); + // normalize iou weight, for GIoU - delta_x *= iou_normalizer; - delta_y *= iou_normalizer; - delta_w *= iou_normalizer; - delta_h *= iou_normalizer; + dx *= iou_normalizer; + dy *= iou_normalizer; + dw *= iou_normalizer; + dh *= iou_normalizer; + + delta_x = (delta_x + dx) / 2; + delta_y = (delta_y + dy) / 2; + delta_w = (delta_w + dw) / 2; + delta_h = (delta_h + dh) / 2; } + // normalize Uncertainty weight delta_ux *= uc_normalizer; delta_uy *= uc_normalizer; delta_uw *= uc_normalizer; delta_uh *= uc_normalizer; - delta[index + 0 * stride] += delta_x; delta[index + 2 * stride] += delta_y; delta[index + 4 * stride] += delta_w; From 10c40551dcadec68050befa6a1cecc6f69049d0d Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Mon, 18 Nov 2019 02:56:17 +0300 Subject: [PATCH 75/86] GIoU + Gaussian fix --- src/box.c | 3 +-- src/convolutional_layer.c | 5 +++++ src/gaussian_yolo_layer.c | 11 ++++++----- src/yolo_layer.c | 3 ++- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/box.c b/src/box.c index 22f85884009..cb28ce8e4d7 100644 --- a/src/box.c +++ b/src/box.c @@ -424,8 +424,7 @@ int nms_comparator_v3(const void *pa, const void *pb) detection b = *(detection *)pb; float diff = 0; if (b.sort_class >= 0) { - diff = a.prob[b.sort_class] - b.prob[b.sort_class]; - //diff = a.objectness*a.prob[b.sort_class] - b.objectness*b.prob[b.sort_class]; + diff = a.prob[b.sort_class] - b.prob[b.sort_class]; // there is already: prob = objectness*prob } else { diff = a.objectness - b.objectness; diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index f9d66ebf3b6..39b65a70d4c 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -405,6 +405,11 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.nweights = (c / groups) * n * size * size; if (l.share_layer) { + if (l.size != l.share_layer->size || l.nweights != l.share_layer->nweights || l.c != l.share_layer->c || l.n != l.share_layer->n) { + printf("Layer size, nweights, channels or filters don't match for the share_layer"); + getchar(); + } + l.weights = l.share_layer->weights; l.weight_updates = 
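// Sanity check above: weights/weight_updates are aliased to the share_layer's
// buffers rather than copied, so the two layers must agree on size, nweights,
// c and n or the shared pointers would be indexed out of bounds; getchar()
// pauses so the warning is not lost in the training log.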
l.share_layer->weight_updates;
diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c index 236e28e7d4e..109eb522c3a 100644 --- a/src/gaussian_yolo_layer.c +++ b/src/gaussian_yolo_layer.c @@ -236,10 +236,10 @@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind dw *= iou_normalizer; dh *= iou_normalizer; - delta_x = (delta_x + dx) / 2; - delta_y = (delta_y + dy) / 2; - delta_w = (delta_w + dw) / 2; - delta_h = (delta_h + dh) / 2; + delta_x = dx; + delta_y = dy; + delta_w = dw; + delta_h = dh; } // normalize Uncertainty weight @@ -299,7 +299,8 @@ int compare_gaussian_yolo_class(float *output, int classes, int class_index, int { int j; for (j = 0; j < classes; ++j) { - float prob = objectness * output[class_index + stride*j]; + //float prob = objectness * output[class_index + stride*j]; + float prob = output[class_index + stride*j]; if (prob > conf_thresh) { return 1; }
diff --git a/src/yolo_layer.c b/src/yolo_layer.c index 34185937459..08577db5ab4 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -250,7 +250,8 @@ int compare_yolo_class(float *output, int classes, int class_index, int stride, { int j; for (j = 0; j < classes; ++j) { - float prob = objectness * output[class_index + stride*j]; + //float prob = objectness * output[class_index + stride*j]; + float prob = output[class_index + stride*j]; if (prob > conf_thresh) { return 1; }

From b4c0fbaec86e5088848094cdaf58d56f94138a13 Mon Sep 17 00:00:00 2001
From: dccho
Date: Mon, 18 Nov 2019 09:52:22 -0500
Subject: [PATCH 76/86] fix memory free bug

--- src/convolutional_layer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index e784f5e4b89..805da22863d 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -769,8 +769,10 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h) l->binary_input_gpu = cuda_make_array(0, l->inputs*l->batch); } - cuda_free(l->activation_input_gpu); - if (l->activation == SWISH || l->activation == MISH) l->activation_input_gpu = cuda_make_array(l->activation_input, total_batch*l->outputs); + if (l->activation == SWISH || l->activation == MISH) { + cuda_free(l->activation_input_gpu); + l->activation_input_gpu = cuda_make_array(l->activation_input, total_batch*l->outputs); + } } #ifdef CUDNN cudnn_convolutional_setup(l, cudnn_fastest);

From 5a77940bd58ac29841a7ba36e18f73d32272c48a Mon Sep 17 00:00:00 2001
From: Mosè Giordano
Date: Thu, 21 Nov 2019 01:53:31 +0000
Subject: [PATCH 77/86] Add $(OBJDIR) as prerequisite to $(LIBNAMESO)

The directory must exist before this rule runs. When running `make` with several parallel jobs, it can happen that the $(LIBNAMESO) rule is run before the directory has been created.
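For illustration, a minimal sketch of the intended ordering (only names this Makefile already defines are used: $(LIBNAMESO), $(OBJDIR), $(OBJS), $(CPP); the real rule additionally compiles src/yolo_v2_class.cpp and passes $(COMMON), $(CFLAGS) and $(LDFLAGS)):

```
# The directory is a normal prerequisite, so it is created before linking:
$(LIBNAMESO): $(OBJDIR) $(OBJS)
	$(CPP) -shared $(OBJS) -o $@

$(OBJDIR):
	mkdir -p $(OBJDIR)
```

A GNU Make order-only prerequisite (`$(LIBNAMESO): $(OBJS) | $(OBJDIR)`) would give the same guarantee while not re-triggering the link when only the directory's timestamp changes.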
--- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 41e5fc8d737..c59991281ba 100644 --- a/Makefile +++ b/Makefile @@ -127,12 +127,12 @@ endif OBJS = $(addprefix $(OBJDIR), $(OBJ)) DEPS = $(wildcard src/*.h) Makefile include/darknet.h -all: obj backup results setchmod $(EXEC) $(LIBNAMESO) $(APPNAMESO) +all: $(OBJDIR) backup results setchmod $(EXEC) $(LIBNAMESO) $(APPNAMESO) ifeq ($(LIBSO), 1) CFLAGS+= -fPIC -$(LIBNAMESO): $(OBJS) include/yolo_v2_class.hpp src/yolo_v2_class.cpp +$(LIBNAMESO): $(OBJDIR) $(OBJS) include/yolo_v2_class.hpp src/yolo_v2_class.cpp $(CPP) -shared -std=c++11 -fvisibility=hidden -DLIB_EXPORTS $(COMMON) $(CFLAGS) $(OBJS) src/yolo_v2_class.cpp -o $@ $(LDFLAGS) $(APPNAMESO): $(LIBNAMESO) include/yolo_v2_class.hpp src/yolo_console_dll.cpp @@ -151,8 +151,8 @@ $(OBJDIR)%.o: %.cpp $(DEPS) $(OBJDIR)%.o: %.cu $(DEPS) $(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@ -obj: - mkdir -p obj +$(OBJDIR): + mkdir -p $(OBJDIR) backup: mkdir -p backup results: From f7a6f7b87cdfd49c9930d2c2a2d7fa5b52b30940 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Thu, 21 Nov 2019 14:11:52 +0300 Subject: [PATCH 78/86] Fixed MISH as in thomasbrandon/mish-cuda implementation with 1 Threshold --- src/activation_kernels.cu | 11 ++++++++--- src/activations.c | 9 ++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu index 67504e71611..6ef165ce768 100644 --- a/src/activation_kernels.cu +++ b/src/activation_kernels.cu @@ -204,9 +204,14 @@ __global__ void activate_array_mish_kernel(float *x, int n, float *activation_in { int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if (i < n) { + const float MISH_THRESHOLD = 20; float x_val = x[i]; activation_input[i] = x_val; // store value before activation - output_gpu[i] = x_val * tanh_activate_kernel(log(1 + expf(x_val))); + //output_gpu[i] = x_val * tanh_activate_kernel(log(1 + expf(x_val))); + + // https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L17-L20 + if (x_val < MISH_THRESHOLD) output_gpu[i] = x_val * tanh_activate_kernel(log(expf(x_val))); + else output_gpu[i] = x_val * tanh_activate_kernel(x_val); } } @@ -279,12 +284,12 @@ __global__ void gradient_array_mish_kernel(int n, float *activation_input_gpu, f { int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if (i < n) { - const float THRESHOLD = 20.0f; + const float MISH_THRESHOLD = 20.0f; // implementation from TensorFlow: https://github.com/tensorflow/addons/commit/093cdfa85d334cbe19a37624c33198f3140109ed // implementation from Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L26-L31 float inp = activation_input_gpu[i]; - const float sp = (inp < THRESHOLD) ? log1p(exp(inp)) : inp; + const float sp = (inp < MISH_THRESHOLD) ? 
log1p(exp(inp)) : inp; const float grad_sp = 1 - exp(-sp); const float tsp = tanh(sp); const float grad_tsp = (1 - tsp*tsp) * grad_sp; diff --git a/src/activations.c b/src/activations.c index 55b060bd94c..83580cb24a5 100644 --- a/src/activations.c +++ b/src/activations.c @@ -137,12 +137,15 @@ void activate_array_swish(float *x, const int n, float * output_sigmoid, float * // https://github.com/digantamisra98/Mish void activate_array_mish(float *x, const int n, float * activation_input, float * output) { + const float MISH_THRESHOLD = 20; int i; #pragma omp parallel for for (i = 0; i < n; ++i) { float x_val = x[i]; activation_input[i] = x_val; // store value before activation - output[i] = x_val * tanh_activate(log(1 + expf(x_val))); + //output[i] = x_val * tanh_activate(log(1 + expf(x_val))); + if (x_val < MISH_THRESHOLD) output[i] = x_val * tanh_activate(log(expf(x_val))); + else output[i] = x_val * tanh_activate(x_val); } } @@ -207,12 +210,12 @@ void gradient_array_mish(const int n, const float * activation_input, float * de int i; #pragma omp parallel for for (i = 0; i < n; ++i) { - const float THRESHOLD = 20.0f; + const float MISH_THRESHOLD = 20.0f; // implementation from TensorFlow: https://github.com/tensorflow/addons/commit/093cdfa85d334cbe19a37624c33198f3140109ed // implementation from Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L26-L31 float inp = activation_input[i]; - const float sp = (inp < THRESHOLD) ? log1p(exp(inp)) : inp; + const float sp = (inp < MISH_THRESHOLD) ? log1p(exp(inp)) : inp; const float grad_sp = 1 - exp(-sp); const float tsp = tanh(sp); const float grad_tsp = (1 - tsp*tsp) * grad_sp; From 7713a0209c0fc5d213db49243ab86306411fec5d Mon Sep 17 00:00:00 2001 From: Alexey Date: Fri, 22 Nov 2019 02:43:33 +0300 Subject: [PATCH 79/86] Update Readme.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5c0efdeff1a..13a13167dd4 100644 --- a/README.md +++ b/README.md @@ -632,6 +632,7 @@ Different tools for marking objects in images: 2. in Python: https://github.com/tzutalin/labelImg 3. in Python: https://github.com/Cartucho/OpenLabeling 4. in C++: https://www.ccoderun.ca/darkmark/ +5. in JavaScript: https://github.com/opencv/cvat ## Using Yolo9000 From b9ca5ec781291f01174d6b496a9c3ebc59303c1f Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Fri, 22 Nov 2019 14:20:53 +0300 Subject: [PATCH 80/86] Fixed MISH activation with 2 thresholds in Softplus --- src/activation_kernels.cu | 23 +++++++++++++++-------- src/activations.c | 6 ++---- src/activations.h | 5 +++++ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu index 6ef165ce768..5b357d3fce6 100644 --- a/src/activation_kernels.cu +++ b/src/activation_kernels.cu @@ -35,6 +35,11 @@ __device__ float relie_activate_kernel(float x){return (x>0) ? x : .01f*x;} __device__ float ramp_activate_kernel(float x){return x*(x>0)+.1f*x;} __device__ float leaky_activate_kernel(float x){return (x>0) ? 
x : .1f*x;} __device__ float tanh_activate_kernel(float x){return (2/(1 + expf(-2*x)) - 1);} +__device__ float softplus_kernel(float x, float threshold = 20) { + if (x > threshold) return x; // too large + else if (x < -threshold) return expf(x); // too small + return logf(expf(x) + 1); +} __device__ float plse_activate_kernel(float x) { if(x < -4) return .01f * (x + 4); @@ -207,11 +212,12 @@ __global__ void activate_array_mish_kernel(float *x, int n, float *activation_in const float MISH_THRESHOLD = 20; float x_val = x[i]; activation_input[i] = x_val; // store value before activation - //output_gpu[i] = x_val * tanh_activate_kernel(log(1 + expf(x_val))); + //output_gpu[i] = x_val * tanh_activate_kernel(logf(1 + expf(x_val))); - // https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L17-L20 - if (x_val < MISH_THRESHOLD) output_gpu[i] = x_val * tanh_activate_kernel(log(expf(x_val))); - else output_gpu[i] = x_val * tanh_activate_kernel(x_val); + // Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L17-L20 + // TF: https://github.com/tensorflow/addons/blob/093cdfa85d334cbe19a37624c33198f3140109ed/tensorflow_addons/custom_ops/activations/cc/kernels/mish_op.h#L40-L49 + // log1p(x) == log(x + 1) + output_gpu[i] = x_val * tanh_activate_kernel( softplus_kernel(x_val, MISH_THRESHOLD) ); } } @@ -286,11 +292,12 @@ __global__ void gradient_array_mish_kernel(int n, float *activation_input_gpu, f if (i < n) { const float MISH_THRESHOLD = 20.0f; - // implementation from TensorFlow: https://github.com/tensorflow/addons/commit/093cdfa85d334cbe19a37624c33198f3140109ed + // implementation from TensorFlow: https://github.com/tensorflow/addons/blob/093cdfa85d334cbe19a37624c33198f3140109ed/tensorflow_addons/custom_ops/activations/cc/kernels/mish_op.h#L66-L80 // implementation from Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L26-L31 - float inp = activation_input_gpu[i]; - const float sp = (inp < MISH_THRESHOLD) ? log1p(exp(inp)) : inp; - const float grad_sp = 1 - exp(-sp); + // log1p(x) == log(x + 1) + const float inp = activation_input_gpu[i]; + const float sp = softplus_kernel(inp, MISH_THRESHOLD); + const float grad_sp = 1 - expf(-sp); const float tsp = tanh(sp); const float grad_tsp = (1 - tsp*tsp) * grad_sp; const float grad = inp * grad_tsp + tsp; diff --git a/src/activations.c b/src/activations.c index 83580cb24a5..347a13ac47b 100644 --- a/src/activations.c +++ b/src/activations.c @@ -143,9 +143,7 @@ void activate_array_mish(float *x, const int n, float * activation_input, float for (i = 0; i < n; ++i) { float x_val = x[i]; activation_input[i] = x_val; // store value before activation - //output[i] = x_val * tanh_activate(log(1 + expf(x_val))); - if (x_val < MISH_THRESHOLD) output[i] = x_val * tanh_activate(log(expf(x_val))); - else output[i] = x_val * tanh_activate(x_val); + output[i] = x_val * tanh_activate( softplus_activate(x_val, MISH_THRESHOLD) ); } } @@ -215,7 +213,7 @@ void gradient_array_mish(const int n, const float * activation_input, float * de // implementation from TensorFlow: https://github.com/tensorflow/addons/commit/093cdfa85d334cbe19a37624c33198f3140109ed // implementation from Pytorch: https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L26-L31 float inp = activation_input[i]; - const float sp = (inp < MISH_THRESHOLD) ? 
log1p(exp(inp)) : inp; + const float sp = softplus_activate(inp, MISH_THRESHOLD); const float grad_sp = 1 - exp(-sp); const float tsp = tanh(sp); const float grad_tsp = (1 - tsp*tsp) * grad_sp; diff --git a/src/activations.h b/src/activations.h index bba5ca8d10a..edd5b304ff8 100644 --- a/src/activations.h +++ b/src/activations.h @@ -53,6 +53,11 @@ static inline float relie_activate(float x){return (x>0) ? x : .01f*x;} static inline float ramp_activate(float x){return x*(x>0)+.1f*x;} static inline float leaky_activate(float x){return (x>0) ? x : .1f*x;} static inline float tanh_activate(float x){return (expf(2*x)-1)/(expf(2*x)+1);} +static inline float softplus_activate(float x, float threshold) { + if (x > threshold) return x; // too large + else if (x < -threshold) return expf(x); // too small + return logf(expf(x) + 1); +} static inline float plse_activate(float x) { if(x < -4) return .01f * (x + 4); From 4f70fc14e9bd33d24efd64dff16e959e2e391d39 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 23 Nov 2019 16:42:43 +0300 Subject: [PATCH 81/86] Added DIoU and CIoU to [yolo] and [Gaussian_yolo] --- include/darknet.h | 19 +- src/box.c | 443 ++++++++++++++++++++++++++++++++++---- src/box.h | 1 + src/demo.c | 5 +- src/detector.c | 17 +- src/gaussian_yolo_layer.c | 102 +++++++-- src/parser.c | 36 +++- src/utils.c | 19 ++ src/utils.h | 1 + src/yolo_layer.c | 73 +++++++ 10 files changed, 644 insertions(+), 72 deletions(-) diff --git a/include/darknet.h b/include/darknet.h index 7a906780c61..851b65421c8 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -107,9 +107,20 @@ typedef enum { // parser.h typedef enum { - IOU, GIOU, MSE + IOU, GIOU, MSE, DIOU, CIOU } IOU_LOSS; +// parser.h +typedef enum { + DEFAULT_NMS, GREEDY_NMS, DIOU_NMS +} NMS_KIND; + +// parser.h +typedef enum { + YOLO_CENTER, YOLO_LEFT_TOP, YOLO_RIGHT_BOTTOM +} YOLO_POINT; + + // image.h typedef enum{ PNG, BMP, TGA, JPG @@ -334,6 +345,9 @@ struct layer { float iou_normalizer; float cls_normalizer; IOU_LOSS iou_loss; + NMS_KIND nms_kind; + float beta_nms; + YOLO_POINT yolo_point; char *align_bit_weights_gpu; float *mean_arr_gpu; @@ -719,7 +733,7 @@ typedef struct dxrep { // box.h typedef struct ious { - float iou, giou; + float iou, giou, diou, ciou; dxrep dx_iou; dxrep dx_giou; } ious; @@ -835,6 +849,7 @@ LIB_API load_args get_base_args(network *net); // box.h LIB_API void do_nms_sort(detection *dets, int total, int classes, float thresh); LIB_API void do_nms_obj(detection *dets, int total, int classes, float thresh); +LIB_API void diounms_sort(detection *dets, int total, int classes, float thresh, NMS_KIND nms_kind, float beta1); // network.h LIB_API float *network_predict(network net, float *input); diff --git a/src/box.c b/src/box.c index cb28ce8e4d7..e700dc7140c 100644 --- a/src/box.c +++ b/src/box.c @@ -1,8 +1,12 @@ -#include "box.h" +#include "box.h" #include #include #include +#ifndef M_PI +#define M_PI 3.141592 +#endif + box float_to_box(float *f) { box b; @@ -23,6 +27,23 @@ box float_to_box_stride(float *f, int stride) return b; } + +dbox derivative(box a, box b) +{ + dbox d; + d.dx = 0; + d.dw = 0; + d.dy = 0; + d.dh = 0; + d.dx = a.x < b.x ? 1.0 : -1.0; + d.dy = a.y < b.y ? 1.0 : -1.0; + d.dw = a.w < b.w ? 1.0 : -1.0; + d.dh = a.h < b.h ? 
1.0 : -1.0; + return d; +} + + +/* dbox derivative(box a, box b) { dbox d; @@ -73,6 +94,7 @@ dbox derivative(box a, box b) } return d; } +*/ // where c is the smallest box that fully encompases a and b boxabs box_c(box a, box b) { @@ -155,13 +177,75 @@ float box_giou(box a, box b) return iou - giou_term; } +float box_diou(box a, box b) +{ + boxabs ba = box_c(a, b); + float w = ba.right - ba.left; + float h = ba.bot - ba.top; + float c = w * w + h * h; + float iou = box_iou(a, b); + if (c == 0) { + return iou; + } + float d = (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); + float u = pow(d / c, 0.6); + float diou_term = u; +#ifdef DEBUG_PRINTS + printf(" c: %f, u: %f, riou_term: %f\n", c, u, diou_term); +#endif + return iou - diou_term; +} + +float box_diounms(box a, box b, float beta1) +{ + boxabs ba = box_c(a, b); + float w = ba.right - ba.left; + float h = ba.bot - ba.top; + float c = w * w + h * h; + float iou = box_iou(a, b); + if (c == 0) { + return iou; + } + float d = (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); + float u = pow(d / c, beta1); + float diou_term = u; +#ifdef DEBUG_PRINTS + printf(" c: %f, u: %f, riou_term: %f\n", c, u, diou_term); +#endif + return iou - diou_term; +} + +float box_ciou(box a, box b) +{ + boxabs ba = box_c(a, b); + float w = ba.right - ba.left; + float h = ba.bot - ba.top; + float c = w * w + h * h; + float iou = box_iou(a, b); + if (c == 0) { + return iou; + } + float u = (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); + float d = u / c; + float ar_gt = b.w / b.h; + float ar_pred = a.w / a.h; + float ar_loss = 4 / (M_PI * M_PI) * (atan(ar_gt) - atan(ar_pred)) * (atan(ar_gt) - atan(ar_pred)); + float alpha = ar_loss / (1 - iou + ar_loss + 0.000001); + float ciou_term = d + alpha * ar_loss; //ciou +#ifdef DEBUG_PRINTS + printf(" c: %f, u: %f, riou_term: %f\n", c, u, ciou_term); +#endif + return iou - ciou_term; +} + dxrep dx_box_iou(box pred, box truth, IOU_LOSS iou_loss) { - boxabs pred_tblr = to_tblr(pred); + boxabs pred_tblr = to_tblr(pred); float pred_t = fmin(pred_tblr.top, pred_tblr.bot); float pred_b = fmax(pred_tblr.top, pred_tblr.bot); float pred_l = fmin(pred_tblr.left, pred_tblr.right); float pred_r = fmax(pred_tblr.left, pred_tblr.right); - + //dbox dover = derivative(pred,truth); + //dbox diouu = diou(pred, truth); boxabs truth_tblr = to_tblr(truth); #ifdef DEBUG_PRINTS printf("\niou: %f, giou: %f\n", box_iou(pred, truth), box_giou(pred, truth)); @@ -170,24 +254,39 @@ dxrep dx_box_iou(box pred, box truth, IOU_LOSS iou_loss) { #endif //printf("pred (t,b,l,r): (%f, %f, %f, %f)\n", pred_t, pred_b, pred_l, pred_r); //printf("trut (t,b,l,r): (%f, %f, %f, %f)\n", truth_tblr.top, truth_tblr.bot, truth_tblr.left, truth_tblr.right); - dxrep dx = { 0 }; + dxrep ddx = {0}; float X = (pred_b - pred_t) * (pred_r - pred_l); float Xhat = (truth_tblr.bot - truth_tblr.top) * (truth_tblr.right - truth_tblr.left); float Ih = fmin(pred_b, truth_tblr.bot) - fmax(pred_t, truth_tblr.top); float Iw = fmin(pred_r, truth_tblr.right) - fmax(pred_l, truth_tblr.left); float I = Iw * Ih; float U = X + Xhat - I; - - float Cw = fmax(pred_r, truth_tblr.right) - fmin(pred_l, truth_tblr.left); - float Ch = fmax(pred_b, truth_tblr.bot) - fmin(pred_t, truth_tblr.top); - float C = Cw * Ch; - - // float IoU = I / U; - // Partial Derivatives, derivatives + float S = (pred.x-truth.x)*(pred.x-truth.x)+(pred.y-truth.y)*(pred.y-truth.y); + float giou_Cw = fmax(pred_r, truth_tblr.right) - fmin(pred_l, truth_tblr.left); + float giou_Ch = fmax(pred_b, 
truth_tblr.bot) - fmin(pred_t, truth_tblr.top); + float giou_C = giou_Cw * giou_Ch; + //float IoU = I / U; +//#ifdef DEBUG_PRINTS + //printf("X: %f", X); + //printf(", Xhat: %f", Xhat); + //printf(", Ih: %f", Ih); + //printf(", Iw: %f", Iw); + //printf(", I: %f", I); + //printf(", U: %f", U); + //printf(", IoU: %f\n", I / U); +//#endif + + //Partial Derivatives, derivatives float dX_wrt_t = -1 * (pred_r - pred_l); float dX_wrt_b = pred_r - pred_l; float dX_wrt_l = -1 * (pred_b - pred_t); float dX_wrt_r = pred_b - pred_t; + // UNUSED + //// Ground truth + //float dXhat_wrt_t = -1 * (truth_tblr.right - truth_tblr.left); + //float dXhat_wrt_b = truth_tblr.right - truth_tblr.left; + //float dXhat_wrt_l = -1 * (truth_tblr.bot - truth_tblr.top); + //float dXhat_wrt_r = truth_tblr.bot - truth_tblr.top; // gradient of I min/max in IoU calc (prediction) float dI_wrt_t = pred_t > truth_tblr.top ? (-1 * Iw) : 0; @@ -200,42 +299,262 @@ dxrep dx_box_iou(box pred, box truth, IOU_LOSS iou_loss) { float dU_wrt_l = dX_wrt_l - dI_wrt_l; float dU_wrt_r = dX_wrt_r - dI_wrt_r; // gradient of C min/max in IoU calc (prediction) - float dC_wrt_t = pred_t < truth_tblr.top ? (-1 * Cw) : 0; - float dC_wrt_b = pred_b > truth_tblr.bot ? Cw : 0; - float dC_wrt_l = pred_l < truth_tblr.left ? (-1 * Ch) : 0; - float dC_wrt_r = pred_r > truth_tblr.right ? Ch : 0; + float dC_wrt_t = pred_t < truth_tblr.top ? (-1 * giou_Cw) : 0; + float dC_wrt_b = pred_b > truth_tblr.bot ? giou_Cw : 0; + float dC_wrt_l = pred_l < truth_tblr.left ? (-1 * giou_Ch) : 0; + float dC_wrt_r = pred_r > truth_tblr.right ? giou_Ch : 0; - // Final IOU loss (prediction) (negative of IOU gradient, we want the negative loss) float p_dt = 0; float p_db = 0; float p_dl = 0; float p_dr = 0; - if (U > 0) { - p_dt = ((U * dI_wrt_t) - (I * dU_wrt_t)) / (U * U); - p_db = ((U * dI_wrt_b) - (I * dU_wrt_b)) / (U * U); - p_dl = ((U * dI_wrt_l) - (I * dU_wrt_l)) / (U * U); - p_dr = ((U * dI_wrt_r) - (I * dU_wrt_r)) / (U * U); + if (U > 0 ) { + p_dt = ((U * dI_wrt_t) - (I * dU_wrt_t)) / (U * U); + p_db = ((U * dI_wrt_b) - (I * dU_wrt_b)) / (U * U); + p_dl = ((U * dI_wrt_l) - (I * dU_wrt_l)) / (U * U); + p_dr = ((U * dI_wrt_r) - (I * dU_wrt_r)) / (U * U); } + // apply grad from prediction min/max for correct corner selection + p_dt = pred_tblr.top < pred_tblr.bot ? p_dt : p_db; + p_db = pred_tblr.top < pred_tblr.bot ? p_db : p_dt; + p_dl = pred_tblr.left < pred_tblr.right ? p_dl : p_dr; + p_dr = pred_tblr.left < pred_tblr.right ? 
p_dr : p_dl; - // GIoU = I/U - (C-U)/C - // C is the smallest convex hull that encloses both Detection and Truth if (iou_loss == GIOU) { + if (giou_C > 0) { + // apply "C" term from gIOU + p_dt += ((giou_C * dU_wrt_t) - (U * dC_wrt_t)) / (giou_C * giou_C); + p_db += ((giou_C * dU_wrt_b) - (U * dC_wrt_b)) / (giou_C * giou_C); + p_dl += ((giou_C * dU_wrt_l) - (U * dC_wrt_l)) / (giou_C * giou_C); + p_dr += ((giou_C * dU_wrt_r) - (U * dC_wrt_r)) / (giou_C * giou_C); + } + if (Iw<=0||Ih<=0) { + p_dt = ((giou_C * dU_wrt_t) - (U * dC_wrt_t)) / (giou_C * giou_C); + p_db = ((giou_C * dU_wrt_b) - (U * dC_wrt_b)) / (giou_C * giou_C); + p_dl = ((giou_C * dU_wrt_l) - (U * dC_wrt_l)) / (giou_C * giou_C); + p_dr = ((giou_C * dU_wrt_r) - (U * dC_wrt_r)) / (giou_C * giou_C); + } + } + + float Ct = fmin(pred.y - pred.h / 2,truth.y - truth.h / 2); + float Cb = fmax(pred.y + pred.h / 2,truth.y + truth.h / 2); + float Cl = fmin(pred.x - pred.w / 2,truth.x - truth.w / 2); + float Cr = fmax(pred.x + pred.w / 2,truth.x + truth.w / 2); + float Cw = Cr - Cl; + float Ch = Cb - Ct; + float C = Cw * Cw + Ch * Ch; + + float dCt_dx = 0; + float dCt_dy = pred_t < truth_tblr.top ? 1 : 0; + float dCt_dw = 0; + float dCt_dh = pred_t < truth_tblr.top ? -0.5 : 0; + + float dCb_dx = 0; + float dCb_dy = pred_b > truth_tblr.bot ? 1 : 0; + float dCb_dw = 0; + float dCb_dh = pred_b > truth_tblr.bot ? 0.5: 0; + + float dCl_dx = pred_l < truth_tblr.left ? 1 : 0; + float dCl_dy = 0; + float dCl_dw = pred_l < truth_tblr.left ? -0.5 : 0; + float dCl_dh = 0; + + float dCr_dx = pred_r > truth_tblr.right ? 1 : 0; + float dCr_dy = 0; + float dCr_dw = pred_r > truth_tblr.right ? 0.5 : 0; + float dCr_dh = 0; + + float dCw_dx = dCr_dx - dCl_dx; + float dCw_dy = dCr_dy - dCl_dy; + float dCw_dw = dCr_dw - dCl_dw; + float dCw_dh = dCr_dh - dCl_dh; + + float dCh_dx = dCb_dx - dCt_dx; + float dCh_dy = dCb_dy - dCt_dy; + float dCh_dw = dCb_dw - dCt_dw; + float dCh_dh = dCb_dh - dCt_dh; + + // UNUSED + //// ground truth + //float dI_wrt_xhat_t = pred_t < truth_tblr.top ? (-1 * Iw) : 0; + //float dI_wrt_xhat_b = pred_b > truth_tblr.bot ? Iw : 0; + //float dI_wrt_xhat_l = pred_l < truth_tblr.left ? (-1 * Ih) : 0; + //float dI_wrt_xhat_r = pred_r > truth_tblr.right ? Ih : 0; + + // Final IOU loss (prediction) (negative of IOU gradient, we want the negative loss) + float p_dx = 0; + float p_dy = 0; + float p_dw = 0; + float p_dh = 0; + + p_dx = p_dl + p_dr; //p_dx, p_dy, p_dw and p_dh are the gradient of IoU or GIoU. + p_dy = p_dt + p_db; + p_dw = (p_dr - p_dl); //For dw and dh, we do not divided by 2. + p_dh = (p_db - p_dt); + if (iou_loss == DIOU) { + if (C > 0) { + p_dx += (2*(truth.x-pred.x)*C-(2*Cw*dCw_dx+2*Ch*dCh_dx)*S) / (C * C); + p_dy += (2*(truth.y-pred.y)*C-(2*Cw*dCw_dy+2*Ch*dCh_dy)*S) / (C * C); + p_dw += (2*Cw*dCw_dw+2*Ch*dCh_dw)*S / (C * C); + p_dh += (2*Cw*dCw_dh+2*Ch*dCh_dh)*S / (C * C); + } + if (Iw<=0||Ih<=0){ + p_dx = (2*(truth.x-pred.x)*C-(2*Cw*dCw_dx+2*Ch*dCh_dx)*S) / (C * C); + p_dy = (2*(truth.y-pred.y)*C-(2*Cw*dCw_dy+2*Ch*dCh_dy)*S) / (C * C); + p_dw = (2*Cw*dCw_dw+2*Ch*dCh_dw)*S / (C * C); + p_dh = (2*Cw*dCw_dh+2*Ch*dCh_dh)*S / (C * C); + } + } + //The following codes are calculating the gradient of ciou. 
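+    // Reference for the block below (see box_ciou() and https://arxiv.org/abs/1911.08287):
+    // the penalty being differentiated is R_CIoU = S/C + alpha*v, where
+    // S = (x - x_gt)^2 + (y - y_gt)^2 is the squared center distance,
+    // C = Cw*Cw + Ch*Ch is the squared diagonal of the smallest enclosing box,
+    // v = (4/pi^2) * (atan(w_gt/h_gt) - atan(w/h))^2 penalizes aspect-ratio mismatch,
+    // and alpha = v / ((1 - IoU) + v + 1e-6); the DIOU branch above uses only the S/C term.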
+ + if (iou_loss == CIOU) { + float ar_gt = truth.w / truth.h; + float ar_pred = pred.w / pred.h; + float ar_loss = 4 / (M_PI * M_PI) * (atan(ar_gt) - atan(ar_pred)) * (atan(ar_gt) - atan(ar_pred)); + float alpha = ar_loss / (1 - I/U + ar_loss + 0.000001); + float ar_dw=8/(M_PI*M_PI)*(atan(ar_gt)-atan(ar_pred))*pred.h; + float ar_dh=-8/(M_PI*M_PI)*(atan(ar_gt)-atan(ar_pred))*pred.w; if (C > 0) { - // apply "C" term from gIOU - p_dt += ((C * dU_wrt_t) - (U * dC_wrt_t)) / (C * C); - p_db += ((C * dU_wrt_b) - (U * dC_wrt_b)) / (C * C); - p_dl += ((C * dU_wrt_l) - (U * dC_wrt_l)) / (C * C); - p_dr += ((C * dU_wrt_r) - (U * dC_wrt_r)) / (C * C); + // dar* + p_dx += (2*(truth.x-pred.x)*C-(2*Cw*dCw_dx+2*Ch*dCh_dx)*S) / (C * C); + p_dy += (2*(truth.y-pred.y)*C-(2*Cw*dCw_dy+2*Ch*dCh_dy)*S) / (C * C); + p_dw += (2*Cw*dCw_dw+2*Ch*dCh_dw)*S / (C * C) + alpha * ar_dw; + p_dh += (2*Cw*dCw_dh+2*Ch*dCh_dh)*S / (C * C) + alpha * ar_dh; + } + if (Iw<=0||Ih<=0){ + p_dx = (2*(truth.x-pred.x)*C-(2*Cw*dCw_dx+2*Ch*dCh_dx)*S) / (C * C); + p_dy = (2*(truth.y-pred.y)*C-(2*Cw*dCw_dy+2*Ch*dCh_dy)*S) / (C * C); + p_dw = (2*Cw*dCw_dw+2*Ch*dCh_dw)*S / (C * C) + alpha * ar_dw; + p_dh = (2*Cw*dCw_dh+2*Ch*dCh_dh)*S / (C * C) + alpha * ar_dh; } } + ddx.dt = p_dx; //We follow the original code released from GDarknet. So in yolo_layer.c, dt, db, dl, dr are already dx, dy, dw, dh. + ddx.db = p_dy; + ddx.dl = p_dw; + ddx.dr = p_dh; + + // UNUSED + //// ground truth + //float gt_dt = ((U * dI_wrt_xhat_t) - (I * (dXhat_wrt_t - dI_wrt_xhat_t))) / (U * U); + //float gt_db = ((U * dI_wrt_xhat_b) - (I * (dXhat_wrt_b - dI_wrt_xhat_b))) / (U * U); + //float gt_dl = ((U * dI_wrt_xhat_l) - (I * (dXhat_wrt_l - dI_wrt_xhat_l))) / (U * U); + //float gt_dr = ((U * dI_wrt_xhat_r) - (I * (dXhat_wrt_r - dI_wrt_xhat_r))) / (U * U); + + // no min/max grad applied + //dx.dt = dt; + //dx.db = db; + //dx.dl = dl; + //dx.dr = dr; + + //// sum in gt -- THIS DOESNT WORK + //dx.dt += gt_dt; + //dx.db += gt_db; + //dx.dl += gt_dl; + //dx.dr += gt_dr; + + //// instead, look at the change between pred and gt, and weight t/b/l/r appropriately... + //// need the real derivative here (I think?) + //float delta_t = fmax(truth_tblr.top, pred_t) - fmin(truth_tblr.top, pred_t); + //float delta_b = fmax(truth_tblr.bot, pred_b) - fmin(truth_tblr.bot, pred_b); + //float delta_l = fmax(truth_tblr.left, pred_l) - fmin(truth_tblr.left, pred_l); + //float delta_r = fmax(truth_tblr.right, pred_r) - fmin(truth_tblr.right, pred_r); + + //dx.dt *= delta_t / (delta_t + delta_b); + //dx.db *= delta_b / (delta_t + delta_b); + //dx.dl *= delta_l / (delta_l + delta_r); + //dx.dr *= delta_r / (delta_l + delta_r); + + // UNUSED + //// ground truth + //float gt_dt = ((U * dI_wrt_xhat_t) - (I * (dXhat_wrt_t - dI_wrt_xhat_t))) / (U * U); + //float gt_db = ((U * dI_wrt_xhat_b) - (I * (dXhat_wrt_b - dI_wrt_xhat_b))) / (U * U); + //float gt_dl = ((U * dI_wrt_xhat_l) - (I * (dXhat_wrt_l - dI_wrt_xhat_l))) / (U * U); + //float gt_dr = ((U * dI_wrt_xhat_r) - (I * (dXhat_wrt_r - dI_wrt_xhat_r))) / (U * U); + + // no min/max grad applied + //dx.dt = dt; + //dx.db = db; + //dx.dl = dl; + //dx.dr = dr; + // apply grad from prediction min/max for correct corner selection - dx.dt = pred_tblr.top < pred_tblr.bot ? p_dt : p_db; - dx.db = pred_tblr.top < pred_tblr.bot ? p_db : p_dt; - dx.dl = pred_tblr.left < pred_tblr.right ? p_dl : p_dr; - dx.dr = pred_tblr.left < pred_tblr.right ? p_dr : p_dl; + //dx.dt = pred_tblr.top < pred_tblr.bot ? p_dt : p_db; + //dx.db = pred_tblr.top < pred_tblr.bot ? 
p_db : p_dt; + //dx.dl = pred_tblr.left < pred_tblr.right ? p_dl : p_dr; + //dx.dr = pred_tblr.left < pred_tblr.right ? p_dr : p_dl; + + //// sum in gt -- THIS DOESNT WORK + //dx.dt += gt_dt; + //dx.db += gt_db; + //dx.dl += gt_dl; + //dx.dr += gt_dr; + + //// instead, look at the change between pred and gt, and weight t/b/l/r appropriately... + //// need the real derivative here (I think?) + //float delta_t = fmax(truth_tblr.top, pred_t) - fmin(truth_tblr.top, pred_t); + //float delta_b = fmax(truth_tblr.bot, pred_b) - fmin(truth_tblr.bot, pred_b); + //float delta_l = fmax(truth_tblr.left, pred_l) - fmin(truth_tblr.left, pred_l); + //float delta_r = fmax(truth_tblr.right, pred_r) - fmin(truth_tblr.right, pred_r); + + //dx.dt *= delta_t / (delta_t + delta_b); + //dx.db *= delta_b / (delta_t + delta_b); + //dx.dl *= delta_l / (delta_l + delta_r); + //dx.dr *= delta_r / (delta_l + delta_r); + +//#ifdef DEBUG_PRINTS + /*printf(" directions dt: "); + if ((pred_tblr.top < truth_tblr.top && dx.dt > 0) || (pred_tblr.top > truth_tblr.top && dx.dt < 0)) { + printf("✓"); + } else { + printf("𝒙"); + } + printf(", "); + if ((pred_tblr.bot < truth_tblr.bot && dx.db > 0) || (pred_tblr.bot > truth_tblr.bot && dx.db < 0)) { + printf("✓"); + } else { + printf("𝒙"); + } + printf(", "); + if ((pred_tblr.left < truth_tblr.left && dx.dl > 0) || (pred_tblr.left > truth_tblr.left && dx.dl < 0)) { + printf("✓"); + } else { + printf("𝒙"); + } + printf(", "); + if ((pred_tblr.right < truth_tblr.right && dx.dr > 0) || (pred_tblr.right > truth_tblr.right && dx.dr < 0)) { + printf("✓"); + } else { + printf("𝒙"); + } + printf("\n"); + + printf("dx dt:%f", dx.dt); + printf(", db: %f", dx.db); + printf(", dl: %f", dx.dl); + printf(", dr: %f | ", dx.dr); +#endif - return dx; +#ifdef DEBUG_NAN + if (isnan(dx.dt)) { printf("dt isnan\n"); } + if (isnan(dx.db)) { printf("db isnan\n"); } + if (isnan(dx.dl)) { printf("dl isnan\n"); } + if (isnan(dx.dr)) { printf("dr isnan\n"); } +#endif + +// // No update if 0 or nan +// if (dx.dt == 0 || isnan(dx.dt)) { dx.dt = 1; } +// if (dx.db == 0 || isnan(dx.db)) { dx.db = 1; } +// if (dx.dl == 0 || isnan(dx.dl)) { dx.dl = 1; } +// if (dx.dr == 0 || isnan(dx.dr)) { dx.dr = 1; } +// +//#ifdef DEBUG_PRINTS +// printf("dx dt:%f (t: %f, p: %f)", dx.dt, gt_dt, p_dt); +// printf(", db: %f (t: %f, p: %f)", dx.db, gt_db, p_db); +// printf(", dl: %f (t: %f, p: %f)", dx.dl, gt_dl, p_dl); +// printf(", dr: %f (t: %f, p: %f) | ", dx.dr, gt_dr, p_dr); +//#endif */ + return ddx; } float box_rmse(box a, box b) @@ -351,13 +670,13 @@ void test_box() dbox diou(box a, box b) { - float u = box_union(a,b); - float i = box_intersection(a,b); - dbox di = dintersect(a,b); - dbox du = dunion(a,b); - dbox dd = {0,0,0,0}; + float u = box_union(a, b); + float i = box_intersection(a, b); + dbox di = dintersect(a, b); + dbox du = dunion(a, b); + dbox dd = { 0,0,0,0 }; - if(i <= 0 || 1) { + if (i <= 0 || 1) { dd.dx = b.x - a.x; dd.dy = b.y - a.y; dd.dw = b.w - a.w; @@ -365,10 +684,10 @@ dbox diou(box a, box b) return dd; } - dd.dx = 2*pow((1-(i/u)),1)*(di.dx*u - du.dx*i)/(u*u); - dd.dy = 2*pow((1-(i/u)),1)*(di.dy*u - du.dy*i)/(u*u); - dd.dw = 2*pow((1-(i/u)),1)*(di.dw*u - du.dw*i)/(u*u); - dd.dh = 2*pow((1-(i/u)),1)*(di.dh*u - du.dh*i)/(u*u); + dd.dx = (di.dx*u - du.dx*i) / (u*u); + dd.dy = (di.dy*u - du.dy*i) / (u*u); + dd.dw = (di.dw*u - du.dw*i) / (u*u); + dd.dh = (di.dh*u - du.dh*i) / (u*u); return dd; } @@ -524,6 +843,44 @@ void do_nms(box *boxes, float **probs, int total, int classes, float thresh) } } +void 
diounms_sort(detection *dets, int total, int classes, float thresh, NMS_KIND nms_kind, float beta1) +{ + int i, j, k; + k = total - 1; + for (i = 0; i <= k; ++i) { + if (dets[i].objectness == 0) { + detection swap = dets[i]; + dets[i] = dets[k]; + dets[k] = swap; + --k; + --i; + } + } + total = k + 1; + + for (k = 0; k < classes; ++k) { + for (i = 0; i < total; ++i) { + dets[i].sort_class = k; + } + qsort(dets, total, sizeof(detection), nms_comparator); + for (i = 0; i < total; ++i) { + if (dets[i].prob[k] == 0) continue; + box a = dets[i].bbox; + for (j = i + 1; j < total; ++j) { + box b = dets[j].bbox; + if (box_iou(a, b) > thresh && nms_kind == GREEDY_NMS) { + dets[j].prob[k] = 0; + } + else { + if (box_diounms(a, b, beta1) > thresh && nms_kind == DIOU_NMS) { + dets[j].prob[k] = 0; + } + } + } + } + } +} + box encode_box(box b, box anchor) { box encode; diff --git a/src/box.h b/src/box.h index 172c135293c..c7f0fb4394a 100644 --- a/src/box.h +++ b/src/box.h @@ -42,6 +42,7 @@ void do_nms(box *boxes, float **probs, int total, int classes, float thresh); void do_nms_sort_v2(box *boxes, float **probs, int total, int classes, float thresh); //LIB_API void do_nms_sort(detection *dets, int total, int classes, float thresh); //LIB_API void do_nms_obj(detection *dets, int total, int classes, float thresh); +//LIB_API void diounms_sort(detection *dets, int total, int classes, float thresh, NMS_KIND nms_kind, float beta1); box decode_box(box b, box anchor); box encode_box(box b, box anchor); diff --git a/src/demo.c b/src/demo.c index 6c7f5d39848..63debde1a9d 100644 --- a/src/demo.c +++ b/src/demo.c @@ -213,7 +213,10 @@ void demo(char *cfgfile, char *weightfile, float thresh, float hier_thresh, int detection *local_dets = dets; //if (nms) do_nms_obj(local_dets, local_nboxes, l.classes, nms); // bad results - if (nms) do_nms_sort(local_dets, local_nboxes, l.classes, nms); + if (nms) { + if (l.nms_kind == DEFAULT_NMS) do_nms_sort(local_dets, local_nboxes, l.classes, nms); + else diounms_sort(local_dets, local_nboxes, l.classes, nms, l.nms_kind, l.beta_nms); + } //printf("\033[2J"); //printf("\033[1;1H"); diff --git a/src/detector.c b/src/detector.c index 8177343ef3d..feb86d5583f 100644 --- a/src/detector.c +++ b/src/detector.c @@ -243,7 +243,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i calc_map_for_each = fmax(calc_map_for_each, 100); int next_map_calc = iter_map + calc_map_for_each; next_map_calc = fmax(next_map_calc, net.burn_in); - next_map_calc = fmax(next_map_calc, 400); + //next_map_calc = fmax(next_map_calc, 400); if (calc_map) { printf("\n (next mAP calculation at %d iterations) ", next_map_calc); if (mean_average_precision > 0) printf("\n Last accuracy mAP@0.5 = %2.2f %%, best = %2.2f %% ", mean_average_precision * 100, best_map * 100); @@ -537,7 +537,10 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out int nboxes = 0; int letterbox = (args.type == LETTERBOX_DATA); detection *dets = get_network_boxes(&net, w, h, thresh, .5, map, 0, &nboxes, letterbox); - if (nms) do_nms_sort(dets, nboxes, classes, nms); + if (nms) { + if (l.nms_kind == DEFAULT_NMS) do_nms_sort(dets, nboxes, l.classes, nms); + else diounms_sort(dets, nboxes, l.classes, nms, l.nms_kind, l.beta_nms); + } if (coco) { print_cocos(fp, path, dets, nboxes, classes, w, h); } @@ -800,7 +803,10 @@ float validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, floa dets = get_network_boxes(&net, 1, 1, thresh, hier_thresh, 0, 0, &nboxes, letter_box); } 
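+            // nms_kind dispatch: DEFAULT_NMS keeps the classic greedy-IoU suppression (do_nms_sort),
+            // while other kinds go through diounms_sort(); with DIOU_NMS a box is suppressed by
+            // box_diounms(a, b, beta_nms), i.e. IoU reduced by the center-distance penalty
+            // (rho^2 / c^2)^beta_nms, with rho the center distance and c the enclosing-box diagonal.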
//detection *dets = get_network_boxes(&net, val[t].w, val[t].h, thresh, hier_thresh, 0, 1, &nboxes, letter_box); // for letter_box=1 - if (nms) do_nms_sort(dets, nboxes, l.classes, nms); + if (nms) { + if (l.nms_kind == DEFAULT_NMS) do_nms_sort(dets, nboxes, l.classes, nms); + else diounms_sort(dets, nboxes, l.classes, nms, l.nms_kind, l.beta_nms); + } //if (nms) do_nms_obj(dets, nboxes, l.classes, nms); char labelpath[4096]; @@ -1354,7 +1360,10 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam int nboxes = 0; detection *dets = get_network_boxes(&net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes, letter_box); - if (nms) do_nms_sort(dets, nboxes, l.classes, nms); + if (nms) { + if (l.nms_kind == DEFAULT_NMS) do_nms_sort(dets, nboxes, l.classes, nms); + else diounms_sort(dets, nboxes, l.classes, nms, l.nms_kind, l.beta_nms); + } draw_detections_v3(im, dets, nboxes, thresh, names, alphabet, l.classes, ext_output); save_image(im, "predictions"); if (!dont_show) { diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c index 109eb522c3a..ae3ee648140 100644 --- a/src/gaussian_yolo_layer.c +++ b/src/gaussian_yolo_layer.c @@ -130,24 +130,43 @@ void resize_gaussian_yolo_layer(layer *l, int w, int h) #endif } -box get_gaussian_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride) +box get_gaussian_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride, YOLO_POINT yolo_point) { box b; - b.x = (i + x[index + 0*stride]) / lw; - b.y = (j + x[index + 2*stride]) / lh; - b.w = exp(x[index + 4*stride]) * biases[2*n] / w; - b.h = exp(x[index + 6*stride]) * biases[2*n+1] / h; + + if (yolo_point == YOLO_CENTER) { + b.w = exp(x[index + 4 * stride]) * biases[2 * n] / w; + b.h = exp(x[index + 6 * stride]) * biases[2 * n + 1] / h; + b.x = (i + x[index + 0 * stride]) / lw; + b.y = (j + x[index + 2 * stride]) / lh; + } + else if (yolo_point == YOLO_LEFT_TOP) { + b.w = exp(x[index + 4 * stride]) * biases[2 * n] / w; + b.h = exp(x[index + 6 * stride]) * biases[2 * n + 1] / h; + b.x = (i + x[index + 0 * stride]) / lw + b.w/2; + b.y = (j + x[index + 2 * stride]) / lh + b.h/2; + } + else if (yolo_point == YOLO_RIGHT_BOTTOM) { + b.w = exp(x[index + 4 * stride]) * biases[2 * n] / w; + b.h = exp(x[index + 6 * stride]) * biases[2 * n + 1] / h; + b.x = (i + x[index + 0 * stride]) / lw - b.w / 2; + b.y = (j + x[index + 2 * stride]) / lh - b.h / 2; + } + return b; } -float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss, float uc_normalizer, int accumulate) +float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, + float scale, int stride, float iou_normalizer, IOU_LOSS iou_loss, float uc_normalizer, int accumulate, YOLO_POINT yolo_point) { - box pred = get_gaussian_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride); + box pred = get_gaussian_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride, yolo_point); float iou; ious all_ious = { 0 }; all_ious.iou = box_iou(pred, truth); all_ious.giou = box_giou(pred, truth); + all_ious.diou = box_diou(pred, truth); + all_ious.ciou = box_ciou(pred, truth); if (pred.w == 0) { pred.w = 1.0; } if (pred.h == 0) { pred.h = 1.0; } @@ -158,10 +177,24 @@ float delta_gaussian_yolo_box(box truth, float 
*x, float *biases, int n, int ind iou = all_ious.iou; - float tx = (truth.x*lw - i); - float ty = (truth.y*lh - j); - float tw = log(truth.w*w / biases[2 * n]); - float th = log(truth.h*h / biases[2 * n + 1]); + float tx, ty, tw, th; + + tw = log(truth.w*w / biases[2 * n]); + th = log(truth.h*h / biases[2 * n + 1]); + + if (yolo_point == YOLO_CENTER) { + tx = (truth.x*lw - i); + ty = (truth.y*lh - j); + + } + else if (yolo_point == YOLO_LEFT_TOP) { + tx = ((truth.x - truth.w / 2)*lw - i); + ty = ((truth.y - truth.h / 2)*lh - j); + } + else if (yolo_point == YOLO_RIGHT_BOTTOM) { + tx = ((truth.x + truth.w / 2)*lw - i); + ty = ((truth.y + truth.h / 2)*lh - j); + } dx = (tx - x[index + 0 * stride]); dy = (ty - x[index + 2 * stride]); @@ -220,9 +253,24 @@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind // https://giou.stanford.edu/ all_ious.dx_iou = dx_box_iou(pred, truth, iou_loss); + float dx, dy; + + if (yolo_point == YOLO_CENTER) { + dx = (all_ious.dx_iou.dl + all_ious.dx_iou.dr); + dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db); + } + else if (yolo_point == YOLO_LEFT_TOP) { + dx = all_ious.dx_iou.dl; + dy = all_ious.dx_iou.dt; + } + else if (yolo_point == YOLO_RIGHT_BOTTOM) { + dx = all_ious.dx_iou.dr; + dy = all_ious.dx_iou.db; + } + // jacobian^t (transpose) - float dx = (all_ious.dx_iou.dl + all_ious.dx_iou.dr); - float dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db); + //float dx = (all_ious.dx_iou.dl + all_ious.dx_iou.dr); + //float dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db); float dw = ((-0.5 * all_ious.dx_iou.dl) + (0.5 * all_ious.dx_iou.dr)); float dh = ((-0.5 * all_ious.dx_iou.dt) + (0.5 * all_ious.dx_iou.db)); @@ -360,7 +408,7 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) for (i = 0; i < l.w; ++i) { for (n = 0; n < l.n; ++n) { int box_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 0); - box pred = get_gaussian_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w*l.h); + box pred = get_gaussian_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.w*l.h, l.yolo_point); float best_match_iou = 0; int best_match_t = 0; float best_iou = 0; @@ -405,7 +453,7 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) int class_index = entry_gaussian_index(l, b, n*l.w*l.h + j*l.w + i, 9); delta_gaussian_yolo_class(l.output, l.delta, class_index, class_id, l.classes, l.w*l.h, 0); box truth = float_to_box_stride(state.truth + best_t*(4 + 1) + b*l.truths, 1); - delta_gaussian_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, l.uc_normalizer, 1); + delta_gaussian_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, l.uc_normalizer, 1, l.yolo_point); } } } @@ -416,8 +464,22 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) if(!truth.x) break; float best_iou = 0; int best_n = 0; - i = (truth.x * l.w); - j = (truth.y * l.h); + //i = (truth.x * l.w); + //j = (truth.y * l.h); + + if (l.yolo_point == YOLO_CENTER) { + i = (truth.x * l.w); + j = (truth.y * l.h); + } + else if (l.yolo_point == YOLO_LEFT_TOP) { + i = ((truth.x - truth.w / 2) * l.w); + j = ((truth.y - truth.h / 2) * l.h); + } + else if (l.yolo_point == YOLO_RIGHT_BOTTOM) { + i = ((truth.x + truth.w / 2) * l.w); + j 
= ((truth.y + truth.h / 2) * l.h); + } + box truth_shift = truth; truth_shift.x = truth_shift.y = 0; for(n = 0; n < l.total; ++n){ @@ -434,7 +496,7 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) int mask_n = int_index(l.mask, best_n, l.n); if(mask_n >= 0){ int box_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); - float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, l.uc_normalizer, 1); + float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2-truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, l.uc_normalizer, 1, l.yolo_point); int obj_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 8); avg_obj += l.output[obj_index]; @@ -465,7 +527,7 @@ void forward_gaussian_yolo_layer(const layer l, network_state state) if (iou > l.iou_thresh) { int box_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); - float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, l.uc_normalizer, 1); + float iou = delta_gaussian_yolo_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, state.net.w, state.net.h, l.delta, (2 - truth.w*truth.h), l.w*l.h, l.iou_normalizer, l.iou_loss, l.uc_normalizer, 1, l.yolo_point); int obj_index = entry_gaussian_index(l, b, mask_n*l.w*l.h + j*l.w + i, 8); avg_obj += l.output[obj_index]; @@ -671,7 +733,7 @@ int get_gaussian_yolo_detections(layer l, int w, int h, int netw, int neth, floa if (objectness > thresh) { int box_index = entry_gaussian_index(l, 0, n*l.w*l.h + i, 0); - dets[count].bbox = get_gaussian_yolo_box(predictions, l.biases, l.mask[n], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h); + dets[count].bbox = get_gaussian_yolo_box(predictions, l.biases, l.mask[n], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h, l.yolo_point); dets[count].objectness = objectness; dets[count].classes = l.classes; diff --git a/src/parser.c b/src/parser.c index 3728c4422f9..e4e970695f0 100644 --- a/src/parser.c +++ b/src/parser.c @@ -372,8 +372,21 @@ layer parse_yolo(list *options, size_params params) if (strcmp(iou_loss, "mse") == 0) l.iou_loss = MSE; else if (strcmp(iou_loss, "giou") == 0) l.iou_loss = GIOU; + else if (strcmp(iou_loss, "diou") == 0) l.iou_loss = DIOU; + else if (strcmp(iou_loss, "ciou") == 0) l.iou_loss = CIOU; else l.iou_loss = IOU; - fprintf(stderr, "[yolo] params: iou loss: %s, iou_norm: %2.2f, cls_norm: %2.2f, scale_x_y: %2.2f\n", (l.iou_loss == MSE ? "mse" : (l.iou_loss == GIOU ? 
"giou" : "iou")), l.iou_normalizer, l.cls_normalizer, l.scale_x_y); + fprintf(stderr, "[yolo] params: iou loss: %s (%d), iou_norm: %2.2f, cls_norm: %2.2f, scale_x_y: %2.2f\n", + iou_loss, l.iou_loss, l.iou_normalizer, l.cls_normalizer, l.scale_x_y); + + l.beta_nms = option_find_float_quiet(options, "beta_nms", 0.6); + char *nms_kind = option_find_str(options, "nms_kind", "default"); + if (strcmp(nms_kind, "default") == 0) l.nms_kind = DEFAULT_NMS; + else { + if (strcmp(nms_kind, "greedynms") == 0) l.nms_kind = GREEDY_NMS; + else if (strcmp(nms_kind, "diounms") == 0) l.nms_kind = DIOU_NMS; + else l.nms_kind = DEFAULT_NMS; + printf("nms_kind: %s (%d), beta = %f \n", nms_kind, l.nms_kind, l.beta_nms); + } l.jitter = option_find_float(options, "jitter", .2); l.focal_loss = option_find_int_quiet(options, "focal_loss", 0); @@ -451,8 +464,27 @@ layer parse_gaussian_yolo(list *options, size_params params) // Gaussian_YOLOv3 if (strcmp(iou_loss, "mse") == 0) l.iou_loss = MSE; else if (strcmp(iou_loss, "giou") == 0) l.iou_loss = GIOU; + else if (strcmp(iou_loss, "diou") == 0) l.iou_loss = DIOU; + else if (strcmp(iou_loss, "ciou") == 0) l.iou_loss = CIOU; else l.iou_loss = IOU; - fprintf(stderr, "[Gaussian_yolo] iou loss: %s, iou_norm: %2.2f, cls_norm: %2.2f, scale: %2.2f\n", (l.iou_loss == MSE ? "mse" : (l.iou_loss == GIOU ? "giou" : "iou")), l.iou_normalizer, l.cls_normalizer, l.scale_x_y); + + l.beta_nms = option_find_float_quiet(options, "beta_nms", 0.6); + char *nms_kind = option_find_str(options, "nms_kind", "default"); + if (strcmp(nms_kind, "default") == 0) l.nms_kind = DEFAULT_NMS; + else { + if (strcmp(nms_kind, "greedynms") == 0) l.nms_kind = GREEDY_NMS; + else if (strcmp(nms_kind, "diounms") == 0) l.nms_kind = DIOU_NMS; + else l.nms_kind = DEFAULT_NMS; + printf("nms_kind: %s (%d), beta = %f \n", nms_kind, l.nms_kind, l.beta_nms); + } + + char *yolo_point = option_find_str_quiet(options, "yolo_point", "center"); + if (strcmp(yolo_point, "left_top") == 0) l.yolo_point = YOLO_LEFT_TOP; + else if (strcmp(yolo_point, "right_bottom") == 0) l.yolo_point = YOLO_RIGHT_BOTTOM; + else l.yolo_point = YOLO_CENTER; + + fprintf(stderr, "[Gaussian_yolo] iou loss: %s (%d), iou_norm: %2.2f, cls_norm: %2.2f, scale: %2.2f, point: %d\n", + iou_loss, l.iou_loss, l.iou_normalizer, l.cls_normalizer, l.scale_x_y, l.yolo_point); l.jitter = option_find_float(options, "jitter", .2); diff --git a/src/utils.c b/src/utils.c index 4651cc0a1e8..af7cb9eaf01 100644 --- a/src/utils.c +++ b/src/utils.c @@ -916,4 +916,23 @@ int max_int_index(int *a, int n) } } return max_i; +} + +// Absolute box from relative coordinate bounding box and image size +boxabs box_to_boxabs(const box* b, const int img_w, const int img_h, const int bounds_check) +{ + boxabs ba; + ba.left = (b->x - b->w / 2.)*img_w; + ba.right = (b->x + b->w / 2.)*img_w; + ba.top = (b->y - b->h / 2.)*img_h; + ba.bot = (b->y + b->h / 2.)*img_h; + + if (bounds_check) { + if (ba.left < 0) ba.left = 0; + if (ba.right > img_w - 1) ba.right = img_w - 1; + if (ba.top < 0) ba.top = 0; + if (ba.bot > img_h - 1) ba.bot = img_h - 1; + } + + return ba; } \ No newline at end of file diff --git a/src/utils.h b/src/utils.h index fe4efe04ef1..998209067dd 100644 --- a/src/utils.h +++ b/src/utils.h @@ -80,6 +80,7 @@ int check_array_is_inf(float *arr, int size); int int_index(int *a, int val, int n); int *random_index_order(int min, int max); int max_int_index(int *a, int n); +boxabs box_to_boxabs(const box* b, const int img_w, const int img_h, const int bounds_check); #ifdef 
__cplusplus } diff --git a/src/yolo_layer.c b/src/yolo_layer.c index 08577db5ab4..7a7ba6da56e 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -137,6 +137,8 @@ ious delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride); all_ious.iou = box_iou(pred, truth); all_ious.giou = box_giou(pred, truth); + all_ious.diou = box_diou(pred, truth); + all_ious.ciou = box_ciou(pred, truth); // avoid nan in dx_box_iou if (pred.w == 0) { pred.w = 1.0; } if (pred.h == 0) { pred.h = 1.0; } @@ -289,8 +291,12 @@ void forward_yolo_layer(const layer l, network_state state) //float avg_iou = 0; float tot_iou = 0; float tot_giou = 0; + float tot_diou = 0; + float tot_ciou = 0; float tot_iou_loss = 0; float tot_giou_loss = 0; + float tot_diou_loss = 0; + float tot_ciou_loss = 0; float recall = 0; float recall75 = 0; float avg_cat = 0; @@ -392,6 +398,12 @@ void forward_yolo_layer(const layer l, network_state state) tot_giou += all_ious.giou; tot_giou_loss += 1 - all_ious.giou; + tot_diou += all_ious.diou; + tot_diou_loss += 1 - all_ious.diou; + + tot_ciou += all_ious.ciou; + tot_ciou_loss += 1 - all_ious.ciou; + int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4); avg_obj += l.output[obj_index]; l.delta[obj_index] = l.cls_normalizer * (1 - l.output[obj_index]); @@ -428,6 +440,12 @@ void forward_yolo_layer(const layer l, network_state state) tot_giou += all_ious.giou; tot_giou_loss += 1 - all_ious.giou; + tot_diou += all_ious.diou; + tot_diou_loss += 1 - all_ious.diou; + + tot_ciou += all_ious.ciou; + tot_ciou_loss += 1 - all_ious.ciou; + int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4); avg_obj += l.output[obj_index]; l.delta[obj_index] = l.cls_normalizer * (1 - l.output[obj_index]); @@ -508,6 +526,60 @@ void backward_yolo_layer(const layer l, network_state state) axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1); } +// Converts output of the network to detection boxes +// w,h: image width,height +// netw,neth: network width,height +// relative: 1 (all callers seems to pass TRUE) +void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative) +{ + int i; + // network height (or width) + int new_w = 0; + // network height (or width) + int new_h = 0; + // Compute scale given image w,h vs network w,h + // I think this "rotates" the image to match network to input image w/h ratio + // new_h and new_w are really just network width and height + if (((float)netw / w) < ((float)neth / h)) { + new_w = netw; + new_h = (h * netw) / w; + } + else { + new_h = neth; + new_w = (w * neth) / h; + } + // difference between network width and "rotated" width + float deltaw = netw - new_w; + // difference between network height and "rotated" height + float deltah = neth - new_h; + // ratio between rotated network width and network width + float ratiow = (float)new_w / netw; + // ratio between rotated network width and network width + float ratioh = (float)new_h / neth; + for (i = 0; i < n; ++i) { + + box b = dets[i].bbox; + // x = ( x - (deltaw/2)/netw ) / ratiow; + // x - [(1/2 the difference of the network width and rotated width) / (network width)] + b.x = (b.x - deltaw / 2. / netw) / ratiow; + b.y = (b.y - deltah / 2. / neth) / ratioh; + // scale to match rotation of incoming image + b.w *= 1 / ratiow; + b.h *= 1 / ratioh; + + // relative seems to always be == 1, I don't think we hit this condition, ever. 
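+        // (If a caller ever passed relative == 0, this branch would convert the box from
+        // normalized [0..1] units to absolute pixel coordinates of the original w x h image.)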
+ if (!relative) { + b.x *= w; + b.w *= w; + b.y *= h; + b.h *= h; + } + + dets[i].bbox = b; + } +} + +/* void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative, int letter) { int i; @@ -542,6 +614,7 @@ void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth dets[i].bbox = b; } } +*/ int yolo_num_detections(layer l, float thresh) { From d43e09cdf24708b61cbd159822860dedbf756f1f Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 23 Nov 2019 16:44:45 +0300 Subject: [PATCH 82/86] Compile fix --- src/yolo_layer.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/yolo_layer.c b/src/yolo_layer.c index 7a7ba6da56e..81d2de709ae 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -530,7 +530,7 @@ void backward_yolo_layer(const layer l, network_state state) // w,h: image width,height // netw,neth: network width,height // relative: 1 (all callers seems to pass TRUE) -void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative) +void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative, int letter) { int i; // network height (or width) @@ -540,13 +540,19 @@ void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth // Compute scale given image w,h vs network w,h // I think this "rotates" the image to match network to input image w/h ratio // new_h and new_w are really just network width and height - if (((float)netw / w) < ((float)neth / h)) { - new_w = netw; - new_h = (h * netw) / w; + if (letter) { + if (((float)netw / w) < ((float)neth / h)) { + new_w = netw; + new_h = (h * netw) / w; + } + else { + new_h = neth; + new_w = (w * neth) / h; + } } else { + new_w = netw; new_h = neth; - new_w = (w * neth) / h; } // difference between network width and "rotated" width float deltaw = netw - new_w; From 14212154d9340790634e0b5e664ae27802f45e38 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 23 Nov 2019 17:37:28 +0300 Subject: [PATCH 83/86] Minor compile fix and references to DIoU-darknet repo and paper --- src/box.c | 9 +++++++++ src/box.h | 2 ++ 2 files changed, 11 insertions(+) diff --git a/src/box.c b/src/box.c index e700dc7140c..0b3e9a92d14 100644 --- a/src/box.c +++ b/src/box.c @@ -177,6 +177,8 @@ float box_giou(box a, box b) return iou - giou_term; } +// https://github.com/Zzh-tju/DIoU-darknet +// https://arxiv.org/abs/1911.08287 float box_diou(box a, box b) { boxabs ba = box_c(a, b); @@ -215,6 +217,8 @@ float box_diounms(box a, box b, float beta1) return iou - diou_term; } +// https://github.com/Zzh-tju/DIoU-darknet +// https://arxiv.org/abs/1911.08287 float box_ciou(box a, box b) { boxabs ba = box_c(a, b); @@ -391,6 +395,9 @@ dxrep dx_box_iou(box pred, box truth, IOU_LOSS iou_loss) { p_dy = p_dt + p_db; p_dw = (p_dr - p_dl); //For dw and dh, we do not divided by 2. 
p_dh = (p_db - p_dt); + + // https://github.com/Zzh-tju/DIoU-darknet + // https://arxiv.org/abs/1911.08287 if (iou_loss == DIOU) { if (C > 0) { p_dx += (2*(truth.x-pred.x)*C-(2*Cw*dCw_dx+2*Ch*dCh_dx)*S) / (C * C); @@ -843,6 +850,8 @@ void do_nms(box *boxes, float **probs, int total, int classes, float thresh) } } +// https://github.com/Zzh-tju/DIoU-darknet +// https://arxiv.org/abs/1911.08287 void diounms_sort(detection *dets, int total, int classes, float thresh, NMS_KIND nms_kind, float beta1) { int i, j, k; diff --git a/src/box.h b/src/box.h index c7f0fb4394a..608a4c5869d 100644 --- a/src/box.h +++ b/src/box.h @@ -36,6 +36,8 @@ float box_iou(box a, box b); float box_rmse(box a, box b); dxrep dx_box_iou(box a, box b, IOU_LOSS iou_loss); float box_giou(box a, box b); +float box_diou(box a, box b); +float box_ciou(box a, box b); dbox diou(box a, box b); boxabs to_tblr(box a); void do_nms(box *boxes, float **probs, int total, int classes, float thresh); From 8cb3ee4e7956efbb4f858469256c9b139c511ab2 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 23 Nov 2019 18:59:29 +0300 Subject: [PATCH 84/86] diounms_sort() fixed --- include/darknet.h | 5 +++-- src/box.c | 32 +++++++++++++++++++++++++++++--- src/gaussian_yolo_layer.c | 3 +++ src/parser.c | 1 + 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/include/darknet.h b/include/darknet.h index 851b65421c8..20f87475b3d 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -112,12 +112,12 @@ typedef enum { // parser.h typedef enum { - DEFAULT_NMS, GREEDY_NMS, DIOU_NMS + DEFAULT_NMS, GREEDY_NMS, DIOU_NMS, CORNERS_NMS } NMS_KIND; // parser.h typedef enum { - YOLO_CENTER, YOLO_LEFT_TOP, YOLO_RIGHT_BOTTOM + YOLO_CENTER = 1 << 0, YOLO_LEFT_TOP = 1 << 1, YOLO_RIGHT_BOTTOM = 1 << 2 } YOLO_POINT; @@ -748,6 +748,7 @@ typedef struct detection{ float objectness; int sort_class; float *uc; // Gaussian_YOLOv3 - tx,ty,tw,th uncertainty + int points; // bit-0 - center, bit-1 - top-left-corner, bit-2 - bottom-right-corner } detection; // matrix.h diff --git a/src/box.c b/src/box.c index 0b3e9a92d14..5e39a2548fd 100644 --- a/src/box.c +++ b/src/box.c @@ -871,13 +871,36 @@ void diounms_sort(detection *dets, int total, int classes, float thresh, NMS_KIN for (i = 0; i < total; ++i) { dets[i].sort_class = k; } - qsort(dets, total, sizeof(detection), nms_comparator); - for (i = 0; i < total; ++i) { + qsort(dets, total, sizeof(detection), nms_comparator_v3); + for (i = 0; i < total; ++i) + { if (dets[i].prob[k] == 0) continue; box a = dets[i].bbox; for (j = i + 1; j < total; ++j) { box b = dets[j].bbox; - if (box_iou(a, b) > thresh && nms_kind == GREEDY_NMS) { + if (box_iou(a, b) > thresh && nms_kind == CORNERS_NMS) + { + float sum_prob = pow(dets[i].prob[k], 2) + pow(dets[j].prob[k], 2); + float alpha_prob = pow(dets[i].prob[k], 2) / sum_prob; + float beta_prob = pow(dets[j].prob[k], 2) / sum_prob; + //dets[i].bbox.x = (dets[i].bbox.x*alpha_prob + dets[j].bbox.x*beta_prob); + //dets[i].bbox.y = (dets[i].bbox.y*alpha_prob + dets[j].bbox.y*beta_prob); + //dets[i].bbox.w = (dets[i].bbox.w*alpha_prob + dets[j].bbox.w*beta_prob); + //dets[i].bbox.h = (dets[i].bbox.h*alpha_prob + dets[j].bbox.h*beta_prob); + /* + if (dets[j].points == YOLO_CENTER && (dets[i].points & dets[j].points) == 0) { + dets[i].bbox.x = (dets[i].bbox.x*alpha_prob + dets[j].bbox.x*beta_prob); + dets[i].bbox.y = (dets[i].bbox.y*alpha_prob + dets[j].bbox.y*beta_prob); + } + else if ((dets[i].points & dets[j].points) == 0) { + dets[i].bbox.w = (dets[i].bbox.w*alpha_prob + 
From 8cb3ee4e7956efbb4f858469256c9b139c511ab2 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Sat, 23 Nov 2019 18:59:29 +0300
Subject: [PATCH 84/86] diounms_sort() fixed

---
 include/darknet.h         |  5 +++--
 src/box.c                 | 32 +++++++++++++++++++++++++++++---
 src/gaussian_yolo_layer.c |  3 +++
 src/parser.c              |  1 +
 4 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/include/darknet.h b/include/darknet.h
index 851b65421c8..20f87475b3d 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -112,12 +112,12 @@ typedef enum {

 // parser.h
 typedef enum {
-    DEFAULT_NMS, GREEDY_NMS, DIOU_NMS
+    DEFAULT_NMS, GREEDY_NMS, DIOU_NMS, CORNERS_NMS
 } NMS_KIND;

 // parser.h
 typedef enum {
-    YOLO_CENTER, YOLO_LEFT_TOP, YOLO_RIGHT_BOTTOM
+    YOLO_CENTER = 1 << 0, YOLO_LEFT_TOP = 1 << 1, YOLO_RIGHT_BOTTOM = 1 << 2
 } YOLO_POINT;


@@ -748,6 +748,7 @@ typedef struct detection{
     float objectness;
     int sort_class;
     float *uc; // Gaussian_YOLOv3 - tx,ty,tw,th uncertainty
+    int points; // bit-0 - center, bit-1 - top-left-corner, bit-2 - bottom-right-corner
 } detection;

 // matrix.h
diff --git a/src/box.c b/src/box.c
index 0b3e9a92d14..5e39a2548fd 100644
--- a/src/box.c
+++ b/src/box.c
@@ -871,13 +871,36 @@ void diounms_sort(detection *dets, int total, int classes, float thresh, NMS_KIN
     for (i = 0; i < total; ++i) {
         dets[i].sort_class = k;
     }
-    qsort(dets, total, sizeof(detection), nms_comparator);
-    for (i = 0; i < total; ++i) {
+    qsort(dets, total, sizeof(detection), nms_comparator_v3);
+    for (i = 0; i < total; ++i)
+    {
         if (dets[i].prob[k] == 0) continue;
         box a = dets[i].bbox;
         for (j = i + 1; j < total; ++j) {
             box b = dets[j].bbox;
-            if (box_iou(a, b) > thresh && nms_kind == GREEDY_NMS) {
+            if (box_iou(a, b) > thresh && nms_kind == CORNERS_NMS)
+            {
+                float sum_prob = pow(dets[i].prob[k], 2) + pow(dets[j].prob[k], 2);
+                float alpha_prob = pow(dets[i].prob[k], 2) / sum_prob;
+                float beta_prob = pow(dets[j].prob[k], 2) / sum_prob;
+                //dets[i].bbox.x = (dets[i].bbox.x*alpha_prob + dets[j].bbox.x*beta_prob);
+                //dets[i].bbox.y = (dets[i].bbox.y*alpha_prob + dets[j].bbox.y*beta_prob);
+                //dets[i].bbox.w = (dets[i].bbox.w*alpha_prob + dets[j].bbox.w*beta_prob);
+                //dets[i].bbox.h = (dets[i].bbox.h*alpha_prob + dets[j].bbox.h*beta_prob);
+                /*
+                if (dets[j].points == YOLO_CENTER && (dets[i].points & dets[j].points) == 0) {
+                    dets[i].bbox.x = (dets[i].bbox.x*alpha_prob + dets[j].bbox.x*beta_prob);
+                    dets[i].bbox.y = (dets[i].bbox.y*alpha_prob + dets[j].bbox.y*beta_prob);
+                }
+                else if ((dets[i].points & dets[j].points) == 0) {
+                    dets[i].bbox.w = (dets[i].bbox.w*alpha_prob + dets[j].bbox.w*beta_prob);
+                    dets[i].bbox.h = (dets[i].bbox.h*alpha_prob + dets[j].bbox.h*beta_prob);
+                }
+                dets[i].points |= dets[j].points;
+                */
+                dets[j].prob[k] = 0;
+            }
+            else if (box_iou(a, b) > thresh && nms_kind == GREEDY_NMS) {
                 dets[j].prob[k] = 0;
             }
             else {
@@ -886,6 +909,9 @@ void diounms_sort(detection *dets, int total, int classes, float thresh, NMS_KIN
                 }
             }
         }
+
+        //if ((nms_kind == CORNERS_NMS) && (dets[i].points != (YOLO_CENTER | YOLO_LEFT_TOP | YOLO_RIGHT_BOTTOM)))
+        //    dets[i].prob[k] = 0;
     }
 }
 }
diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c
index ae3ee648140..2e663651b0b 100644
--- a/src/gaussian_yolo_layer.c
+++ b/src/gaussian_yolo_layer.c
@@ -742,6 +742,9 @@ int get_gaussian_yolo_detections(layer l, int w, int h, int netw, int neth, floa
             dets[count].uc[2] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 5)]; // tw uncertainty
             dets[count].uc[3] = predictions[entry_gaussian_index(l, 0, n*l.w*l.h + i, 7)]; // th uncertainty

+            dets[count].points = l.yolo_point;
+            //if (l.yolo_point != YOLO_CENTER) dets[count].objectness = objectness = 0;
+
             for (j = 0; j < l.classes; ++j) {
                 int class_index = entry_gaussian_index(l, 0, n*l.w*l.h + i, 9 + j);
                 float uc_aver = (dets[count].uc[0] + dets[count].uc[1] + dets[count].uc[2] + dets[count].uc[3]) / 4.0;
diff --git a/src/parser.c b/src/parser.c
index e4e970695f0..f7086060483 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -474,6 +474,7 @@ layer parse_gaussian_yolo(list *options, size_params params) // Gaussian_YOLOv3
     else {
         if (strcmp(nms_kind, "greedynms") == 0) l.nms_kind = GREEDY_NMS;
         else if (strcmp(nms_kind, "diounms") == 0) l.nms_kind = DIOU_NMS;
+        else if (strcmp(nms_kind, "cornersnms") == 0) l.nms_kind = CORNERS_NMS;
         else l.nms_kind = DEFAULT_NMS;
         printf("nms_kind: %s (%d), beta = %f \n", nms_kind, l.nms_kind, l.beta_nms);
     }
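PATCH 84's `diounms_sort()` keeps darknet's usual greedy convention: detections are sorted per class by probability, and every lower-ranked box whose IoU with a kept box exceeds `thresh` has its class probability zeroed rather than being removed from the array (the new CORNERS_NMS branch currently suppresses the same way, with the probability-weighted box merge left commented out). A minimal sketch of that loop with toy types (`det_t` and `greedy_nms` are illustrative names, not the repository's):

```c
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { float x, y, w, h, prob; } det_t;

/* plain IoU between two center-format boxes */
static float iou(det_t a, det_t b)
{
    float l = fmaxf(a.x - a.w / 2, b.x - b.w / 2);
    float r = fminf(a.x + a.w / 2, b.x + b.w / 2);
    float t = fmaxf(a.y - a.h / 2, b.y - b.h / 2);
    float bo = fminf(a.y + a.h / 2, b.y + b.h / 2);
    float inter = fmaxf(0, r - l) * fmaxf(0, bo - t);
    float uni = a.w * a.h + b.w * b.h - inter;
    return uni > 0 ? inter / uni : 0;
}

/* sort by probability, highest first */
static int by_prob_desc(const void *pa, const void *pb)
{
    float d = ((const det_t *)pb)->prob - ((const det_t *)pa)->prob;
    return (d > 0) - (d < 0);
}

/* greedy NMS: zero the probability of suppressed boxes, as the patch does */
static void greedy_nms(det_t *dets, int n, float thresh)
{
    qsort(dets, n, sizeof(det_t), by_prob_desc);
    for (int i = 0; i < n; ++i) {
        if (dets[i].prob == 0) continue;
        for (int j = i + 1; j < n; ++j)
            if (iou(dets[i], dets[j]) > thresh) dets[j].prob = 0;
    }
}

int main(void)
{
    det_t d[3] = { {0.5f, 0.5f, 0.4f, 0.4f, 0.9f},
                   {0.52f, 0.5f, 0.4f, 0.4f, 0.6f},
                   {0.1f, 0.1f, 0.2f, 0.2f, 0.8f} };
    greedy_nms(d, 3, 0.45f);
    for (int i = 0; i < 3; ++i) printf("prob[%d] = %.2f\n", i, d[i].prob);
    return 0;
}
```

Zeroing the probability instead of compacting the array keeps indices into `dets` stable for callers, which is why the patch writes `dets[j].prob[k] = 0` in both suppression branches.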
From 61f8f569b765f12caf1801745180024e0a0903ac Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Sat, 23 Nov 2019 19:45:14 +0300
Subject: [PATCH 85/86] CIOU and DIOU fix

---
 src/gaussian_yolo_layer.c | 37 +++++++++++++++++++------------------
 src/yolo_layer.c          | 14 ++++++++++----
 2 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c
index 2e663651b0b..0fe8a5e16f5 100644
--- a/src/gaussian_yolo_layer.c
+++ b/src/gaussian_yolo_layer.c
@@ -134,21 +134,18 @@ box get_gaussian_yolo_box(float *x, float *biases, int n, int index, int i, int
 {
     box b;

+    b.w = exp(x[index + 4 * stride]) * biases[2 * n] / w;
+    b.h = exp(x[index + 6 * stride]) * biases[2 * n + 1] / h;
+
     if (yolo_point == YOLO_CENTER) {
-        b.w = exp(x[index + 4 * stride]) * biases[2 * n] / w;
-        b.h = exp(x[index + 6 * stride]) * biases[2 * n + 1] / h;
         b.x = (i + x[index + 0 * stride]) / lw;
         b.y = (j + x[index + 2 * stride]) / lh;
     }
     else if (yolo_point == YOLO_LEFT_TOP) {
-        b.w = exp(x[index + 4 * stride]) * biases[2 * n] / w;
-        b.h = exp(x[index + 6 * stride]) * biases[2 * n + 1] / h;
-        b.x = (i + x[index + 0 * stride]) / lw + b.w/2;
-        b.y = (j + x[index + 2 * stride]) / lh + b.h/2;
+        b.x = (i + x[index + 0 * stride]) / lw + b.w / 2;
+        b.y = (j + x[index + 2 * stride]) / lh + b.h / 2;
     }
     else if (yolo_point == YOLO_RIGHT_BOTTOM) {
-        b.w = exp(x[index + 4 * stride]) * biases[2 * n] / w;
-        b.h = exp(x[index + 6 * stride]) * biases[2 * n + 1] / h;
         b.x = (i + x[index + 0 * stride]) / lw - b.w / 2;
         b.y = (j + x[index + 2 * stride]) / lh - b.h / 2;
     }
@@ -185,7 +182,6 @@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind
     if (yolo_point == YOLO_CENTER) {
         tx = (truth.x*lw - i);
         ty = (truth.y*lh - j);
-
     }
     else if (yolo_point == YOLO_LEFT_TOP) {
         tx = ((truth.x - truth.w / 2)*lw - i);
@@ -251,28 +247,33 @@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind
     // https://github.com/generalized-iou/g-darknet
     // https://arxiv.org/abs/1902.09630v2
     // https://giou.stanford.edu/
+    // https://arxiv.org/abs/1911.08287v1
+    // https://github.com/Zzh-tju/DIoU-darknet
     all_ious.dx_iou = dx_box_iou(pred, truth, iou_loss);

-    float dx, dy;
+    float dx, dy, dw, dh;
+
+    dx = all_ious.dx_iou.dt;
+    dy = all_ious.dx_iou.db;
+    dw = all_ious.dx_iou.dl;
+    dh = all_ious.dx_iou.dr;

     if (yolo_point == YOLO_CENTER) {
-        dx = (all_ious.dx_iou.dl + all_ious.dx_iou.dr);
-        dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db);
     }
     else if (yolo_point == YOLO_LEFT_TOP) {
-        dx = all_ious.dx_iou.dl;
-        dy = all_ious.dx_iou.dt;
+        dx = dx - dw/2;
+        dy = dy - dh/2;
     }
     else if (yolo_point == YOLO_RIGHT_BOTTOM) {
-        dx = all_ious.dx_iou.dr;
-        dy = all_ious.dx_iou.db;
+        dx = dx + dw / 2;
+        dy = dy + dh / 2;
     }

     // jacobian^t (transpose)
     //float dx = (all_ious.dx_iou.dl + all_ious.dx_iou.dr);
     //float dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db);
-    float dw = ((-0.5 * all_ious.dx_iou.dl) + (0.5 * all_ious.dx_iou.dr));
-    float dh = ((-0.5 * all_ious.dx_iou.dt) + (0.5 * all_ious.dx_iou.db));
+    //float dw = ((-0.5 * all_ious.dx_iou.dl) + (0.5 * all_ious.dx_iou.dr));
+    //float dh = ((-0.5 * all_ious.dx_iou.dt) + (0.5 * all_ious.dx_iou.db));

     // predict exponential, apply gradient of e^delta_t ONLY for w,h
     dw *= exp(x[index + 4 * stride]);
diff --git a/src/yolo_layer.c b/src/yolo_layer.c
index 81d2de709ae..40e73878a75 100644
--- a/src/yolo_layer.c
+++ b/src/yolo_layer.c
@@ -162,10 +162,16 @@ ious delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i,
     all_ious.dx_iou = dx_box_iou(pred, truth, iou_loss);

     // jacobian^t (transpose)
-    float dx = (all_ious.dx_iou.dl + all_ious.dx_iou.dr);
-    float dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db);
-    float dw = ((-0.5 * all_ious.dx_iou.dl) + (0.5 * all_ious.dx_iou.dr));
-    float dh = ((-0.5 * all_ious.dx_iou.dt) + (0.5 * all_ious.dx_iou.db));
+    //float dx = (all_ious.dx_iou.dl + all_ious.dx_iou.dr);
+    //float dy = (all_ious.dx_iou.dt + all_ious.dx_iou.db);
+    //float dw = ((-0.5 * all_ious.dx_iou.dl) + (0.5 * all_ious.dx_iou.dr));
+    //float dh = ((-0.5 * all_ious.dx_iou.dt) + (0.5 * all_ious.dx_iou.db));
+
+    // jacobian^t (transpose)
+    float dx = all_ious.dx_iou.dt;
+    float dy = all_ious.dx_iou.db;
+    float dw = all_ious.dx_iou.dl;
+    float dh = all_ious.dx_iou.dr;

     // predict exponential, apply gradient of e^delta_t ONLY for w,h
     dw *= exp(x[index + 2 * stride]);
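The commented-out "jacobian" lines in PATCH 85 encode a simple chain rule: with box edges l = x - w/2 and r = x + w/2, a gradient taken with respect to the edges converts to the center format as dL/dx = dL/dl + dL/dr and dL/dw = -0.5*dL/dl + 0.5*dL/dr (and likewise t/b against y/h). The patch instead assigns the four `dxrep` fields straight to dx/dy/dw/dh and applies the per-`yolo_point` half-size shifts afterwards, which suggests `dx_box_iou()` now fills those fields with (x, y, w, h) gradients despite the t/b/l/r names. A toy numerical check of the old edge-to-center mapping (`toy_loss` is an arbitrary illustrative function, not an IoU):

```c
#include <stdio.h>

/* Hypothetical toy loss over the box edges: L(l, r) = l^2 + 2r,
 * written as a function of the center-format parameters (x, w). */
static float toy_loss(float x, float w)
{
    float l = x - w / 2, r = x + w / 2;
    return l * l + 2 * r;
}

int main(void)
{
    float x = 1.0f, w = 0.5f, eps = 1e-3f;
    float l = x - w / 2;
    float dLdl = 2 * l, dLdr = 2;            /* analytic edge gradients   */
    float dLdx = dLdl + dLdr;                /* chain rule, center x      */
    float dLdw = -0.5f * dLdl + 0.5f * dLdr; /* chain rule, width         */
    /* central finite differences on the center-format parameters */
    float num_dx = (toy_loss(x + eps, w) - toy_loss(x - eps, w)) / (2 * eps);
    float num_dw = (toy_loss(x, w + eps) - toy_loss(x, w - eps)) / (2 * eps);
    printf("dL/dx: analytic %.4f vs numeric %.4f\n", dLdx, num_dx);
    printf("dL/dw: analytic %.4f vs numeric %.4f\n", dLdw, num_dw);
    return 0;
}
```

Both pairs agree (3.5 and 0.25 for these values), confirming the mapping the old code used; the same check applied to the new direct assignment would only pass if the gradient source already works in center coordinates.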
From 3abbd858084c9f3634a30307f36a0d23303796b6 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Sat, 23 Nov 2019 20:00:35 +0300
Subject: [PATCH 86/86] fixed consistency between darknet and python code

---
 build/darknet/x64/darknet.py | 3 ++-
 darknet.py                   | 3 ++-
 src/gaussian_yolo_layer.c    | 8 ++++----
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/build/darknet/x64/darknet.py b/build/darknet/x64/darknet.py
index 5cfb26ba16e..55afcb0ea42 100644
--- a/build/darknet/x64/darknet.py
+++ b/build/darknet/x64/darknet.py
@@ -60,7 +60,8 @@ class DETECTION(Structure):
                 ("mask", POINTER(c_float)),
                 ("objectness", c_float),
                 ("sort_class", c_int),
-                ("uc", POINTER(c_float))]
+                ("uc", POINTER(c_float)),
+                ("points", c_int)]


 class IMAGE(Structure):
diff --git a/darknet.py b/darknet.py
index 5cfb26ba16e..55afcb0ea42 100644
--- a/darknet.py
+++ b/darknet.py
@@ -60,7 +60,8 @@ class DETECTION(Structure):
                 ("mask", POINTER(c_float)),
                 ("objectness", c_float),
                 ("sort_class", c_int),
-                ("uc", POINTER(c_float))]
+                ("uc", POINTER(c_float)),
+                ("points", c_int)]


 class IMAGE(Structure):
diff --git a/src/gaussian_yolo_layer.c b/src/gaussian_yolo_layer.c
index 0fe8a5e16f5..d179ddd83d5 100644
--- a/src/gaussian_yolo_layer.c
+++ b/src/gaussian_yolo_layer.c
@@ -136,10 +136,10 @@ box get_gaussian_yolo_box(float *x, float *biases, int n, int index, int i, int
     b.w = exp(x[index + 4 * stride]) * biases[2 * n] / w;
     b.h = exp(x[index + 6 * stride]) * biases[2 * n + 1] / h;

+    b.x = (i + x[index + 0 * stride]) / lw;
+    b.y = (j + x[index + 2 * stride]) / lh;
     if (yolo_point == YOLO_CENTER) {
-        b.x = (i + x[index + 0 * stride]) / lw;
-        b.y = (j + x[index + 2 * stride]) / lh;
     }
     else if (yolo_point == YOLO_LEFT_TOP) {
         b.x = (i + x[index + 0 * stride]) / lw + b.w / 2;
@@ -176,12 +176,12 @@ float delta_gaussian_yolo_box(box truth, float *x, float *biases, int n, int ind

     float tx, ty, tw, th;

+    tx = (truth.x*lw - i);
+    ty = (truth.y*lh - j);
     tw = log(truth.w*w / biases[2 * n]);
     th = log(truth.h*h / biases[2 * n + 1]);

     if (yolo_point == YOLO_CENTER) {
-        tx = (truth.x*lw - i);
-        ty = (truth.y*lh - j);
     }
     else if (yolo_point == YOLO_LEFT_TOP) {
         tx = ((truth.x - truth.w / 2)*lw - i);