From c8746e8f41608cf4fc3e9a4f8f76bfd2195e3613 Mon Sep 17 00:00:00 2001
From: Andriy <andre.ixar@gmail.com>
Date: Mon, 20 May 2024 17:18:20 -0400
Subject: [PATCH 1/2] (1) fixed Gabor for CPU, (2) accelerated Gabor for GPGPU

---
 src/nyx/feature_mgr.h              |   4 +
 src/nyx/feature_mgr_init.cpp       |   4 +
 src/nyx/features/gabor.cpp         | 182 ++++++++++++++++++++---------
 src/nyx/features/gabor.h           |  18 ++-
 src/nyx/features/gabor_nontriv.cpp |   2 +-
 src/nyx/gpu/gabor.cu               | 109 ++++++++++++-----
 src/nyx/gpu/gabor.cuh              |  12 +-
 src/nyx/main_nyxus.cpp             |  11 +-
 8 files changed, 249 insertions(+), 93 deletions(-)
diff --git a/src/nyx/feature_mgr.h b/src/nyx/feature_mgr.h
index 6a8330bf..6c7a3b89 100644
--- a/src/nyx/feature_mgr.h
+++ b/src/nyx/feature_mgr.h
@@ -18,6 +18,10 @@ class FeatureManager
 
 	void apply_user_selection();
 
+	// Initializes feature classes 
+	// (allocates lookup tables, precalculates filter banks, etc.)
+	bool init_feature_classes();
+
 	// After compiling, returns the number of user-requested features
 	int get_num_requested_features();
 
diff --git a/src/nyx/feature_mgr_init.cpp b/src/nyx/feature_mgr_init.cpp
index f91aa7f1..63fafa53 100644
--- a/src/nyx/feature_mgr_init.cpp
+++ b/src/nyx/feature_mgr_init.cpp
@@ -63,4 +63,8 @@ FeatureManager::FeatureManager()
 	register_feature(new PixelIntensityFeatures_3D());
 }
 
+bool FeatureManager::init_feature_classes()
+{
+	return GaborFeature::init_class();
+}
 
diff --git a/src/nyx/features/gabor.cpp b/src/nyx/features/gabor.cpp
index 1da1fd69..cb4de77d 100644
--- a/src/nyx/features/gabor.cpp
+++ b/src/nyx/features/gabor.cpp
@@ -4,7 +4,10 @@
 
 using namespace Nyxus;
 
+//
 // Static members changeable by user via class 'Environment'
+//
+
 double GaborFeature::gamma = 0.1;           
 double GaborFeature::sig2lam = 0.8;         
 int GaborFeature::n = 16;                    
@@ -18,6 +21,17 @@ std::vector<std::pair<double, double>> GaborFeature::f0_theta_pairs
     {M_PI_4*3.0, 64.0} 
 };
 
+#ifdef USE_GPU
+    int GaborFeature::n_bank_filters = -1;
+    std::vector<std::vector<double>> GaborFeature::filterbank;
+    std::vector<double> GaborFeature::ho_filterbank;
+    double* GaborFeature::dev_filterbank = nullptr;
+#endif
+
+//
+//
+//
+
 bool GaborFeature::required(const FeatureSet& fs) 
 { 
     return fs.isEnabled (Feature2D::GABOR); 
@@ -201,7 +215,7 @@ void GaborFeature::calculate_gpu_multi_filter (LR& r)
     int step_size = ceil(cufft_size / CUFFT_MAX_SIZE);
     if(step_size == 0) 
         step_size = 1;
-    int num_filters = ceil(8/step_size);
+    int num_filters = ceil(nFreqs/step_size) + 1;
 
     // Temp buffers
     // 
@@ -232,61 +246,44 @@ void GaborFeature::calculate_gpu_multi_filter (LR& r)
         thetas.push_back(ft.second);
     }
 
-    for(int i = 0; i < 8; i += num_filters)
-    {
-        // Compute the baseline score before applying high-pass Gabor filters
-        std::vector<std::vector<PixIntens>> e2_pix_plane_vec (num_filters, std::vector<PixIntens>(Im0.width * Im0.height));
-
-        std::vector<double> f(num_filters);
-
-        for(int j = i; j < i + num_filters; j++) 
-        {
-            if(j >= 8) 
-                break;
-            f[j-i] = freqs[j]; 
-        }
+    // Compute the baseline score before applying high-pass Gabor filters
+    std::vector<std::vector<PixIntens>> responses (num_filters, std::vector<PixIntens>(Im0.width * Im0.height));
 
-        // Calculate low-passed baseline and high-passed filter responses
-        GaborEnergyGPUMultiFilter (Im0, e2_pix_plane_vec, auxC.data(), auxG.data(), f, sig2lam, gamma, thetas, n, num_filters);
+    // Calculate low-passed baseline and high-passed filter responses
+    GaborEnergyGPUMultiFilter (Im0, responses, auxC.data(), auxG.data(), freqs, sig2lam, gamma, thetas, n, freqs.size());
 
-        // Examine the baseline signal
-        if (i == 0) 
-        {
-            std::vector<PixIntens>& pix_plane = e2_pix_plane_vec[0];
+    // Examine the baseline signal
+    std::vector<PixIntens>& pix_plane = responses[0];
 
-            PixIntens* e2img_ptr = e2img.writable_data_ptr();
+    PixIntens* e2img_ptr = e2img.writable_data_ptr();
 
-            for(int k = 0; k < Im0.height * Im0.width; ++k)
-            {
-                e2img_ptr[k] = pix_plane[k];
-            } 
+    for (int k = 0; k < Im0.height * Im0.width; ++k)
+        e2img_ptr[k] = pix_plane[k];
             
-            // Values that we need for scoring filter responses
-            Moments2 local_stats;
-            e2img.GetStats(local_stats);
-            maxval = local_stats.max__();
-            double cmpval = local_stats.min__();
-
-            // Score the baseline signal    
-            for (auto a : pix_plane)
-                if (double(a) > cmpval)
-                    baselineScore++;
-        }
+    // Values that we need for scoring filter responses
+    Moments2 local_stats;
+    e2img.GetStats(local_stats);
+    maxval = local_stats.max__();
+    double cmpval = local_stats.min__();
+
+    // Score the baseline signal    
+    for (auto a : pix_plane)
+        if (double(a) > cmpval)
+            baselineScore++;
         
-        // Iterate high-pass filter response signals and score them 
-        for (int i=0; i < nFreqs; i++)
-        {
-            std::vector<PixIntens>& e2_pix_plane_temp = e2_pix_plane_vec[i+1];
+    // Iterate high-pass filter response signals and score them 
+    for (int j=1; j< freqs.size(); j++)
+    {
+        std::vector<PixIntens>& e2_pix_plane_temp = responses [j];
 
-            // score it
-            unsigned long afterGaborScore = 0;
-            for (auto a : e2_pix_plane_temp)
-                if (double(a) / maxval > GRAYthr)
-                    afterGaborScore++;
+        // score it
+        unsigned long afterGaborScore = 0;
+        for (auto a : e2_pix_plane_temp)
+            if (double(a) / maxval > GRAYthr)
+                afterGaborScore++;
 
-            // save the score as feature value
-            fvals[i] = (double)afterGaborScore / (double)baselineScore;
-        }
+        // save the score as feature value
+        fvals[j-1] = (double)afterGaborScore / (double)baselineScore;
     }
 }
 #endif
@@ -372,7 +369,7 @@ void GaborFeature::conv_dud(
 }
 
 // Creates a normalized Gabor filter
-void GaborFeature::Gabor (double* Gex, double f0, double sig2lam, double gamma, double theta, double fi, int n)
+void GaborFeature::Gabor (double* Gex, double f0, double sig2lam, double gamma, double theta, double fi, int n, std::vector<double> & tx, std::vector<double> & ty)
 {
     double lambda = 2 * M_PI / f0,
         cos_theta = cos(theta), 
@@ -445,7 +442,7 @@ void GaborFeature::GaborEnergy (
     int n_gab = n;
 
     double fi = 0;
-    Gabor (Gexp, f0, sig2lam, gamma, theta, fi, n_gab);
+    Gabor (Gexp, f0, sig2lam, gamma, theta, fi, n_gab, tx, ty);
 
     readOnlyPixels pix_plane = Im.ReadablePixels();
 
@@ -506,7 +503,7 @@ void GaborFeature::GaborEnergyGPU (
     int n_gab = n;
 
     double fi = 0;
-    Gabor (Gexp, f0, sig2lam, gamma, theta, fi, n_gab);
+    Gabor (Gexp, f0, sig2lam, gamma, theta, fi, n_gab, tx, ty);
 
     readOnlyPixels pix_plane = Im.ReadablePixels();
 
@@ -550,6 +547,9 @@ void GaborFeature::GaborEnergyGPUMultiFilter (
 {
     int n_gab = n;
 
+    readOnlyPixels pix_plane = Im.ReadablePixels();
+
+#if 0   // ---now via cache---
     std::vector<double> g_filters;
     g_filters.resize(2 * n * n * num_filters);
 
@@ -558,7 +558,7 @@ void GaborFeature::GaborEnergyGPUMultiFilter (
     int idx = 0;
     for(int i = 0; i < num_filters; ++i) 
     {   
-        Gabor (Gexp, f0s[i], sig2lam, gamma, thetas[i], fi, n_gab);
+        Gabor (Gexp, f0s[i], sig2lam, gamma, thetas[i], fi, n_gab, tx, ty);
         for(int i = 0; i < 2*n*n; ++i) {
             g_filters[idx+i] = Gexp[i];
         }
@@ -566,14 +566,28 @@ void GaborFeature::GaborEnergyGPUMultiFilter (
         idx += 2*n*n;
     }
 
-    readOnlyPixels pix_plane = Im.ReadablePixels();
-    
     bool success = CuGabor::conv_dud_gpu_fft_multi_filter (auxC, pix_plane.data(), g_filters.data(), Im.width, Im.height, n_gab, n_gab, num_filters);
     if(!success) {
         std::cerr << "Unable to calculate Gabor features on GPU \n";
     }
+#endif
 
-    for (int i = 0; i < num_filters; ++i){
+    //--- via cache ---
+    bool success = CuGabor::conv_dud_gpu_fft_multi_filter (
+        auxC, 
+        pix_plane.data(), 
+        GaborFeature::ho_filterbank.data(), 
+        Im.width, 
+        Im.height, 
+        n_gab, 
+        n_gab, 
+        num_filters,
+        GaborFeature::dev_filterbank);
+    if (!success) 
+        std::cerr << "\n\n\nUnable to calculate Gabor features on GPU \n\n\n";
+
+    for (int idx, i = 0; i < num_filters; ++i)
+    {
         idx = 2 * i * (Im.width + n - 1) * (Im.height + n - 1);
         decltype(Im.height) b = 0;
         
@@ -639,3 +653,61 @@ double GaborFeature::get_theta_in_degrees (int i)
     return theta / M_PI * 180;
 }
 
+#ifdef USE_GPU
+
+void GaborFeature::create_filterbank()
+{
+    // frequencies and angles
+    std::vector<double> freqs = { f0LP },
+        thetas = { M_PI_2 };    // the lowpass filter goes at compromise pi/2 theta
+    for (auto& ft : GaborFeature::f0_theta_pairs)
+    {
+        freqs.push_back(ft.first);
+        thetas.push_back(ft.second);
+    }
+
+    // allocate the conv kernels buffer
+    GaborFeature::n_bank_filters = (int)freqs.size();
+    GaborFeature::filterbank.resize(GaborFeature::n_bank_filters);
+    for (auto& f : GaborFeature::filterbank)
+        f.resize (2 * GaborFeature::n * GaborFeature::n * GaborFeature::n_bank_filters);
+
+    ho_filterbank.resize(2 * GaborFeature::n * GaborFeature::n * GaborFeature::n_bank_filters);
+
+    // temps
+    std::vector<double> temp_tx, temp_ty;
+    temp_tx.resize (GaborFeature::n + 1);
+    temp_ty.resize (GaborFeature::n + 1);
+    double fi = 0;  // offset
+
+    // filter bank as a single chunk of memory
+    int idx = 0;
+    for (int i = 0; i < GaborFeature::n_bank_filters; i++)
+    {
+        auto filterData = GaborFeature::filterbank[i];
+        Gabor (filterData.data(), freqs[i], GaborFeature::sig2lam, GaborFeature::gamma, thetas[i], fi, GaborFeature::n, temp_tx, temp_ty);
+
+        for (int i = 0; i < 2 * GaborFeature::n * GaborFeature::n; ++i)
+            GaborFeature::ho_filterbank [idx + i] = filterData[i];
+
+        idx += 2 * GaborFeature::n * GaborFeature::n;
+    }
+}
+
+bool GaborFeature::send_filterbank_2_gpuside()
+{
+    return CuGabor::send_filterbank_2_gpuside (&GaborFeature::dev_filterbank, GaborFeature::ho_filterbank.data(), GaborFeature::ho_filterbank.size());
+}
+
+#endif
+
+bool GaborFeature::init_class()
+{
+    #ifdef USE_GPU
+    GaborFeature::create_filterbank();
+    if (! GaborFeature::send_filterbank_2_gpuside())
+        return false;
+    #endif
+
+    return true;
+}
\ No newline at end of file
diff --git a/src/nyx/features/gabor.h b/src/nyx/features/gabor.h
index f72ee626..afbc0b26 100644
--- a/src/nyx/features/gabor.h
+++ b/src/nyx/features/gabor.h
@@ -62,6 +62,8 @@ class GaborFeature: public FeatureMethod
     // Returns the angle of i-th pair of 'f0_theta_pairs'
     static double get_theta_in_degrees (int i);
 
+    static bool init_class();
+
 private:
 
     // Result cache
@@ -73,14 +75,16 @@ class GaborFeature: public FeatureMethod
     void conv_dud (double* c, const unsigned int* a, double* b, int na, int ma, int nb, int mb);
 
     // Creates a non-normalized Gabor filter
-    void Gabor (
+    static void Gabor (
         double* Gex,    // buffer of size n*n*2
         double f0, 
         double sig2lam, 
         double gamma, 
         double theta, 
         double fi, 
-        int n);
+        int n, 
+        std::vector<double>& tx, 
+        std::vector<double>& ty);
 
     std::vector<double> tx, ty;
 
@@ -143,5 +147,15 @@ class GaborFeature: public FeatureMethod
         int na, int ma, int nb, int mb);
 
     void GetStats_NT (WriteImageMatrix_nontriv& I, Moments2& moments2);
+
+    // members referenced from init_class()
+    static void create_filterbank();
+    static int n_bank_filters;
+    static std::vector<std::vector<double>> filterbank;
+    #ifdef USE_GPU
+        static bool send_filterbank_2_gpuside();
+        static std::vector<double> ho_filterbank;
+        static double* dev_filterbank;
+    #endif
 };
 
diff --git a/src/nyx/features/gabor_nontriv.cpp b/src/nyx/features/gabor_nontriv.cpp
index d74329df..ffa19694 100644
--- a/src/nyx/features/gabor_nontriv.cpp
+++ b/src/nyx/features/gabor_nontriv.cpp
@@ -89,7 +89,7 @@ void GaborFeature::GaborEnergy_NT2 (
 
     // Prepare a filter
     double fi = 0;
-    Gabor (Gexp, f0, sig2lam, gamma, theta, fi, n);
+    Gabor (Gexp, f0, sig2lam, gamma, theta, fi, n, tx, ty);
 
     // Helpful temps
     auto width = Im.get_width(),
diff --git a/src/nyx/gpu/gabor.cu b/src/nyx/gpu/gabor.cu
index 16c974e1..0986cb19 100644
--- a/src/nyx/gpu/gabor.cu
+++ b/src/nyx/gpu/gabor.cu
@@ -78,10 +78,8 @@ namespace CuGabor {
     bool conv_dud_gpu_fft(double* out, 
                             const unsigned int* image, 
                             double* kernel, 
-                            int image_n, int image_m, int kernel_n, int kernel_m){
-
-        
-        
+                            int image_n, int image_m, int kernel_n, int kernel_m)
+    {
         int batch_size = 1;
 
         // calculate new size of image based on padding size
@@ -216,10 +214,11 @@ namespace CuGabor {
         return true;                    
     }
 
-     bool conv_dud_gpu_fft_multi_filter(double* out, 
+    bool conv_dud_gpu_fft_multi_filter(double* out, 
                             const unsigned int* image, 
                             double* kernel, 
-                            int image_n, int image_m, int kernel_n, int kernel_m, int batch_size){
+                            int image_n, int image_m, int kernel_n, int kernel_m, int batch_size,
+                            double* dev_filterbank){
         
         typedef double2 Complex; // comment out to use float
         typedef cufftDoubleComplex CuComplex; // comment out to use float
@@ -262,30 +261,6 @@ namespace CuGabor {
             }
         }
 
-        for (int batch = 0; batch < batch_size; ++batch) {
-
-            batch_idx = batch * size;
-            batch_idx2 = batch * kernel_m * kernel_n;
-
-            for (int i = 0; i < row_size; ++i) {
-                for (int j = 0; j < col_size; ++j) {
-
-                    index = batch_idx + (i*col_size + j);
-                    index2 = batch_idx2 + (i*kernel_n + j);
-
-                    if (i < kernel_m && j < kernel_n) {
-
-                        linear_kernel[index].x = kernel[2*index2];
-                        linear_kernel[index].y = kernel[2*index2+1];
-
-                    } else {
-                        linear_kernel[index].x = 0;
-                        linear_kernel[index].y = 0;
-                    }
-                }
-            }
-        }
-
         CuComplex* d_image;
         CuComplex* d_result;
         CuComplex* d_kernel;
@@ -305,8 +280,9 @@ namespace CuGabor {
         ok = cudaMemcpy(d_image, linear_image.data(), batch_size*size*sizeof(CuComplex), cudaMemcpyHostToDevice);
         CHECKERR(ok);
         
-        ok = cudaMemcpy(d_kernel, linear_kernel.data(), batch_size*size*sizeof(CuComplex), cudaMemcpyHostToDevice);
-        CHECKERR(ok);
+        bool ok2 = dense_2_padded_filterbank (d_kernel, dev_filterbank, image_m, image_n, kernel_m, kernel_n, batch_size);
+        if (!ok2)
+            return false;
 
         cufftHandle plan;
         cufftHandle plan_k;
@@ -368,5 +344,74 @@ namespace CuGabor {
 
         return true;                    
     }
+
+     bool send_filterbank_2_gpuside (double** dev_filterbank, const double* ho_filterbank, size_t filterbank_len_all_batches)
+     {
+         auto szb = sizeof(dev_filterbank[0]) * filterbank_len_all_batches;
+
+         auto ok = cudaMalloc ((void**)dev_filterbank, szb);
+         CHECKERR(ok);
+
+         ok = cudaMemcpy (*dev_filterbank, ho_filterbank, szb, cudaMemcpyHostToDevice);
+         CHECKERR(ok);
+
+         return true;
+     }
+
+     // X --> Y
+     __global__ void ker_dense_2_padded (cufftDoubleComplex* Y, double* X, int y_rowsize, int y_colsize, int x_rowsize, int x_colsize, int batchsize)
+     {
+         int c = threadIdx.x + blockIdx.x * blockDim.x;
+         int r = threadIdx.y + blockIdx.y * blockDim.y;
+         int batch = threadIdx.z + blockIdx.z * blockDim.z;
+
+         // dimensions of the padded image
+         int row_size = y_rowsize + x_rowsize - 1; 
+         int col_size = y_colsize + x_colsize - 1; 
+
+         if (c >= row_size || r >= col_size || batch >= batchsize)
+             return;
+
+         int size = row_size * col_size;
+         int batch_idx = batch * size;
+         int batch_idx2 = batch * x_rowsize * x_colsize;
+
+         int index = batch_idx + (c * col_size + r);
+         int index2 = batch_idx2 + (c * x_colsize + r);
+
+         if (c < x_rowsize && r < x_colsize)
+         {
+             Y[index].x = X[2 * index2];
+             Y[index].y = X[2 * index2 + 1];
+         }
+         else
+         {
+             Y[index].x = 0.f;
+             Y[index].y = 0.f;
+         }
+     }
+
+     // X --> Y
+     bool dense_2_padded_filterbank (cufftDoubleComplex* Y, double* X, int y_rowsize, int y_colsize, int x_rowsize, int x_colsize, int batchsize)
+     {
+         int rowsize = y_rowsize + x_rowsize - 1;
+         int colsize = y_colsize + x_colsize - 1;
+
+         int blo = 4;
+         dim3 tpb (blo, blo, blo);
+         dim3 bpg (ceil(rowsize / blo) + 1, ceil(colsize / blo) + 1, ceil(batchsize / blo) + 1);
+
+         ker_dense_2_padded <<< bpg, tpb >>> (Y, X, y_rowsize, y_colsize, x_rowsize, x_colsize, batchsize);
+
+         cudaError_t ok = cudaDeviceSynchronize();
+         CHECKERR(ok);
+
+         // Check if kernel execution generated and error
+         ok = cudaGetLastError();
+         CHECKERR(ok);
+
+         return true;
+     }
+
 }
 
diff --git a/src/nyx/gpu/gabor.cuh b/src/nyx/gpu/gabor.cuh
index b6b9913d..7add765a 100644
--- a/src/nyx/gpu/gabor.cuh
+++ b/src/nyx/gpu/gabor.cuh
@@ -81,5 +81,15 @@ namespace CuGabor{
     bool conv_dud_gpu_fft_multi_filter(double* out, 
                     const unsigned int* image, 
                     double* kernel, 
-                    int image_n, int image_m, int kernel_n, int kernel_m, int batch_size);
+                    int image_n, int image_m, int kernel_n, int kernel_m, int batch_size,
+                    double* dev_filterbank);
+    /**
+    * @brief Allocates device memory and sends there a host-side precalculated filter bank 'ho_filterbank' 
+    */
+    bool send_filterbank_2_gpuside (double** dev_filterbank, const double* ho_filterbank, size_t filterbank_len_all_batches);
+
+    /**
+    * @brief Copies device-side dense filter bank 'dev_Src' to preallocated 'dev_Dst' converting the dense to padded layout
+    */
+    bool dense_2_padded_filterbank (cufftDoubleComplex* dev_Dst, double* dev_Src, int y_rowsize, int y_colsize, int x_rowsize, int x_colsize, int batchsize);
 }
\ No newline at end of file
diff --git a/src/nyx/main_nyxus.cpp b/src/nyx/main_nyxus.cpp
index 4969e922..3ea01de4 100644
--- a/src/nyx/main_nyxus.cpp
+++ b/src/nyx/main_nyxus.cpp
@@ -57,14 +57,21 @@ int main (int argc, char** argv)
 	// Have the feature manager prepare the feature toolset reflecting user's selection
 	if (!theFeatureMgr.compile())
 	{
-		std::cout << "Error: compiling feature methods failed\n";
+		std::cerr << "Error: compiling feature methods failed\n";
 		return 1;
 	}
 	theFeatureMgr.apply_user_selection();
 
 	// Current time stamp #1
 	auto startTS = getTimeStr();
-	VERBOSLVL1(std::cout << "\n>>> STARTING >>> " << startTS << "\n";)
+	VERBOSLVL1 (std::cout << "\n>>> STARTING >>> " << startTS << "\n");
+
+	// Initialize feature classes 
+	if (! theFeatureMgr.init_feature_classes())
+	{
+		std::cerr << "Error: initializing feature classes failed\n";
+		return 1;
+	}
 		
 	if (theEnvironment.dim() == 2)
 	{

From bf8833fc10122aa09b39506652eb25e343d44e32 Mon Sep 17 00:00:00 2001
From: Andriy <andre.ixar@gmail.com>
Date: Tue, 21 May 2024 14:40:07 -0400
Subject: [PATCH 2/2] removed commented code per reviewer request

---
 src/nyx/features/gabor.cpp | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/src/nyx/features/gabor.cpp b/src/nyx/features/gabor.cpp
index cb4de77d..f473d45d 100644
--- a/src/nyx/features/gabor.cpp
+++ b/src/nyx/features/gabor.cpp
@@ -549,30 +549,6 @@ void GaborFeature::GaborEnergyGPUMultiFilter (
 
     readOnlyPixels pix_plane = Im.ReadablePixels();
 
-#if 0   // ---now via cache---
-    std::vector<double> g_filters;
-    g_filters.resize(2 * n * n * num_filters);
-
-    double fi = 0;
-
-    int idx = 0;
-    for(int i = 0; i < num_filters; ++i) 
-    {   
-        Gabor (Gexp, f0s[i], sig2lam, gamma, thetas[i], fi, n_gab, tx, ty);
-        for(int i = 0; i < 2*n*n; ++i) {
-            g_filters[idx+i] = Gexp[i];
-        }
-
-        idx += 2*n*n;
-    }
-
-    bool success = CuGabor::conv_dud_gpu_fft_multi_filter (auxC, pix_plane.data(), g_filters.data(), Im.width, Im.height, n_gab, n_gab, num_filters);
-    if(!success) {
-        std::cerr << "Unable to calculate Gabor features on GPU \n";
-    }
-#endif
-
-    //--- via cache ---
     bool success = CuGabor::conv_dud_gpu_fft_multi_filter (
         auxC, 
         pix_plane.data(), 
@@ -608,9 +584,7 @@ void GaborFeature::GaborEnergyGPUMultiFilter (
             }
             b++;
         }
-        
     }
-
 }
 #endif