From c8746e8f41608cf4fc3e9a4f8f76bfd2195e3613 Mon Sep 17 00:00:00 2001 From: Andriy Date: Mon, 20 May 2024 17:18:20 -0400 Subject: [PATCH 1/2] (1) fixed Gabor for CPU, (2) accelerated Gabor for GPGPU --- src/nyx/feature_mgr.h | 4 + src/nyx/feature_mgr_init.cpp | 4 + src/nyx/features/gabor.cpp | 182 ++++++++++++++++++++--------- src/nyx/features/gabor.h | 18 ++- src/nyx/features/gabor_nontriv.cpp | 2 +- src/nyx/gpu/gabor.cu | 109 ++++++++++++----- src/nyx/gpu/gabor.cuh | 12 +- src/nyx/main_nyxus.cpp | 11 +- 8 files changed, 249 insertions(+), 93 deletions(-) diff --git a/src/nyx/feature_mgr.h b/src/nyx/feature_mgr.h index 6a8330bf..6c7a3b89 100644 --- a/src/nyx/feature_mgr.h +++ b/src/nyx/feature_mgr.h @@ -18,6 +18,10 @@ class FeatureManager void apply_user_selection(); + // Initializes feature classes + // (allocates lookup tables, precalculates filter banks, etc.) + bool init_feature_classes(); + // After compiling, returns the number of user-requested features int get_num_requested_features(); diff --git a/src/nyx/feature_mgr_init.cpp b/src/nyx/feature_mgr_init.cpp index f91aa7f1..63fafa53 100644 --- a/src/nyx/feature_mgr_init.cpp +++ b/src/nyx/feature_mgr_init.cpp @@ -63,4 +63,8 @@ FeatureManager::FeatureManager() register_feature(new PixelIntensityFeatures_3D()); } +bool FeatureManager::init_feature_classes() +{ + return GaborFeature::init_class(); +} diff --git a/src/nyx/features/gabor.cpp b/src/nyx/features/gabor.cpp index 1da1fd69..cb4de77d 100644 --- a/src/nyx/features/gabor.cpp +++ b/src/nyx/features/gabor.cpp @@ -4,7 +4,10 @@ using namespace Nyxus; +// // Static members changeable by user via class 'Environment' +// + double GaborFeature::gamma = 0.1; double GaborFeature::sig2lam = 0.8; int GaborFeature::n = 16; @@ -18,6 +21,17 @@ std::vector> GaborFeature::f0_theta_pairs {M_PI_4*3.0, 64.0} }; +#ifdef USE_GPU + int GaborFeature::n_bank_filters = -1; + std::vector> GaborFeature::filterbank; + std::vector GaborFeature::ho_filterbank; + double* GaborFeature::dev_filterbank = nullptr; +#endif + +// +// +// + bool GaborFeature::required(const FeatureSet& fs) { return fs.isEnabled (Feature2D::GABOR); @@ -201,7 +215,7 @@ void GaborFeature::calculate_gpu_multi_filter (LR& r) int step_size = ceil(cufft_size / CUFFT_MAX_SIZE); if(step_size == 0) step_size = 1; - int num_filters = ceil(8/step_size); + int num_filters = ceil(nFreqs/step_size) + 1; // Temp buffers // @@ -232,61 +246,44 @@ void GaborFeature::calculate_gpu_multi_filter (LR& r) thetas.push_back(ft.second); } - for(int i = 0; i < 8; i += num_filters) - { - // Compute the baseline score before applying high-pass Gabor filters - std::vector> e2_pix_plane_vec (num_filters, std::vector(Im0.width * Im0.height)); - - std::vector f(num_filters); - - for(int j = i; j < i + num_filters; j++) - { - if(j >= 8) - break; - f[j-i] = freqs[j]; - } + // Compute the baseline score before applying high-pass Gabor filters + std::vector> responses (num_filters, std::vector(Im0.width * Im0.height)); - // Calculate low-passed baseline and high-passed filter responses - GaborEnergyGPUMultiFilter (Im0, e2_pix_plane_vec, auxC.data(), auxG.data(), f, sig2lam, gamma, thetas, n, num_filters); + // Calculate low-passed baseline and high-passed filter responses + GaborEnergyGPUMultiFilter (Im0, responses, auxC.data(), auxG.data(), freqs, sig2lam, gamma, thetas, n, freqs.size()); - // Examine the baseline signal - if (i == 0) - { - std::vector& pix_plane = e2_pix_plane_vec[0]; + // Examine the baseline signal + std::vector& pix_plane = responses[0]; - PixIntens* e2img_ptr = e2img.writable_data_ptr(); + PixIntens* e2img_ptr = e2img.writable_data_ptr(); - for(int k = 0; k < Im0.height * Im0.width; ++k) - { - e2img_ptr[k] = pix_plane[k]; - } + for (int k = 0; k < Im0.height * Im0.width; ++k) + e2img_ptr[k] = pix_plane[k]; - // Values that we need for scoring filter responses - Moments2 local_stats; - e2img.GetStats(local_stats); - maxval = local_stats.max__(); - double cmpval = local_stats.min__(); - - // Score the baseline signal - for (auto a : pix_plane) - if (double(a) > cmpval) - baselineScore++; - } + // Values that we need for scoring filter responses + Moments2 local_stats; + e2img.GetStats(local_stats); + maxval = local_stats.max__(); + double cmpval = local_stats.min__(); + + // Score the baseline signal + for (auto a : pix_plane) + if (double(a) > cmpval) + baselineScore++; - // Iterate high-pass filter response signals and score them - for (int i=0; i < nFreqs; i++) - { - std::vector& e2_pix_plane_temp = e2_pix_plane_vec[i+1]; + // Iterate high-pass filter response signals and score them + for (int j=1; j< freqs.size(); j++) + { + std::vector& e2_pix_plane_temp = responses [j]; - // score it - unsigned long afterGaborScore = 0; - for (auto a : e2_pix_plane_temp) - if (double(a) / maxval > GRAYthr) - afterGaborScore++; + // score it + unsigned long afterGaborScore = 0; + for (auto a : e2_pix_plane_temp) + if (double(a) / maxval > GRAYthr) + afterGaborScore++; - // save the score as feature value - fvals[i] = (double)afterGaborScore / (double)baselineScore; - } + // save the score as feature value + fvals[j-1] = (double)afterGaborScore / (double)baselineScore; } } #endif @@ -372,7 +369,7 @@ void GaborFeature::conv_dud( } // Creates a normalized Gabor filter -void GaborFeature::Gabor (double* Gex, double f0, double sig2lam, double gamma, double theta, double fi, int n) +void GaborFeature::Gabor (double* Gex, double f0, double sig2lam, double gamma, double theta, double fi, int n, std::vector & tx, std::vector & ty) { double lambda = 2 * M_PI / f0, cos_theta = cos(theta), @@ -445,7 +442,7 @@ void GaborFeature::GaborEnergy ( int n_gab = n; double fi = 0; - Gabor (Gexp, f0, sig2lam, gamma, theta, fi, n_gab); + Gabor (Gexp, f0, sig2lam, gamma, theta, fi, n_gab, tx, ty); readOnlyPixels pix_plane = Im.ReadablePixels(); @@ -506,7 +503,7 @@ void GaborFeature::GaborEnergyGPU ( int n_gab = n; double fi = 0; - Gabor (Gexp, f0, sig2lam, gamma, theta, fi, n_gab); + Gabor (Gexp, f0, sig2lam, gamma, theta, fi, n_gab, tx, ty); readOnlyPixels pix_plane = Im.ReadablePixels(); @@ -550,6 +547,9 @@ void GaborFeature::GaborEnergyGPUMultiFilter ( { int n_gab = n; + readOnlyPixels pix_plane = Im.ReadablePixels(); + +#if 0 // ---now via cache--- std::vector g_filters; g_filters.resize(2 * n * n * num_filters); @@ -558,7 +558,7 @@ void GaborFeature::GaborEnergyGPUMultiFilter ( int idx = 0; for(int i = 0; i < num_filters; ++i) { - Gabor (Gexp, f0s[i], sig2lam, gamma, thetas[i], fi, n_gab); + Gabor (Gexp, f0s[i], sig2lam, gamma, thetas[i], fi, n_gab, tx, ty); for(int i = 0; i < 2*n*n; ++i) { g_filters[idx+i] = Gexp[i]; } @@ -566,14 +566,28 @@ void GaborFeature::GaborEnergyGPUMultiFilter ( idx += 2*n*n; } - readOnlyPixels pix_plane = Im.ReadablePixels(); - bool success = CuGabor::conv_dud_gpu_fft_multi_filter (auxC, pix_plane.data(), g_filters.data(), Im.width, Im.height, n_gab, n_gab, num_filters); if(!success) { std::cerr << "Unable to calculate Gabor features on GPU \n"; } +#endif - for (int i = 0; i < num_filters; ++i){ + //--- via cache --- + bool success = CuGabor::conv_dud_gpu_fft_multi_filter ( + auxC, + pix_plane.data(), + GaborFeature::ho_filterbank.data(), + Im.width, + Im.height, + n_gab, + n_gab, + num_filters, + GaborFeature::dev_filterbank); + if (!success) + std::cerr << "\n\n\nUnable to calculate Gabor features on GPU \n\n\n"; + + for (int idx, i = 0; i < num_filters; ++i) + { idx = 2 * i * (Im.width + n - 1) * (Im.height + n - 1); decltype(Im.height) b = 0; @@ -639,3 +653,61 @@ double GaborFeature::get_theta_in_degrees (int i) return theta / M_PI * 180; } +#ifdef USE_GPU + +void GaborFeature::create_filterbank() +{ + // frequencies and angles + std::vector freqs = { f0LP }, + thetas = { M_PI_2 }; // the lowpass filter goes at compromise pi/2 theta + for (auto& ft : GaborFeature::f0_theta_pairs) + { + freqs.push_back(ft.first); + thetas.push_back(ft.second); + } + + // allocate the conv kernels buffer + GaborFeature::n_bank_filters = (int)freqs.size(); + GaborFeature::filterbank.resize(GaborFeature::n_bank_filters); + for (auto& f : GaborFeature::filterbank) + f.resize (2 * GaborFeature::n * GaborFeature::n * GaborFeature::n_bank_filters); + + ho_filterbank.resize(2 * GaborFeature::n * GaborFeature::n * GaborFeature::n_bank_filters); + + // temps + std::vector temp_tx, temp_ty; + temp_tx.resize (GaborFeature::n + 1); + temp_ty.resize (GaborFeature::n + 1); + double fi = 0; // offset + + // filter bank as a single chunk of memory + int idx = 0; + for (int i = 0; i < GaborFeature::n_bank_filters; i++) + { + auto filterData = GaborFeature::filterbank[i]; + Gabor (filterData.data(), freqs[i], GaborFeature::sig2lam, GaborFeature::gamma, thetas[i], fi, GaborFeature::n, temp_tx, temp_ty); + + for (int i = 0; i < 2 * GaborFeature::n * GaborFeature::n; ++i) + GaborFeature::ho_filterbank [idx + i] = filterData[i]; + + idx += 2 * GaborFeature::n * GaborFeature::n; + } +} + +bool GaborFeature::send_filterbank_2_gpuside() +{ + return CuGabor::send_filterbank_2_gpuside (&GaborFeature::dev_filterbank, GaborFeature::ho_filterbank.data(), GaborFeature::ho_filterbank.size()); +} + +#endif + +bool GaborFeature::init_class() +{ + #ifdef USE_GPU + GaborFeature::create_filterbank(); + if (! GaborFeature::send_filterbank_2_gpuside()) + return false; + #endif + + return true; +} \ No newline at end of file diff --git a/src/nyx/features/gabor.h b/src/nyx/features/gabor.h index f72ee626..afbc0b26 100644 --- a/src/nyx/features/gabor.h +++ b/src/nyx/features/gabor.h @@ -62,6 +62,8 @@ class GaborFeature: public FeatureMethod // Returns the angle of i-th pair of 'f0_theta_pairs' static double get_theta_in_degrees (int i); + static bool init_class(); + private: // Result cache @@ -73,14 +75,16 @@ class GaborFeature: public FeatureMethod void conv_dud (double* c, const unsigned int* a, double* b, int na, int ma, int nb, int mb); // Creates a non-normalized Gabor filter - void Gabor ( + static void Gabor ( double* Gex, // buffer of size n*n*2 double f0, double sig2lam, double gamma, double theta, double fi, - int n); + int n, + std::vector& tx, + std::vector& ty); std::vector tx, ty; @@ -143,5 +147,15 @@ class GaborFeature: public FeatureMethod int na, int ma, int nb, int mb); void GetStats_NT (WriteImageMatrix_nontriv& I, Moments2& moments2); + + // members referenced from init_class() + static void create_filterbank(); + static int n_bank_filters; + static std::vector> filterbank; + #ifdef USE_GPU + static bool send_filterbank_2_gpuside(); + static std::vector ho_filterbank; + static double* dev_filterbank; + #endif }; diff --git a/src/nyx/features/gabor_nontriv.cpp b/src/nyx/features/gabor_nontriv.cpp index d74329df..ffa19694 100644 --- a/src/nyx/features/gabor_nontriv.cpp +++ b/src/nyx/features/gabor_nontriv.cpp @@ -89,7 +89,7 @@ void GaborFeature::GaborEnergy_NT2 ( // Prepare a filter double fi = 0; - Gabor (Gexp, f0, sig2lam, gamma, theta, fi, n); + Gabor (Gexp, f0, sig2lam, gamma, theta, fi, n, tx, ty); // Helpful temps auto width = Im.get_width(), diff --git a/src/nyx/gpu/gabor.cu b/src/nyx/gpu/gabor.cu index 16c974e1..0986cb19 100644 --- a/src/nyx/gpu/gabor.cu +++ b/src/nyx/gpu/gabor.cu @@ -78,10 +78,8 @@ namespace CuGabor { bool conv_dud_gpu_fft(double* out, const unsigned int* image, double* kernel, - int image_n, int image_m, int kernel_n, int kernel_m){ - - - + int image_n, int image_m, int kernel_n, int kernel_m) + { int batch_size = 1; // calculate new size of image based on padding size @@ -216,10 +214,11 @@ namespace CuGabor { return true; } - bool conv_dud_gpu_fft_multi_filter(double* out, + bool conv_dud_gpu_fft_multi_filter(double* out, const unsigned int* image, double* kernel, - int image_n, int image_m, int kernel_n, int kernel_m, int batch_size){ + int image_n, int image_m, int kernel_n, int kernel_m, int batch_size, + double* dev_filterbank){ typedef double2 Complex; // comment out to use float typedef cufftDoubleComplex CuComplex; // comment out to use float @@ -262,30 +261,6 @@ namespace CuGabor { } } - for (int batch = 0; batch < batch_size; ++batch) { - - batch_idx = batch * size; - batch_idx2 = batch * kernel_m * kernel_n; - - for (int i = 0; i < row_size; ++i) { - for (int j = 0; j < col_size; ++j) { - - index = batch_idx + (i*col_size + j); - index2 = batch_idx2 + (i*kernel_n + j); - - if (i < kernel_m && j < kernel_n) { - - linear_kernel[index].x = kernel[2*index2]; - linear_kernel[index].y = kernel[2*index2+1]; - - } else { - linear_kernel[index].x = 0; - linear_kernel[index].y = 0; - } - } - } - } - CuComplex* d_image; CuComplex* d_result; CuComplex* d_kernel; @@ -305,8 +280,9 @@ namespace CuGabor { ok = cudaMemcpy(d_image, linear_image.data(), batch_size*size*sizeof(CuComplex), cudaMemcpyHostToDevice); CHECKERR(ok); - ok = cudaMemcpy(d_kernel, linear_kernel.data(), batch_size*size*sizeof(CuComplex), cudaMemcpyHostToDevice); - CHECKERR(ok); + bool ok2 = dense_2_padded_filterbank (d_kernel, dev_filterbank, image_m, image_n, kernel_m, kernel_n, batch_size); + if (!ok2) + return false; cufftHandle plan; cufftHandle plan_k; @@ -368,5 +344,74 @@ namespace CuGabor { return true; } + + bool send_filterbank_2_gpuside (double** dev_filterbank, const double* ho_filterbank, size_t filterbank_len_all_batches) + { + auto szb = sizeof(dev_filterbank[0]) * filterbank_len_all_batches; + + auto ok = cudaMalloc ((void**)dev_filterbank, szb); + CHECKERR(ok); + + ok = cudaMemcpy (*dev_filterbank, ho_filterbank, szb, cudaMemcpyHostToDevice); + CHECKERR(ok); + + return true; + } + + // X --> Y + __global__ void ker_dense_2_padded (cufftDoubleComplex* Y, double* X, int y_rowsize, int y_colsize, int x_rowsize, int x_colsize, int batchsize) + { + int c = threadIdx.x + blockIdx.x * blockDim.x; + int r = threadIdx.y + blockIdx.y * blockDim.y; + int batch = threadIdx.z + blockIdx.z * blockDim.z; + + // dimensions of the padded image + int row_size = y_rowsize + x_rowsize - 1; + int col_size = y_colsize + x_colsize - 1; + + if (c >= row_size || r >= col_size || batch >= batchsize) + return; + + int size = row_size * col_size; + int batch_idx = batch * size; + int batch_idx2 = batch * x_rowsize * x_colsize; + + int index = batch_idx + (c * col_size + r); + int index2 = batch_idx2 + (c * x_colsize + r); + + if (c < x_rowsize && r < x_colsize) + { + Y[index].x = X[2 * index2]; + Y[index].y = X[2 * index2 + 1]; + } + else + { + Y[index].x = 0.f; + Y[index].y = 0.f; + } + } + + // X --> Y + bool dense_2_padded_filterbank (cufftDoubleComplex* Y, double* X, int y_rowsize, int y_colsize, int x_rowsize, int x_colsize, int batchsize) + { + int rowsize = y_rowsize + x_rowsize - 1; + int colsize = y_colsize + x_colsize - 1; + + int blo = 4; + dim3 tpb (blo, blo, blo); + dim3 bpg (ceil(rowsize / blo) + 1, ceil(colsize / blo) + 1, ceil(batchsize / blo) + 1); + + ker_dense_2_padded <<< bpg, tpb >>> (Y, X, y_rowsize, y_colsize, x_rowsize, x_colsize, batchsize); + + cudaError_t ok = cudaDeviceSynchronize(); + CHECKERR(ok); + + // Check if kernel execution generated and error + ok = cudaGetLastError(); + CHECKERR(ok); + + return true; + } + } diff --git a/src/nyx/gpu/gabor.cuh b/src/nyx/gpu/gabor.cuh index b6b9913d..7add765a 100644 --- a/src/nyx/gpu/gabor.cuh +++ b/src/nyx/gpu/gabor.cuh @@ -81,5 +81,15 @@ namespace CuGabor{ bool conv_dud_gpu_fft_multi_filter(double* out, const unsigned int* image, double* kernel, - int image_n, int image_m, int kernel_n, int kernel_m, int batch_size); + int image_n, int image_m, int kernel_n, int kernel_m, int batch_size, + double* dev_filterbank); + /** + * @brief Allocates device memory and sends there a host-side precalculated filter bank 'ho_filterbank' + */ + bool send_filterbank_2_gpuside (double** dev_filterbank, const double* ho_filterbank, size_t filterbank_len_all_batches); + + /** + * @brief Copies device-side dense filter bank 'dev_Src' to preallocated 'dev_Dst' converting the dense to padded layout + */ + bool dense_2_padded_filterbank (cufftDoubleComplex* dev_Dst, double* dev_Src, int y_rowsize, int y_colsize, int x_rowsize, int x_colsize, int batchsize); } \ No newline at end of file diff --git a/src/nyx/main_nyxus.cpp b/src/nyx/main_nyxus.cpp index 4969e922..3ea01de4 100644 --- a/src/nyx/main_nyxus.cpp +++ b/src/nyx/main_nyxus.cpp @@ -57,14 +57,21 @@ int main (int argc, char** argv) // Have the feature manager prepare the feature toolset reflecting user's selection if (!theFeatureMgr.compile()) { - std::cout << "Error: compiling feature methods failed\n"; + std::cerr << "Error: compiling feature methods failed\n"; return 1; } theFeatureMgr.apply_user_selection(); // Current time stamp #1 auto startTS = getTimeStr(); - VERBOSLVL1(std::cout << "\n>>> STARTING >>> " << startTS << "\n";) + VERBOSLVL1 (std::cout << "\n>>> STARTING >>> " << startTS << "\n"); + + // Initialize feature classes + if (! theFeatureMgr.init_feature_classes()) + { + std::cerr << "Error: initializing feature classes failed\n"; + return 1; + } if (theEnvironment.dim() == 2) { From bf8833fc10122aa09b39506652eb25e343d44e32 Mon Sep 17 00:00:00 2001 From: Andriy Date: Tue, 21 May 2024 14:40:07 -0400 Subject: [PATCH 2/2] removed commented code per reviewer request --- src/nyx/features/gabor.cpp | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/src/nyx/features/gabor.cpp b/src/nyx/features/gabor.cpp index cb4de77d..f473d45d 100644 --- a/src/nyx/features/gabor.cpp +++ b/src/nyx/features/gabor.cpp @@ -549,30 +549,6 @@ void GaborFeature::GaborEnergyGPUMultiFilter ( readOnlyPixels pix_plane = Im.ReadablePixels(); -#if 0 // ---now via cache--- - std::vector g_filters; - g_filters.resize(2 * n * n * num_filters); - - double fi = 0; - - int idx = 0; - for(int i = 0; i < num_filters; ++i) - { - Gabor (Gexp, f0s[i], sig2lam, gamma, thetas[i], fi, n_gab, tx, ty); - for(int i = 0; i < 2*n*n; ++i) { - g_filters[idx+i] = Gexp[i]; - } - - idx += 2*n*n; - } - - bool success = CuGabor::conv_dud_gpu_fft_multi_filter (auxC, pix_plane.data(), g_filters.data(), Im.width, Im.height, n_gab, n_gab, num_filters); - if(!success) { - std::cerr << "Unable to calculate Gabor features on GPU \n"; - } -#endif - - //--- via cache --- bool success = CuGabor::conv_dud_gpu_fft_multi_filter ( auxC, pix_plane.data(), @@ -608,9 +584,7 @@ void GaborFeature::GaborEnergyGPUMultiFilter ( } b++; } - } - } #endif