Add symmetric scheme support for reg_optimiser_gpu and reg_conjugateGradient_gpu #92
onurulgen committed Jul 24, 2023
1 parent a10fe1d commit 7204698
Showing 10 changed files with 235 additions and 172 deletions.
2 changes: 1 addition & 1 deletion niftyreg_build_version.txt
@@ -1 +1 @@
290
291
2 changes: 1 addition & 1 deletion reg-lib/Platform.cpp
@@ -163,7 +163,7 @@ reg_optimiser<Type>* Platform::CreateOptimiser(F3dContent& con,
optimiseY,
optimiseZ,
maxIterationNumber,
0, // currentIterationNumber,
0,
&opt,
controlPointGridData,
transformationGradientData,
68 changes: 29 additions & 39 deletions reg-lib/cpu/_reg_optimiser.cpp
@@ -18,7 +18,7 @@ reg_optimiser<T>::reg_optimiser() {
this->currentDofBw = nullptr;
this->bestDof = nullptr;
this->bestDofBw = nullptr;
this->isBackwards = false;
this->isSymmetric = false;
this->gradient = nullptr;
this->currentIterationNumber = 0;
this->currentObjFunctionValue = 0;
@@ -69,23 +69,21 @@ void reg_optimiser<T>::Initialise(size_t nvox,
this->maxIterationNumber = maxIt;
this->currentIterationNumber = startIt;
this->currentDof = cppData;
if (this->bestDof != nullptr) free(this->bestDof);
this->gradient = gradData;

if (this->bestDof) free(this->bestDof);
this->bestDof = (T*)malloc(this->dofNumber * sizeof(T));
memcpy(this->bestDof, this->currentDof, this->dofNumber * sizeof(T));
if (gradData)
this->gradient = gradData;

if (nvoxBw > 0)
this->isSymmetric = nvoxBw > 0 && cppDataBw && gradDataBw;
if (this->isSymmetric) {
this->dofNumberBw = nvoxBw;
if (cppDataBw) {
this->currentDofBw = cppDataBw;
this->isBackwards = true;
if (this->bestDofBw != nullptr) free(this->bestDofBw);
this->gradientBw = gradDataBw;
if (this->bestDofBw) free(this->bestDofBw);
this->bestDofBw = (T*)malloc(this->dofNumberBw * sizeof(T));
memcpy(this->bestDofBw, this->currentDofBw, this->dofNumberBw * sizeof(T));
}
if (gradDataBw)
this->gradientBw = gradDataBw;

this->StoreCurrentDof();

this->intOpt = intOpt;
this->bestObjFunctionValue = this->currentObjFunctionValue = this->intOpt->GetObjectiveFunctionValue();
@@ -97,33 +95,33 @@ void reg_optimiser<T>::Initialise(size_t nvox,
/* *************************************************************** */
template <class T>
void reg_optimiser<T>::RestoreBestDof() {
// restore forward transformation
// Restore forward transformation
memcpy(this->currentDof, this->bestDof, this->dofNumber * sizeof(T));
// restore backward transformation if required
if (this->currentDofBw && this->bestDofBw && this->dofNumberBw > 0)
// Restore backward transformation if required
if (this->isSymmetric)
memcpy(this->currentDofBw, this->bestDofBw, this->dofNumberBw * sizeof(T));
}
/* *************************************************************** */
template <class T>
void reg_optimiser<T>::StoreCurrentDof() {
// save forward transformation
// Save forward transformation
memcpy(this->bestDof, this->currentDof, this->dofNumber * sizeof(T));
// save backward transformation if required
if (this->currentDofBw && this->bestDofBw && this->dofNumberBw > 0)
// Save backward transformation if required
if (this->isSymmetric)
memcpy(this->bestDofBw, this->currentDofBw, this->dofNumberBw * sizeof(T));
}
/* *************************************************************** */
template <class T>
void reg_optimiser<T>::Perturbation(float length) {
// initialise the randomiser
// Initialise the randomiser
srand((unsigned)time(nullptr));
// Reset the number of iteration
this->currentIterationNumber = 0;
// Create some perturbation for degree of freedom
for (size_t i = 0; i < this->dofNumber; ++i) {
this->currentDof[i] = this->bestDof[i] + length * (float)(rand() - RAND_MAX / 2) / ((float)RAND_MAX / 2.0f);
}
if (this->isBackwards) {
if (this->isSymmetric) {
for (size_t i = 0; i < this->dofNumberBw; ++i) {
this->currentDofBw[i] = this->bestDofBw[i] + length * (float)(rand() % 2001 - 1000) / 1000.f;
}
@@ -195,10 +193,9 @@ void reg_optimiser<T>::Optimise(T maxLength, T smallLength, T& startLength) {
template <class T>
reg_conjugateGradient<T>::reg_conjugateGradient(): reg_optimiser<T>::reg_optimiser() {
this->array1 = nullptr;
this->array2 = nullptr;
this->array1Bw = nullptr;
this->array2 = nullptr;
this->array2Bw = nullptr;

#ifndef NDEBUG
reg_print_msg_debug("reg_conjugateGradient<T>::reg_conjugateGradient() called");
#endif
@@ -210,22 +207,18 @@ reg_conjugateGradient<T>::~reg_conjugateGradient() {
free(this->array1);
this->array1 = nullptr;
}

if (this->array2) {
free(this->array2);
this->array2 = nullptr;
}

if (this->array1Bw) {
free(this->array1Bw);
this->array1Bw = nullptr;
}

if (this->array2) {
free(this->array2);
this->array2 = nullptr;
}
if (this->array2Bw) {
free(this->array2Bw);
this->array2Bw = nullptr;
}

#ifndef NDEBUG
reg_print_msg_debug("reg_conjugateGradient<T>::~reg_conjugateGradient() called");
#endif
@@ -252,7 +245,7 @@ void reg_conjugateGradient<T>::Initialise(size_t nvox,
this->array1 = (T*)malloc(this->dofNumber * sizeof(T));
this->array2 = (T*)malloc(this->dofNumber * sizeof(T));

if (cppDataBw && gradDataBw && nvoxBw > 0) {
if (this->isSymmetric) {
if (this->array1Bw) free(this->array1Bw);
if (this->array2Bw) free(this->array2Bw);
this->array1Bw = (T*)malloc(this->dofNumberBw * sizeof(T));
@@ -296,7 +289,7 @@ void reg_conjugateGradient<T>::UpdateGradientValues() {
for (i = 0; i < num; i++) {
array2Ptr[i] = array1Ptr[i] = -gradientPtr[i];
}
if (this->dofNumberBw > 0) {
if (this->isSymmetric) {
#ifdef _OPENMP
#pragma omp parallel for default(none) \
shared(numBw,array1PtrBw,array2PtrBw,gradientPtrBw)
@@ -323,7 +316,7 @@ void reg_conjugateGradient<T>::UpdateGradientValues() {
}
double gam = dgg / gg;

if (this->dofNumberBw > 0) {
if (this->isSymmetric) {
double dggBw = 0, ggBw = 0;
#ifdef _OPENMP
#pragma omp parallel for default(none) \
@@ -346,7 +339,7 @@ void reg_conjugateGradient<T>::UpdateGradientValues() {
array2Ptr[i] = static_cast<T>(array1Ptr[i] + gam * array2Ptr[i]);
gradientPtr[i] = -array2Ptr[i];
}
if (this->dofNumberBw > 0) {
if (this->isSymmetric) {
#ifdef _OPENMP
#pragma omp parallel for default(none) \
shared(numBw,array1PtrBw,array2PtrBw,gradientPtrBw,gam)
@@ -365,9 +358,7 @@ void reg_conjugateGradient<T>::Optimise(T maxLength,
T smallLength,
T &startLength) {
this->UpdateGradientValues();
reg_optimiser<T>::Optimise(maxLength,
smallLength,
startLength);
reg_optimiser<T>::Optimise(maxLength, smallLength, startLength);
}
/* *************************************************************** */
template <class T>
@@ -377,8 +368,7 @@ void reg_conjugateGradient<T>::Perturbation(float length) {
}
/* *************************************************************** */
template <class T>
reg_lbfgs<T>::reg_lbfgs()
:reg_optimiser<T>::reg_optimiser() {
reg_lbfgs<T>::reg_lbfgs(): reg_optimiser<T>::reg_optimiser() {
this->stepToKeep = 5;
this->oldDof = nullptr;
this->oldGrad = nullptr;
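
For context on the hunks above: UpdateGradientValues performs the standard conjugate-gradient direction update, and this commit only swaps its backward branches from a dofNumberBw > 0 test to the new isSymmetric flag. A minimal, single-threaded sketch of that update (forward-only, with made-up names, not the NiftyReg implementation itself) looks like this:

    #include <cstddef>
    #include <vector>

    // Polak-Ribiere style update: "gPrev" plays the role of array1 (previous
    // negated gradient) and "h" the role of array2 (previous conjugate direction).
    // In the symmetric case the same gg/dgg sums would presumably also accumulate
    // the backward arrays before gam is formed.
    void updateConjugateDirection(std::vector<double>& gradient,
                                  std::vector<double>& gPrev,
                                  std::vector<double>& h) {
        double gg = 0, dgg = 0;
        for (std::size_t i = 0; i < gradient.size(); ++i) {
            gg  += h[i] * gPrev[i];
            dgg += (gradient[i] + gPrev[i]) * gradient[i];
        }
        const double gam = dgg / gg;
        for (std::size_t i = 0; i < gradient.size(); ++i) {
            gPrev[i] = -gradient[i];
            h[i] = gPrev[i] + gam * h[i];
            gradient[i] = -h[i];  // mirrors gradientPtr[i] = -array2Ptr[i] in the diff
        }
    }
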
34 changes: 17 additions & 17 deletions reg-lib/cpu/_reg_optimiser.h
@@ -29,7 +29,7 @@ class InterfaceOptimiser {
template <class T>
class reg_optimiser {
protected:
bool isBackwards;
bool isSymmetric;
size_t dofNumber;
size_t dofNumberBw;
size_t ndim;
@@ -131,10 +131,10 @@ class reg_optimiser {
size_t startIt,
InterfaceOptimiser *intOpt,
T *cppData,
T *gradData = nullptr,
size_t nvoxBw = 0,
T *cppDataBw = nullptr,
T *gradDataBw = nullptr);
T *gradData,
size_t nvoxBw,
T *cppDataBw,
T *gradDataBw);
virtual void Optimise(T maxLength,
T smallLength,
T& startLength);
@@ -169,14 +169,14 @@ class reg_conjugateGradient: public reg_optimiser<T> {
size_t maxIt,
size_t startIt,
InterfaceOptimiser *intOpt,
T *cppData = nullptr,
T *gradData = nullptr,
size_t nvoxBw = 0,
T *cppDataBw = nullptr,
T *gradDataBw = nullptr) override;
T *cppData,
T *gradData,
size_t nvoxBw,
T *cppDataBw,
T *gradDataBw) override;
virtual void Optimise(T maxLength,
T smallLength,
T &startLength) override;
T& startLength) override;
virtual void Perturbation(float length) override;
};
/* *************************************************************** */
@@ -208,14 +208,14 @@ class reg_lbfgs: public reg_optimiser<T> {
size_t maxIt,
size_t startIt,
InterfaceOptimiser *intOpt,
T *cppData = nullptr,
T *gradData = nullptr,
size_t nvoxBw = 0,
T *cppDataBw = nullptr,
T *gradDataBw = nullptr) override;
T *cppData,
T *gradData,
size_t nvoxBw,
T *cppDataBw,
T *gradDataBw) override;
virtual void Optimise(T maxLength,
T smallLength,
T &startLength) override;
T& startLength) override;
};
/* *************************************************************** */
#include "_reg_optimiser.cpp"
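
Since the trailing parameters of Initialise no longer carry default values, callers must spell out the backward arguments even for a purely forward registration. A hedged usage sketch follows; the leading parameters and every variable name are assumptions for illustration and are not taken from this commit:

    // Forward-only: the backward slots must now be written out explicitly.
    optimiser->Initialise(nVox, nDim, true, true, true,   // voxel count, dimension, optimise x/y/z
                          maxIt, 0,                       // maximum and starting iteration
                          interfaceOptimiser,             // objective-function callback object
                          cppData, gradData,              // forward control points and gradient
                          0, nullptr, nullptr);           // nvoxBw = 0, so isSymmetric stays false

    // Symmetric: supplying backward control points and gradient enables isSymmetric.
    optimiser->Initialise(nVox, nDim, true, true, true, maxIt, 0, interfaceOptimiser,
                          cppData, gradData,
                          nVoxBw, cppDataBw, gradDataBw);
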
12 changes: 6 additions & 6 deletions reg-lib/cuda/BlockSize.hpp
@@ -57,8 +57,8 @@ struct BlockSize {
unsigned reg_defField_getJacobianMatrix;
/* _reg_optimiser_gpu */
unsigned reg_initialiseConjugateGradient;
unsigned reg_GetConjugateGradient1;
unsigned reg_GetConjugateGradient2;
unsigned reg_getConjugateGradient1;
unsigned reg_getConjugateGradient2;
unsigned GetMaximalLength;
unsigned reg_updateControlPointPosition;
/* _reg_ssd_gpu */
@@ -122,8 +122,8 @@ struct BlockSize100: public BlockSize {
reg_defField_getJacobianMatrix = 512; // 16 reg - 24 smem - 04 cmem
/* _reg_optimiser_gpu */
reg_initialiseConjugateGradient = 384; // 09 reg - 24 smem
reg_GetConjugateGradient1 = 320; // 12 reg - 24 smem
reg_GetConjugateGradient2 = 384; // 10 reg - 40 smem
reg_getConjugateGradient1 = 320; // 12 reg - 24 smem
reg_getConjugateGradient2 = 384; // 10 reg - 40 smem
GetMaximalLength = 384; // 04 reg - 24 smem
reg_updateControlPointPosition = 384; // 08 reg - 24 smem
/* _reg_ssd_gpu */
@@ -191,8 +191,8 @@ struct BlockSize300: public BlockSize {
reg_defField_getJacobianMatrix = 768; // 34 reg
/* _reg_optimiser_gpu */
reg_initialiseConjugateGradient = 1024; // 20 reg
reg_GetConjugateGradient1 = 1024; // 22 reg
reg_GetConjugateGradient2 = 1024; // 25 reg
reg_getConjugateGradient1 = 1024; // 22 reg
reg_getConjugateGradient2 = 1024; // 25 reg
GetMaximalLength = 1024; // 20 reg
reg_updateControlPointPosition = 1024; // 22 reg
/* _reg_ssd_gpu */
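
The two renamed entries above only change their casing to match the surrounding kernel names; the values themselves are untouched. For readers new to this header, each entry is a per-architecture thread count consumed at kernel launch time. A hedged sketch with a hypothetical kernel (only the BlockSize member is taken from the real header, and BlockSize.hpp is assumed to be included):

    // Hypothetical kernel: negate every voxel value.
    __global__ void NegateKernel(float *data, const unsigned n) {
        const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n) data[tid] = -data[tid];
    }

    void LaunchWithBlockSize(float *data, const unsigned n, const BlockSize *blockSize) {
        const unsigned threads = blockSize->reg_getConjugateGradient1;   // e.g. 320 or 1024
        const unsigned grids = (n + threads - 1) / threads;              // ceil(n / threads)
        NegateKernel<<<grids, threads>>>(data, n);
    }
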
2 changes: 1 addition & 1 deletion reg-lib/cuda/CMakeLists.txt
@@ -38,7 +38,7 @@ else(NOT COMPILE_RESULT_VAR)
endif()
#adjust for debug and release versions
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --ptxas-options=-v -g -G -lineinfo")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --ptxas-options=-v -g -G")
else(CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --ptxas-options=-O3")
endif(CMAKE_BUILD_TYPE STREQUAL "Debug")
30 changes: 17 additions & 13 deletions reg-lib/cuda/_reg_common_cuda_kernels.cu
@@ -8,49 +8,53 @@
#pragma once

/* *************************************************************** */
__device__ __inline__ float2 operator*(float a, float2 b) {
__device__ __inline__ float2 operator*(const float& a, const float2& b) {
return { a * b.x, a * b.y };
}
__device__ __inline__ float3 operator*(float a, float3 b) {
__device__ __inline__ float3 operator*(const float& a, const float3& b) {
return { a * b.x, a * b.y, a * b.z };
}
__device__ __inline__ float3 operator*(float3 a, float3 b) {
__device__ __inline__ float3 operator*(const float3& a, const float3& b) {
return { a.x * b.x, a.y * b.y, a.z * b.z };
}
__device__ __inline__ float4 operator*(float4 a, float4 b) {
__device__ __inline__ float4 operator*(const float4& a, const float4& b) {
return { a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w };
}
__device__ __inline__ float4 operator*(float a, float4 b) {
__device__ __inline__ float4 operator*(const float& a, const float4& b) {
return { a * b.x, a * b.y, a * b.z, 0.0f };
}
/* *************************************************************** */
__device__ __inline__ float2 operator/(float2 a, float2 b) {
__device__ __inline__ float2 operator/(const float2& a, const float2& b) {
return { a.x / b.x, a.y / b.y };
}
__device__ __inline__ float3 operator/(float3 a, float b) {
__device__ __inline__ float3 operator/(const float3& a, const float& b) {
return { a.x / b, a.y / b, a.z / b };
}
__device__ __inline__ float3 operator/(float3 a, float3 b) {
__device__ __inline__ float3 operator/(const float3& a, const float3& b) {
return { a.x / b.x, a.y / b.y, a.z / b.z };
}
/* *************************************************************** */
__device__ __inline__ float2 operator+(float2 a, float2 b) {
__device__ __inline__ float2 operator+(const float2& a, const float2& b) {
return { a.x + b.x, a.y + b.y };
}
__device__ __inline__ float4 operator+(float4 a, float4 b) {
__device__ __inline__ float4 operator+(const float4& a, const float4& b) {
return { a.x + b.x, a.y + b.y, a.z + b.z, 0.0f };
}
__device__ __inline__ float3 operator+(float3 a, float3 b) {
__device__ __inline__ float3 operator+(const float3& a, const float3& b) {
return { a.x + b.x, a.y + b.y, a.z + b.z };
}
/* *************************************************************** */
__device__ __inline__ float3 operator-(float3 a, float3 b) {
__device__ __inline__ float3 operator-(const float3& a, const float3& b) {
return { a.x - b.x, a.y - b.y, a.z - b.z };
}
__device__ __inline__ float4 operator-(float4 a, float4 b) {
__device__ __inline__ float4 operator-(const float4& a, const float4& b) {
return { a.x - b.x, a.y - b.y, a.z - b.z, 0.f };
}
/* *************************************************************** */
__device__ __inline__ double2 operator+(const double2& a, const double2& b) {
return { a.x + b.x, a.y + b.y };
}
/* *************************************************************** */
__device__ __inline__ void reg_mat33_mul_cuda(const mat33& mat, const float (&in)[3], const float& weight, float (&out)[3], const bool& is3d) {
out[0] = weight * (mat.m[0][0] * in[0] + mat.m[0][1] * in[1] + mat.m[0][2] * in[2]);
out[1] = weight * (mat.m[1][0] * in[0] + mat.m[1][1] * in[1] + mat.m[1][2] * in[2]);
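
The added double2 operator+ is the one functional change in this header; the remaining hunks only switch the operands to const references. One plausible use, stated here as an assumption because the calling kernels are not part of the shown diff, is accumulating paired partial sums (for instance the gg and dgg terms of the conjugate-gradient update) so that a generic reduction can combine them through this operator. A hedged sketch with a hypothetical kernel:

    // Hypothetical: each thread emits one {gg, dgg}-style pair; any reduction that
    // combines elements with operator+(const double2&, const double2&) can then
    // sum the array of pairs in a single pass.
    __global__ void EmitPairKernel(const float *gradient, const float *conjugateG,
                                   double2 *pairs, const unsigned n) {
        const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n) {
            const double g = gradient[tid];
            const double h = conjugateG[tid];
            pairs[tid] = make_double2(h * g, (g + h) * g);
        }
    }

For instance, thrust::reduce over such an array with an explicit double2 initial value and its default plus functor would resolve to the operator defined above.
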