Add symmetric scheme support for reg_optimiser_gpu and reg_conjugateGradient_gpu #92
onurulgen committed Jul 24, 2023
1 parent a10fe1d commit 7204698
Showing 10 changed files with 235 additions and 172 deletions.
2 changes: 1 addition & 1 deletion niftyreg_build_version.txt
@@ -1 +1 @@
290
291
2 changes: 1 addition & 1 deletion reg-lib/Platform.cpp
@@ -163,7 +163,7 @@ reg_optimiser<Type>* Platform::CreateOptimiser(F3dContent& con,
optimiseY,
optimiseZ,
maxIterationNumber,
0, // currentIterationNumber,
0,
&opt,
controlPointGridData,
transformationGradientData,
68 changes: 29 additions & 39 deletions reg-lib/cpu/_reg_optimiser.cpp
@@ -18,7 +18,7 @@ reg_optimiser<T>::reg_optimiser() {
this->currentDofBw = nullptr;
this->bestDof = nullptr;
this->bestDofBw = nullptr;
this->isBackwards = false;
this->isSymmetric = false;
this->gradient = nullptr;
this->currentIterationNumber = 0;
this->currentObjFunctionValue = 0;
@@ -69,23 +69,21 @@ void reg_optimiser<T>::Initialise(size_t nvox,
this->maxIterationNumber = maxIt;
this->currentIterationNumber = startIt;
this->currentDof = cppData;
if (this->bestDof != nullptr) free(this->bestDof);
this->gradient = gradData;

if (this->bestDof) free(this->bestDof);
this->bestDof = (T*)malloc(this->dofNumber * sizeof(T));
memcpy(this->bestDof, this->currentDof, this->dofNumber * sizeof(T));
if (gradData)
this->gradient = gradData;

if (nvoxBw > 0)
this->isSymmetric = nvoxBw > 0 && cppDataBw && gradDataBw;
if (this->isSymmetric) {
this->dofNumberBw = nvoxBw;
if (cppDataBw) {
this->currentDofBw = cppDataBw;
this->isBackwards = true;
if (this->bestDofBw != nullptr) free(this->bestDofBw);
this->gradientBw = gradDataBw;
if (this->bestDofBw) free(this->bestDofBw);
this->bestDofBw = (T*)malloc(this->dofNumberBw * sizeof(T));
memcpy(this->bestDofBw, this->currentDofBw, this->dofNumberBw * sizeof(T));
}
if (gradDataBw)
this->gradientBw = gradDataBw;

this->StoreCurrentDof();

this->intOpt = intOpt;
this->bestObjFunctionValue = this->currentObjFunctionValue = this->intOpt->GetObjectiveFunctionValue();
@@ -97,33 +95,33 @@ void reg_optimiser<T>::Initialise(size_t nvox,
/* *************************************************************** */
template <class T>
void reg_optimiser<T>::RestoreBestDof() {
// restore forward transformation
// Restore forward transformation
memcpy(this->currentDof, this->bestDof, this->dofNumber * sizeof(T));
// restore backward transformation if required
if (this->currentDofBw && this->bestDofBw && this->dofNumberBw > 0)
// Restore backward transformation if required
if (this->isSymmetric)
memcpy(this->currentDofBw, this->bestDofBw, this->dofNumberBw * sizeof(T));
}
/* *************************************************************** */
template <class T>
void reg_optimiser<T>::StoreCurrentDof() {
// save forward transformation
// Save forward transformation
memcpy(this->bestDof, this->currentDof, this->dofNumber * sizeof(T));
// save backward transformation if required
if (this->currentDofBw && this->bestDofBw && this->dofNumberBw > 0)
// Save backward transformation if required
if (this->isSymmetric)
memcpy(this->bestDofBw, this->currentDofBw, this->dofNumberBw * sizeof(T));
}
/* *************************************************************** */
template <class T>
void reg_optimiser<T>::Perturbation(float length) {
// initialise the randomiser
// Initialise the randomiser
srand((unsigned)time(nullptr));
// Reset the number of iteration
this->currentIterationNumber = 0;
// Create some perturbation for degree of freedom
for (size_t i = 0; i < this->dofNumber; ++i) {
this->currentDof[i] = this->bestDof[i] + length * (float)(rand() - RAND_MAX / 2) / ((float)RAND_MAX / 2.0f);
}
if (this->isBackwards) {
if (this->isSymmetric) {
for (size_t i = 0; i < this->dofNumberBw; ++i) {
this->currentDofBw[i] = this->bestDofBw[i] + length * (float)(rand() % 2001 - 1000) / 1000.f;
}
@@ -195,10 +193,9 @@ void reg_optimiser<T>::Optimise(T maxLength, T smallLength, T& startLength) {
template <class T>
reg_conjugateGradient<T>::reg_conjugateGradient(): reg_optimiser<T>::reg_optimiser() {
this->array1 = nullptr;
this->array2 = nullptr;
this->array1Bw = nullptr;
this->array2 = nullptr;
this->array2Bw = nullptr;

#ifndef NDEBUG
reg_print_msg_debug("reg_conjugateGradient<T>::reg_conjugateGradient() called");
#endif
@@ -210,22 +207,18 @@ reg_conjugateGradient<T>::~reg_conjugateGradient() {
free(this->array1);
this->array1 = nullptr;
}

if (this->array2) {
free(this->array2);
this->array2 = nullptr;
}

if (this->array1Bw) {
free(this->array1Bw);
this->array1Bw = nullptr;
}

if (this->array2) {
free(this->array2);
this->array2 = nullptr;
}
if (this->array2Bw) {
free(this->array2Bw);
this->array2Bw = nullptr;
}

#ifndef NDEBUG
reg_print_msg_debug("reg_conjugateGradient<T>::~reg_conjugateGradient() called");
#endif
@@ -252,7 +245,7 @@ void reg_conjugateGradient<T>::Initialise(size_t nvox,
this->array1 = (T*)malloc(this->dofNumber * sizeof(T));
this->array2 = (T*)malloc(this->dofNumber * sizeof(T));

if (cppDataBw && gradDataBw && nvoxBw > 0) {
if (this->isSymmetric) {
if (this->array1Bw) free(this->array1Bw);
if (this->array2Bw) free(this->array2Bw);
this->array1Bw = (T*)malloc(this->dofNumberBw * sizeof(T));
@@ -296,7 +289,7 @@ void reg_conjugateGradient<T>::UpdateGradientValues() {
for (i = 0; i < num; i++) {
array2Ptr[i] = array1Ptr[i] = -gradientPtr[i];
}
if (this->dofNumberBw > 0) {
if (this->isSymmetric) {
#ifdef _OPENMP
#pragma omp parallel for default(none) \
shared(numBw,array1PtrBw,array2PtrBw,gradientPtrBw)
@@ -323,7 +316,7 @@ void reg_conjugateGradient<T>::UpdateGradientValues() {
}
double gam = dgg / gg;

if (this->dofNumberBw > 0) {
if (this->isSymmetric) {
double dggBw = 0, ggBw = 0;
#ifdef _OPENMP
#pragma omp parallel for default(none) \
@@ -346,7 +339,7 @@ void reg_conjugateGradient<T>::UpdateGradientValues() {
array2Ptr[i] = static_cast<T>(array1Ptr[i] + gam * array2Ptr[i]);
gradientPtr[i] = -array2Ptr[i];
}
if (this->dofNumberBw > 0) {
if (this->isSymmetric) {
#ifdef _OPENMP
#pragma omp parallel for default(none) \
shared(numBw,array1PtrBw,array2PtrBw,gradientPtrBw,gam)
@@ -365,9 +358,7 @@ void reg_conjugateGradient<T>::Optimise(T maxLength,
T smallLength,
T &startLength) {
this->UpdateGradientValues();
reg_optimiser<T>::Optimise(maxLength,
smallLength,
startLength);
reg_optimiser<T>::Optimise(maxLength, smallLength, startLength);
}
/* *************************************************************** */
template <class T>
@@ -377,8 +368,7 @@ void reg_conjugateGradient<T>::Perturbation(float length) {
}
/* *************************************************************** */
template <class T>
reg_lbfgs<T>::reg_lbfgs()
:reg_optimiser<T>::reg_optimiser() {
reg_lbfgs<T>::reg_lbfgs(): reg_optimiser<T>::reg_optimiser() {
this->stepToKeep = 5;
this->oldDof = nullptr;
this->oldGrad = nullptr;
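
For context on the hunks above: UpdateGradientValues performs the standard conjugate-gradient direction update, and this commit only swaps its backward branches from a dofNumberBw > 0 test to the new isSymmetric flag. A minimal, single-threaded sketch of that update (forward-only, with made-up names, not the NiftyReg implementation itself) looks like this:

    #include <cstddef>
    #include <vector>

    // Polak-Ribiere style update: "gPrev" plays the role of array1 (previous
    // negated gradient) and "h" the role of array2 (previous conjugate direction).
    // In the symmetric case the same gg/dgg sums would presumably also accumulate
    // the backward arrays before gam is formed.
    void updateConjugateDirection(std::vector<double>& gradient,
                                  std::vector<double>& gPrev,
                                  std::vector<double>& h) {
        double gg = 0, dgg = 0;
        for (std::size_t i = 0; i < gradient.size(); ++i) {
            gg  += h[i] * gPrev[i];
            dgg += (gradient[i] + gPrev[i]) * gradient[i];
        }
        const double gam = dgg / gg;
        for (std::size_t i = 0; i < gradient.size(); ++i) {
            gPrev[i] = -gradient[i];
            h[i] = gPrev[i] + gam * h[i];
            gradient[i] = -h[i];  // mirrors gradientPtr[i] = -array2Ptr[i] in the diff
        }
    }
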
34 changes: 17 additions & 17 deletions reg-lib/cpu/_reg_optimiser.h
@@ -29,7 +29,7 @@ class InterfaceOptimiser {
template <class T>
class reg_optimiser {
protected:
bool isBackwards;
bool isSymmetric;
size_t dofNumber;
size_t dofNumberBw;
size_t ndim;
@@ -131,10 +131,10 @@ class reg_optimiser {
size_t startIt,
InterfaceOptimiser *intOpt,
T *cppData,
T *gradData = nullptr,
size_t nvoxBw = 0,
T *cppDataBw = nullptr,
T *gradDataBw = nullptr);
T *gradData,
size_t nvoxBw,
T *cppDataBw,
T *gradDataBw);
virtual void Optimise(T maxLength,
T smallLength,
T& startLength);
@@ -169,14 +169,14 @@ class reg_conjugateGradient: public reg_optimiser<T> {
size_t maxIt,
size_t startIt,
InterfaceOptimiser *intOpt,
T *cppData = nullptr,
T *gradData = nullptr,
size_t nvoxBw = 0,
T *cppDataBw = nullptr,
T *gradDataBw = nullptr) override;
T *cppData,
T *gradData,
size_t nvoxBw,
T *cppDataBw,
T *gradDataBw) override;
virtual void Optimise(T maxLength,
T smallLength,
T &startLength) override;
T& startLength) override;
virtual void Perturbation(float length) override;
};
/* *************************************************************** */
@@ -208,14 +208,14 @@ class reg_lbfgs: public reg_optimiser<T> {
size_t maxIt,
size_t startIt,
InterfaceOptimiser *intOpt,
T *cppData = nullptr,
T *gradData = nullptr,
size_t nvoxBw = 0,
T *cppDataBw = nullptr,
T *gradDataBw = nullptr) override;
T *cppData,
T *gradData,
size_t nvoxBw,
T *cppDataBw,
T *gradDataBw) override;
virtual void Optimise(T maxLength,
T smallLength,
T &startLength) override;
T& startLength) override;
};
/* *************************************************************** */
#include "_reg_optimiser.cpp"
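
Since the trailing parameters of Initialise no longer carry default values, callers must spell out the backward arguments even for a purely forward registration. A hedged usage sketch follows; the leading parameters and every variable name are assumptions for illustration and are not taken from this commit:

    // Forward-only: the backward slots must now be written out explicitly.
    optimiser->Initialise(nVox, nDim, true, true, true,   // voxel count, dimension, optimise x/y/z
                          maxIt, 0,                       // maximum and starting iteration
                          interfaceOptimiser,             // objective-function callback object
                          cppData, gradData,              // forward control points and gradient
                          0, nullptr, nullptr);           // nvoxBw = 0, so isSymmetric stays false

    // Symmetric: supplying backward control points and gradient enables isSymmetric.
    optimiser->Initialise(nVox, nDim, true, true, true, maxIt, 0, interfaceOptimiser,
                          cppData, gradData,
                          nVoxBw, cppDataBw, gradDataBw);
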
12 changes: 6 additions & 6 deletions reg-lib/cuda/BlockSize.hpp
@@ -57,8 +57,8 @@ struct BlockSize {
unsigned reg_defField_getJacobianMatrix;
/* _reg_optimiser_gpu */
unsigned reg_initialiseConjugateGradient;
unsigned reg_GetConjugateGradient1;
unsigned reg_GetConjugateGradient2;
unsigned reg_getConjugateGradient1;
unsigned reg_getConjugateGradient2;
unsigned GetMaximalLength;
unsigned reg_updateControlPointPosition;
/* _reg_ssd_gpu */
@@ -122,8 +122,8 @@ struct BlockSize100: public BlockSize {
reg_defField_getJacobianMatrix = 512; // 16 reg - 24 smem - 04 cmem
/* _reg_optimiser_gpu */
reg_initialiseConjugateGradient = 384; // 09 reg - 24 smem
reg_GetConjugateGradient1 = 320; // 12 reg - 24 smem
reg_GetConjugateGradient2 = 384; // 10 reg - 40 smem
reg_getConjugateGradient1 = 320; // 12 reg - 24 smem
reg_getConjugateGradient2 = 384; // 10 reg - 40 smem
GetMaximalLength = 384; // 04 reg - 24 smem
reg_updateControlPointPosition = 384; // 08 reg - 24 smem
/* _reg_ssd_gpu */
@@ -191,8 +191,8 @@ struct BlockSize300: public BlockSize {
reg_defField_getJacobianMatrix = 768; // 34 reg
/* _reg_optimiser_gpu */
reg_initialiseConjugateGradient = 1024; // 20 reg
reg_GetConjugateGradient1 = 1024; // 22 reg
reg_GetConjugateGradient2 = 1024; // 25 reg
reg_getConjugateGradient1 = 1024; // 22 reg
reg_getConjugateGradient2 = 1024; // 25 reg
GetMaximalLength = 1024; // 20 reg
reg_updateControlPointPosition = 1024; // 22 reg
/* _reg_ssd_gpu */
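
The two renamed entries above only change their casing to match the surrounding kernel names; the values themselves are untouched. For readers new to this header, each entry is a per-architecture thread count consumed at kernel launch time. A hedged sketch with a hypothetical kernel (only the BlockSize member is taken from the real header, and BlockSize.hpp is assumed to be included):

    // Hypothetical kernel: negate every voxel value.
    __global__ void NegateKernel(float *data, const unsigned n) {
        const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n) data[tid] = -data[tid];
    }

    void LaunchWithBlockSize(float *data, const unsigned n, const BlockSize *blockSize) {
        const unsigned threads = blockSize->reg_getConjugateGradient1;   // e.g. 320 or 1024
        const unsigned grids = (n + threads - 1) / threads;              // ceil(n / threads)
        NegateKernel<<<grids, threads>>>(data, n);
    }
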
2 changes: 1 addition & 1 deletion reg-lib/cuda/CMakeLists.txt
@@ -38,7 +38,7 @@ else(NOT COMPILE_RESULT_VAR)
endif()
#adjust for debug and release versions
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --ptxas-options=-v -g -G -lineinfo")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --ptxas-options=-v -g -G")
else(CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --ptxas-options=-O3")
endif(CMAKE_BUILD_TYPE STREQUAL "Debug")
30 changes: 17 additions & 13 deletions reg-lib/cuda/_reg_common_cuda_kernels.cu
@@ -8,49 +8,53 @@
#pragma once

/* *************************************************************** */
__device__ __inline__ float2 operator*(float a, float2 b) {
__device__ __inline__ float2 operator*(const float& a, const float2& b) {
return { a * b.x, a * b.y };
}
__device__ __inline__ float3 operator*(float a, float3 b) {
__device__ __inline__ float3 operator*(const float& a, const float3& b) {
return { a * b.x, a * b.y, a * b.z };
}
__device__ __inline__ float3 operator*(float3 a, float3 b) {
__device__ __inline__ float3 operator*(const float3& a, const float3& b) {
return { a.x * b.x, a.y * b.y, a.z * b.z };
}
__device__ __inline__ float4 operator*(float4 a, float4 b) {
__device__ __inline__ float4 operator*(const float4& a, const float4& b) {
return { a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w };
}
__device__ __inline__ float4 operator*(float a, float4 b) {
__device__ __inline__ float4 operator*(const float& a, const float4& b) {
return { a * b.x, a * b.y, a * b.z, 0.0f };
}
/* *************************************************************** */
__device__ __inline__ float2 operator/(float2 a, float2 b) {
__device__ __inline__ float2 operator/(const float2& a, const float2& b) {
return { a.x / b.x, a.y / b.y };
}
__device__ __inline__ float3 operator/(float3 a, float b) {
__device__ __inline__ float3 operator/(const float3& a, const float& b) {
return { a.x / b, a.y / b, a.z / b };
}
__device__ __inline__ float3 operator/(float3 a, float3 b) {
__device__ __inline__ float3 operator/(const float3& a, const float3& b) {
return { a.x / b.x, a.y / b.y, a.z / b.z };
}
/* *************************************************************** */
__device__ __inline__ float2 operator+(float2 a, float2 b) {
__device__ __inline__ float2 operator+(const float2& a, const float2& b) {
return { a.x + b.x, a.y + b.y };
}
__device__ __inline__ float4 operator+(float4 a, float4 b) {
__device__ __inline__ float4 operator+(const float4& a, const float4& b) {
return { a.x + b.x, a.y + b.y, a.z + b.z, 0.0f };
}
__device__ __inline__ float3 operator+(float3 a, float3 b) {
__device__ __inline__ float3 operator+(const float3& a, const float3& b) {
return { a.x + b.x, a.y + b.y, a.z + b.z };
}
/* *************************************************************** */
__device__ __inline__ float3 operator-(float3 a, float3 b) {
__device__ __inline__ float3 operator-(const float3& a, const float3& b) {
return { a.x - b.x, a.y - b.y, a.z - b.z };
}
__device__ __inline__ float4 operator-(float4 a, float4 b) {
__device__ __inline__ float4 operator-(const float4& a, const float4& b) {
return { a.x - b.x, a.y - b.y, a.z - b.z, 0.f };
}
/* *************************************************************** */
__device__ __inline__ double2 operator+(const double2& a, const double2& b) {
return { a.x + b.x, a.y + b.y };
}
/* *************************************************************** */
__device__ __inline__ void reg_mat33_mul_cuda(const mat33& mat, const float (&in)[3], const float& weight, float (&out)[3], const bool& is3d) {
out[0] = weight * (mat.m[0][0] * in[0] + mat.m[0][1] * in[1] + mat.m[0][2] * in[2]);
out[1] = weight * (mat.m[1][0] * in[0] + mat.m[1][1] * in[1] + mat.m[1][2] * in[2]);
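
The added double2 operator+ is the one functional change in this header; the remaining hunks only switch the operands to const references. One plausible use, stated here as an assumption because the calling kernels are not part of the shown diff, is accumulating paired partial sums (for instance the gg and dgg terms of the conjugate-gradient update) so that a generic reduction can combine them through this operator. A hedged sketch with a hypothetical kernel:

    // Hypothetical: each thread emits one {gg, dgg}-style pair; any reduction that
    // combines elements with operator+(const double2&, const double2&) can then
    // sum the array of pairs in a single pass.
    __global__ void EmitPairKernel(const float *gradient, const float *conjugateG,
                                   double2 *pairs, const unsigned n) {
        const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n) {
            const double g = gradient[tid];
            const double h = conjugateG[tid];
            pairs[tid] = make_double2(h * g, (g + h) * g);
        }
    }

For instance, thrust::reduce over such an array with an explicit double2 initial value and its default plus functor would resolve to the operator defined above.
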