Commit 9488e67d authored by Marc Modat

The node-based gradient smoothing now also works with the GPU-based implementation

parent 31980be0
@@ -69,7 +69,7 @@ typedef struct{
float targetUpThresholdValue;
float sourceLowThresholdValue;
float sourceUpThresholdValue;
float nmiGradientSmoothingValue;
float gradientSmoothingValue;
}PARAM;
typedef struct{
bool targetImageFlag;
@@ -108,7 +108,7 @@ typedef struct{
bool sourceUpThresholdFlag;
bool twoDimRegistration;
bool nmiGradientSmoothingFlag;
bool gradientSmoothingFlag;
#ifdef _USE_CUDA
bool useGPUFlag;
@@ -163,7 +163,7 @@ void Usage(char *exec)
printf("\t-lp <int>\t\tOnly perform the first levels [ln]\n");
printf("\t-nopy\t\t\tDo not use a pyramidal approach [no]\n");
printf("\t-nmiGradSM <float>\t\tTo smooth the node-based NMI gradient (in mm) [0]\n");
printf("\t-smoothGrad <float>\tTo smooth the node-based gradient (in mm) [0]\n");
printf("\t-be <float>\t\tWeight of the bending energy penalty term [0.01]\n");
printf("\t-noAppBE\t\tTo not approximate the BE value only at the control point position\n");
@@ -326,8 +326,8 @@ int main(int argc, char **argv)
flag->sourceUpThresholdFlag=1;
}
else if(strcmp(argv[i], "-nmiGradSM") == 0){
param->nmiGradientSmoothingValue=(float)(atof(argv[++i]));
flag->nmiGradientSmoothingFlag=1;
param->gradientSmoothingValue=(float)(atof(argv[++i]));
flag->gradientSmoothingFlag=1;
}
else if(strcmp(argv[i], "-bgi") == 0){
param->backgroundIndex[0]=atoi(argv[++i]);
@@ -789,18 +789,22 @@ int main(int argc, char **argv)
positionFieldImage->dim[2]=positionFieldImage->ny=targetImage->ny;
positionFieldImage->dim[3]=positionFieldImage->nz=targetImage->nz;
positionFieldImage->dim[4]=positionFieldImage->nt=1;positionFieldImage->pixdim[4]=positionFieldImage->dt=1.0;
if(flag->twoDimRegistration) positionFieldImage->dim[5]=positionFieldImage->nu=2;
else positionFieldImage->dim[5]=positionFieldImage->nu=3;
positionFieldImage->pixdim[5]=positionFieldImage->du=1.0;
else positionFieldImage->dim[5]=positionFieldImage->nu=3;
positionFieldImage->pixdim[5]=positionFieldImage->du=1.0;
positionFieldImage->dim[6]=positionFieldImage->nv=1;positionFieldImage->pixdim[6]=positionFieldImage->dv=1.0;
positionFieldImage->dim[7]=positionFieldImage->nw=1;positionFieldImage->pixdim[7]=positionFieldImage->dw=1.0;
positionFieldImage->nvox=positionFieldImage->nx*positionFieldImage->ny*positionFieldImage->nz*positionFieldImage->nt*positionFieldImage->nu;
if(sizeof(PrecisionTYPE)==4) positionFieldImage->datatype = NIFTI_TYPE_FLOAT32;
else positionFieldImage->datatype = NIFTI_TYPE_FLOAT64;
positionFieldImage->nbyper = sizeof(PrecisionTYPE);
positionFieldImage->data = (void *)calloc(positionFieldImage->nvox, positionFieldImage->nbyper);
#ifdef _USE_CUDA
if(flag->useGPUFlag){
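// in the GPU path the position field lives on the device, so no host buffer is allocated here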
positionFieldImage->data=NULL;
}
else
#endif
positionFieldImage->data = (void *)calloc(positionFieldImage->nvox, positionFieldImage->nbyper);
/* allocate the result image */
nifti_image *resultImage = nifti_copy_nim_info(targetImage);
@@ -1099,6 +1103,12 @@ int main(int argc, char **argv)
controlPointImage,
&voxelNMIGradientArray_d,
&nodeNMIGradientArray_d);
if(flag->gradientSmoothingFlag){
reg_gaussianSmoothing_gpu( controlPointImage,
&nodeNMIGradientArray_d,
param->gradientSmoothingValue,
NULL);
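// a NULL axis mask smooths along x, y and z (see reg_gaussianSmoothing_gpu below)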
}
/* The NMI gradient is converted from voxel space to real space */
reg_convertNMIGradientFromVoxelToRealSpace_gpu( sourceMatrix_xyz,
controlPointImage,
@@ -1153,9 +1163,9 @@ int main(int argc, char **argv)
targetMask);
reg_smoothImageForCubicSpline<PrecisionTYPE>(voxelNMIGradientImage,smoothingRadius);
reg_voxelCentric2NodeCentric(nodeNMIGradientImage,voxelNMIGradientImage);
if(flag->nmiGradientSmoothingFlag){
if(flag->gradientSmoothingFlag){
reg_gaussianSmoothing<PrecisionTYPE>( nodeNMIGradientImage,
param->nmiGradientSmoothingValue,
param->gradientSmoothingValue,
NULL);
}
@@ -1570,21 +1580,26 @@ int main(int argc, char **argv)
nifti_set_filenames(controlPointImage, param->outputCPPName, 0, 0);
nifti_image_write(controlPointImage);
if(param->level2Perform != param->levelNumber){
free(positionFieldImage->data);
positionFieldImage->dim[1]=positionFieldImage->nx=targetHeader->nx;
positionFieldImage->dim[2]=positionFieldImage->ny=targetHeader->ny;
positionFieldImage->dim[3]=positionFieldImage->nz=targetHeader->nz;
positionFieldImage->dim[4]=positionFieldImage->nt=1;positionFieldImage->pixdim[4]=positionFieldImage->dt=1.0;
if(flag->twoDimRegistration)
positionFieldImage->dim[5]=positionFieldImage->nu=2;
else positionFieldImage->dim[5]=positionFieldImage->nu=3;
positionFieldImage->pixdim[5]=positionFieldImage->du=1.0;
positionFieldImage->dim[6]=positionFieldImage->nv=1;positionFieldImage->pixdim[6]=positionFieldImage->dv=1.0;
positionFieldImage->dim[7]=positionFieldImage->nw=1;positionFieldImage->pixdim[7]=positionFieldImage->dw=1.0;
positionFieldImage->nvox=positionFieldImage->nx*positionFieldImage->ny*positionFieldImage->nz*positionFieldImage->nt*positionFieldImage->nu;
#ifdef _USE_CUDA
if(flag->useGPUFlag && param->level2Perform==param->levelNumber)
positionFieldImage->data = (void *)calloc(positionFieldImage->nvox, positionFieldImage->nbyper);
}
else
#endif
if(param->level2Perform != param->levelNumber){
if(positionFieldImage->data)free(positionFieldImage->data);
positionFieldImage->dim[1]=positionFieldImage->nx=targetHeader->nx;
positionFieldImage->dim[2]=positionFieldImage->ny=targetHeader->ny;
positionFieldImage->dim[3]=positionFieldImage->nz=targetHeader->nz;
positionFieldImage->dim[4]=positionFieldImage->nt=1;positionFieldImage->pixdim[4]=positionFieldImage->dt=1.0;
if(flag->twoDimRegistration)
positionFieldImage->dim[5]=positionFieldImage->nu=2;
else positionFieldImage->dim[5]=positionFieldImage->nu=3;
positionFieldImage->pixdim[5]=positionFieldImage->du=1.0;
positionFieldImage->dim[6]=positionFieldImage->nv=1;positionFieldImage->pixdim[6]=positionFieldImage->dv=1.0;
positionFieldImage->dim[7]=positionFieldImage->nw=1;positionFieldImage->pixdim[7]=positionFieldImage->dw=1.0;
positionFieldImage->nvox=positionFieldImage->nx*positionFieldImage->ny*positionFieldImage->nz*positionFieldImage->nt*positionFieldImage->nu;
positionFieldImage->data = (void *)calloc(positionFieldImage->nvox, positionFieldImage->nbyper);
}
/* The corresponding deformation field is evaluated and saved */
reg_bspline<PrecisionTYPE>( controlPointImage,
......
@@ -928,7 +928,7 @@ void reg_gaussianSmoothing1(nifti_image *image,
if(sigma>0) currentSigma=sigma/image->pixdim[n];
else currentSigma=fabs(sigma); // a negative sigma is interpreted in voxels
int radius=(int)ceil(currentSigma*3.0f);
if(radius>1){
if(radius>0){
PrecisionTYPE *kernel = new PrecisionTYPE[2*radius+1];
PrecisionTYPE kernelSum=0;
for(int i=-radius; i<=radius; i++){
@@ -976,14 +976,14 @@ void reg_gaussianSmoothing1(nifti_image *image,
template <class PrecisionTYPE>
void reg_gaussianSmoothing( nifti_image *image,
float sigma,
bool downXYZ[8])
bool smoothXYZ[8])
{
bool axisToSmooth[8];
if(downXYZ==NULL){
if(smoothXYZ==NULL){
for(int i=0; i<8; i++) axisToSmooth[i]=true;
}
else{
for(int i=0; i<8; i++) axisToSmooth[i]=downXYZ[i];
for(int i=0; i<8; i++) axisToSmooth[i]=smoothXYZ[i];
}
if(sigma==0.0) return;
......
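For reference, a minimal host-side sketch of the kernel reg_gaussianSmoothing builds, under the same conventions as above: a positive sigma is given in mm and divided by pixdim, a negative sigma is taken directly in voxels, the support is truncated at 3*sigma and the taps are normalised to unit sum. Note that the relaxed radius>0 test above means a 3-tap kernel (radius 1) is no longer skipped. The helper name is ours, not part of the commit:

#include <math.h>

float* buildGaussianKernel(float sigma, float pixdim, int &radius)
{
float currentSigma = sigma>0 ? sigma/pixdim : (float)fabs(sigma); // voxels if negative
radius = (int)ceil(currentSigma*3.0f); // cut the support at 3*sigma
float *kernel = new float[2*radius+1];
float kernelSum = 0;
for(int i=-radius; i<=radius; i++){
// exp(-i^2/(2*sigma^2)) / (sigma*sqrt(2*pi)); 2.506628274631 = sqrt(2*pi)
kernel[radius+i] = (float)(exp(-(i*i)/(2.0*currentSigma*currentSigma)) / (currentSigma*2.506628274631));
kernelSum += kernel[radius+i];
}
for(int i=0; i<2*radius+1; i++) kernel[i] /= kernelSum; // unit sum
return kernel; // the caller is responsible for delete[]
}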
@@ -62,118 +62,4 @@ void reg_getVoxelBasedNMIGradientUsingPW_gpu( nifti_image *targetImage,
#endif
}
void reg_smoothImageForCubicSpline_gpu( nifti_image *resultImage,
float4 **voxelNMIGradientArray_d,
int *smoothingRadius)
{
const int voxelNumber = resultImage->nvox;
int windowSize;
float4 *smoothedImage;
float *window;
const int3 imageSize = make_int3(resultImage->nx, resultImage->ny, resultImage->nz);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_ImageSize,&imageSize,sizeof(int3)));
// X axis
windowSize = 1+smoothingRadius[0]*2;
CUDA_SAFE_CALL(cudaMalloc((void **)&window,windowSize*sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void **)&smoothedImage,voxelNumber*sizeof(float4)));
int Grid_reg_FillConvolutionWindows = (int)ceil((float)windowSize/(float)Block_reg_FillConvolutionWindows);
dim3 B1(Block_reg_FillConvolutionWindows,1,1);
dim3 G1(Grid_reg_FillConvolutionWindows,1,1);
FillConvolutionWindows_kernel <<< G1, B1 >>> (window, windowSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
#if _VERBOSE
printf("[VERBOSE] FillConvolutionWindows_kernel: %s - Grid size [%i %i %i] - Block size [%i %i %i]\n",
cudaGetErrorString(cudaGetLastError()),G1.x,G1.y,G1.z,B1.x,B1.y,B1.z);
#endif
NormaliseConvolutionWindows_kernel <<< dim3(1,1,1), dim3(1,1,1) >>>(window, windowSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
#if _VERBOSE
printf("[VERBOSE] NormaliseConvolutionWindows_kernel: %s - Grid size [1 0 0] - Block size [1 0 0]\n",
cudaGetErrorString(cudaGetLastError()));
#endif
const unsigned int Grid_reg_ApplyConvolutionWindowAlongX =
(unsigned int)ceil((float)voxelNumber/(float)Block_reg_ApplyConvolutionWindowAlongX);
dim3 B2(Block_reg_ApplyConvolutionWindowAlongX,1,1);
dim3 G2(Grid_reg_ApplyConvolutionWindowAlongX,1,1);
CUDA_SAFE_CALL(cudaBindTexture(0, gradientImageTexture, *voxelNMIGradientArray_d, voxelNumber*sizeof(float4)));
CUDA_SAFE_CALL(cudaBindTexture(0, convolutionWinTexture, window, windowSize*sizeof(float)));
_reg_ApplyConvolutionWindowAlongX_kernel <<< G2, B2 >>> (smoothedImage, windowSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
#if _VERBOSE
printf("[VERBOSE] reg_ApplyConvolutionWindowAlongX_kernel: %s - Grid size [%i %i %i] - Block size [%i %i %i]\n",
cudaGetErrorString(cudaGetLastError()),G2.x,G2.y,G2.z,B2.x,B2.y,B2.z);
#endif
CUDA_SAFE_CALL(cudaFree(window));
CUDA_SAFE_CALL(cudaMemcpy(*voxelNMIGradientArray_d, smoothedImage, voxelNumber*sizeof(float4), cudaMemcpyDeviceToDevice));
// Y axis
windowSize = 1+smoothingRadius[1]*2;
CUDA_SAFE_CALL(cudaMalloc((void **)&window,windowSize*sizeof(float)));
Grid_reg_FillConvolutionWindows = (int)ceil((float)windowSize/(float)Block_reg_FillConvolutionWindows);
dim3 B3(Block_reg_FillConvolutionWindows,1,1);
dim3 G3(Grid_reg_FillConvolutionWindows,1,1);
FillConvolutionWindows_kernel <<< G3, B3 >>> (window, windowSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
#if _VERBOSE
printf("[VERBOSE] FillConvolutionWindows_kernel: %s - Grid size [%i %i %i] - Block size [%i %i %i]\n",
cudaGetErrorString(cudaGetLastError()),G3.x,G3.y,G3.z,B3.x,B3.y,B3.z);
#endif
NormaliseConvolutionWindows_kernel <<< dim3(1,1,1), dim3(1,1,1) >>>(window, windowSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
#if _VERBOSE
printf("[VERBOSE] NormaliseConvolutionWindows_kernel: %s - Grid size [1 0 0] - Block size [1 0 0]\n",
cudaGetErrorString(cudaGetLastError()));
#endif
const unsigned int Grid_reg_ApplyConvolutionWindowAlongY =
(unsigned int)ceil((float)voxelNumber/(float)Block_reg_ApplyConvolutionWindowAlongY);
dim3 B4(Block_reg_ApplyConvolutionWindowAlongY,1,1);
dim3 G4(Grid_reg_ApplyConvolutionWindowAlongY,1,1);
CUDA_SAFE_CALL(cudaBindTexture(0, gradientImageTexture, *voxelNMIGradientArray_d, voxelNumber*sizeof(float4)));
CUDA_SAFE_CALL(cudaBindTexture(0, convolutionWinTexture, window, windowSize*sizeof(float)));
_reg_ApplyConvolutionWindowAlongY_kernel <<< G4, B4 >>> (smoothedImage, windowSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
#if _VERBOSE
printf("[VERBOSE] reg_ApplyConvolutionWindowAlongY_kernel: %s - Grid size [%i %i %i] - Block size [%i %i %i]\n",
cudaGetErrorString(cudaGetLastError()),G4.x,G4.y,G4.z,B4.x,B4.y,B4.z);
#endif
CUDA_SAFE_CALL(cudaFree(window));
CUDA_SAFE_CALL(cudaMemcpy(*voxelNMIGradientArray_d, smoothedImage, voxelNumber*sizeof(float4), cudaMemcpyDeviceToDevice));
// Z axis
windowSize = 1+smoothingRadius[2]*2;
CUDA_SAFE_CALL(cudaMalloc((void **)&window,windowSize*sizeof(float)));
Grid_reg_FillConvolutionWindows = (int)ceil((float)windowSize/(float)Block_reg_FillConvolutionWindows);
dim3 B5(Block_reg_FillConvolutionWindows,1,1);
dim3 G5(Grid_reg_FillConvolutionWindows,1,1);
FillConvolutionWindows_kernel <<< G5, B5 >>> (window, windowSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
#if _VERBOSE
printf("[VERBOSE] FillConvolutionWindows_kernel: %s - Grid size [%i %i %i] - Block size [%i %i %i]\n",
cudaGetErrorString(cudaGetLastError()),G5.x,G5.y,G5.z,B5.x,B5.y,B5.z);
#endif
NormaliseConvolutionWindows_kernel <<< dim3(1,1,1), dim3(1,1,1) >>>(window, windowSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
#if _VERBOSE
printf("[VERBOSE] NormaliseConvolutionWindows_kernel: %s - Grid size [1 0 0] - Block size [1 0 0]\n",
cudaGetErrorString(cudaGetLastError()));
#endif
const unsigned int Grid_reg_ApplyConvolutionWindowAlongZ =
(unsigned int)ceil((float)voxelNumber/(float)Block_reg_ApplyConvolutionWindowAlongZ);
dim3 B6(Block_reg_ApplyConvolutionWindowAlongZ,1,1);
dim3 G6(Grid_reg_ApplyConvolutionWindowAlongZ,1,1);
CUDA_SAFE_CALL(cudaBindTexture(0, gradientImageTexture, *voxelNMIGradientArray_d, voxelNumber*sizeof(float4)));
CUDA_SAFE_CALL(cudaBindTexture(0, convolutionWinTexture, window, windowSize*sizeof(float)));
_reg_ApplyConvolutionWindowAlongZ_kernel <<< G6, B6 >>> (smoothedImage, windowSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
#if _VERBOSE
printf("[VERBOSE] reg_ApplyConvolutionWindowAlongZ_kernel: %s - Grid size [%i %i %i] - Block size [%i %i %i]\n",
cudaGetErrorString(cudaGetLastError()),G6.x,G6.y,G6.z,B6.x,B6.y,B6.z);
#endif
CUDA_SAFE_CALL(cudaFree(window));
CUDA_SAFE_CALL(cudaMemcpy(*voxelNMIGradientArray_d, smoothedImage, voxelNumber*sizeof(float4), cudaMemcpyDeviceToDevice));
CUDA_SAFE_CALL(cudaFree(smoothedImage));
}
#endif
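The body removed above is not lost: reg_smoothImageForCubicSpline_gpu reappears in the last hunk, in the unit that also defines reg_updateControlPointPosition_gpu, with the triplicated X/Y/Z code collapsed into a single loop over the three axes and the convolution window built on the host instead of on the device.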
@@ -28,8 +28,4 @@ void reg_getVoxelBasedNMIGradientUsingPW_gpu( nifti_image *targetImage,
int binning);
extern "C++"
void reg_smoothImageForCubicSpline_gpu( nifti_image *resultImage,
float4 **voxelNMIGradientArray_d,
int *smoothingRadius);
#endif
@@ -23,7 +23,6 @@ texture<float, 1, cudaReadModeElementType> targetImageTexture;
texture<float, 1, cudaReadModeElementType> resultImageTexture;
texture<float4, 1, cudaReadModeElementType> resultImageGradientTexture;
texture<float, 1, cudaReadModeElementType> histogramTexture;
texture<float, 1, cudaReadModeElementType> convolutionWinTexture;
texture<float4, 1, cudaReadModeElementType> gradientImageTexture;
texture<int, 1, cudaReadModeElementType> maskTexture;
@@ -137,133 +136,4 @@ __global__ void reg_getVoxelBasedNMIGradientUsingPW_kernel(float4 *voxelNMIGradi
return;
}
__global__ void FillConvolutionWindows_kernel( float *window,
int windowSize)
{
const int i=blockIdx.x*blockDim.x+threadIdx.x;
if(i<windowSize){
float coeff = (float)((windowSize-1)/2);
coeff = fabs(2.0f*((float)(i)-coeff)/coeff);
if(coeff<1.0f) window[i] = 2.0f/3.0f - coeff*coeff + 0.5f*coeff*coeff*coeff;
else window[i] = -(coeff-2.0f)*(coeff-2.0f)*(coeff-2.0f)/6.0f;
}
}
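For reference, this kernel samples the cubic B-spline on u = 2*|i-radius|/radius, so the taps cover u in [0,2] with B(u) = 2/3 - u^2 + u^3/2 for u < 1 and B(u) = (2-u)^3/6 for 1 <= u <= 2. Radius 2, for instance, yields the taps {0, 1/6, 2/3, 1/6, 0}; NormaliseConvolutionWindows_kernel below rescales the window to unit sum, which matters for larger radii where the samples no longer add up to one.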
__global__ void _reg_ApplyConvolutionWindowAlongX_kernel(float4 *smoothedImage,
int windowSize)
{
const int tid=blockIdx.x*blockDim.x+threadIdx.x;
if(tid < c_VoxelNumber){
int3 imageSize = c_ImageSize;
int temp=tid;
const short z=(int)(temp/(imageSize.x*imageSize.y));
temp -= z*imageSize.x*imageSize.y;
const short y =(int)(temp/(imageSize.x));
short x = temp - y*(imageSize.x);
int radius = (windowSize-1)/2;
int index = tid - radius;
x -= radius;
float4 finalValue = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
for(int i=0; i<windowSize; i++){
if(-1<x && x<imageSize.x){
float4 gradientValue = tex1Dfetch(gradientImageTexture,index);
float windowValue = tex1Dfetch(convolutionWinTexture,i);
finalValue.x += gradientValue.x * windowValue;
finalValue.y += gradientValue.y * windowValue;
finalValue.z += gradientValue.z * windowValue;
}
index++;
x++;
}
smoothedImage[tid] = finalValue;
}
return;
}
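A CPU reference of the pass above, under the same assumptions: voxels are laid out as tid = x + y*nx + z*nx*ny, and samples falling outside the image contribute nothing, which slightly attenuates a normalised kernel at the borders. This is a sketch for checking the kernel, not part of the commit:

#include <cuda_runtime.h> // float4 and make_float4 on the host side

void convolveAlongX_host(const float4 *in, float4 *out, const float *window, int windowSize, int nx, int ny, int nz)
{
int radius = (windowSize-1)/2;
for(int tid=0; tid<nx*ny*nz; tid++){
int x = tid % nx; // coordinate along the smoothed axis
float4 sum = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
for(int i=0; i<windowSize; i++){
int xi = x-radius+i;
if(-1<xi && xi<nx){ // values outside the image are treated as zero
float4 g = in[tid-radius+i];
sum.x += g.x * window[i];
sum.y += g.y * window[i];
sum.z += g.z * window[i];
}
}
out[tid] = sum;
}
}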
__global__ void _reg_ApplyConvolutionWindowAlongY_kernel(float4 *smoothedImage,
int windowSize)
{
const int tid=blockIdx.x*blockDim.x+threadIdx.x;
if(tid < c_VoxelNumber){
int3 imageSize = c_ImageSize;
const short z=(int)(tid/(imageSize.x*imageSize.y));
int index = tid - z*imageSize.x*imageSize.y;
short y=(int)(index/imageSize.x);
int radius = (windowSize-1)/2;
index = tid - imageSize.x*radius;
y -= radius;
float4 finalValue = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
for(int i=0; i<windowSize; i++){
if(-1<y && y<imageSize.y){
float4 gradientValue = tex1Dfetch(gradientImageTexture,index);
float windowValue = tex1Dfetch(convolutionWinTexture,i);
finalValue.x += gradientValue.x * windowValue;
finalValue.y += gradientValue.y * windowValue;
finalValue.z += gradientValue.z * windowValue;
}
index += imageSize.x;
y++;
}
smoothedImage[tid] = finalValue;
}
return;
}
__global__ void _reg_ApplyConvolutionWindowAlongZ_kernel(float4 *smoothedImage,
int windowSize)
{
const int tid=blockIdx.x*blockDim.x+threadIdx.x;
if(tid < c_VoxelNumber){
int3 imageSize = c_ImageSize;
short z=(int)(tid/((imageSize.x)*(imageSize.y)));
int radius = (windowSize-1)/2;
int index = tid - imageSize.x*imageSize.y*radius;
z -= radius;
float4 finalValue = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
for(int i=0; i<windowSize; i++){
if(-1<z && z<imageSize.z){
float4 gradientValue = tex1Dfetch(gradientImageTexture,index);
float windowValue = tex1Dfetch(convolutionWinTexture,i);
finalValue.x += gradientValue.x * windowValue;
finalValue.y += gradientValue.y * windowValue;
finalValue.z += gradientValue.z * windowValue;
}
index += imageSize.x*imageSize.y;
z++;
}
smoothedImage[tid] = finalValue;
}
return;
}
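The Y and Z passes above are the X pass with a different memory stride between neighbouring samples (imageSize.x and imageSize.x*imageSize.y instead of 1) and a bounds check on the corresponding coordinate; only the index arithmetic changes.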
__global__ void NormaliseConvolutionWindows_kernel( float *window,
int windowSize)
{
const int tid=blockIdx.x*blockDim.x+threadIdx.x;
if(tid<1){
float sum=0.0f;
for(int i=0; i<windowSize; i++) sum += window[i];
for(int i=0; i<windowSize; i++) window[i] /= sum;
}
}
#endif
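Both window kernels above are retired by this commit: the replacement code in the last hunk builds and normalises the window on the host and uploads it once, so neither the device-side fill nor the single-thread normalisation pass is needed any more.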
@@ -208,5 +208,183 @@ void reg_updateControlPointPosition_gpu(nifti_image *controlPointImage,
#endif
}
void reg_gaussianSmoothing_gpu( nifti_image *image,
float4 **imageArray_d,
float sigma,
bool smoothXYZ[8])
{
const int voxelNumber = image->nx * image->ny * image->nz;
const int3 imageDim = make_int3(image->nx, image->ny, image->nz);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_ImageDim, &imageDim,sizeof(int3)));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_VoxelNumber, &voxelNumber,sizeof(int)));
bool axisToSmooth[8];
if(smoothXYZ==NULL){
for(int i=0; i<8; i++) axisToSmooth[i]=true;
}
else{
for(int i=0; i<8; i++) axisToSmooth[i]=smoothXYZ[i];
}
for(int n=1; n<4; n++){
if(axisToSmooth[n]==true){
float currentSigma;
if(sigma>0) currentSigma=sigma/image->pixdim[n];
else currentSigma=fabs(sigma); // a negative sigma is interpreted in voxels
int radius=(int)ceil(currentSigma*3.0f);
if(radius>0){
int kernelSize = 1+radius*2;
float *kernel_h;
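// the window is built in page-locked host memory, then uploaded to the device once below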
CUDA_SAFE_CALL(cudaMallocHost((void **)&kernel_h, kernelSize*sizeof(float)));
float kernelSum=0;
for(int i=-radius; i<=radius; i++){
kernel_h[radius+i]=(float)(exp( -(i*i)/(2.0*currentSigma*currentSigma)) / (currentSigma*2.506628274631)); // 2.506... = sqrt(2*pi)
kernelSum += kernel_h[radius+i];
}
for(int i=0; i<kernelSize; i++)
kernel_h[i] /= kernelSum;
float *kernel_d;
CUDA_SAFE_CALL(cudaMalloc((void **)&kernel_d, kernelSize*sizeof(float)));
CUDA_SAFE_CALL(cudaMemcpy(kernel_d, kernel_h, kernelSize*sizeof(float), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaFreeHost(kernel_h));
float4 *smoothedImage;
CUDA_SAFE_CALL(cudaMalloc((void **)&smoothedImage,voxelNumber*sizeof(float4)));
CUDA_SAFE_CALL(cudaBindTexture(0, convolutionKernelTexture, kernel_d, kernelSize*sizeof(float)));
CUDA_SAFE_CALL(cudaBindTexture(0, gradientImageTexture, *imageArray_d, voxelNumber*sizeof(float4)));
unsigned int Grid_reg_ApplyConvolutionWindow;
dim3 B,G;
switch(n){
case 1:
Grid_reg_ApplyConvolutionWindow =
(unsigned int)ceil((float)voxelNumber/(float)Block_reg_ApplyConvolutionWindowAlongX);
B=dim3(Block_reg_ApplyConvolutionWindowAlongX,1,1);
G=dim3(Grid_reg_ApplyConvolutionWindow,1,1);
_reg_ApplyConvolutionWindowAlongX_kernel <<< G, B >>> (smoothedImage, kernelSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
#if _VERBOSE
printf("[VERBOSE] reg_ApplyConvolutionWindowAlongX_kernel: %s - Grid size [%i %i %i] - Block size [%i %i %i]\n",
cudaGetErrorString(cudaGetLastError()),G.x,G.y,G.z,B.x,B.y,B.z);
#endif
break;
case 2:
Grid_reg_ApplyConvolutionWindow =
(unsigned int)ceil((float)voxelNumber/(float)Block_reg_ApplyConvolutionWindowAlongY);
B=dim3(Block_reg_ApplyConvolutionWindowAlongY,1,1);
G=dim3(Grid_reg_ApplyConvolutionWindow,1,1);
_reg_ApplyConvolutionWindowAlongY_kernel <<< G, B >>> (smoothedImage, kernelSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
#if _VERBOSE
printf("[VERBOSE] reg_ApplyConvolutionWindowAlongY_kernel: %s - Grid size [%i %i %i] - Block size [%i %i %i]\n",
cudaGetErrorString(cudaGetLastError()),G.x,G.y,G.z,B.x,B.y,B.z);
#endif
break;
case 3:
Grid_reg_ApplyConvolutionWindow =
(unsigned int)ceil((float)voxelNumber/(float)Block_reg_ApplyConvolutionWindowAlongZ);
B=dim3(Block_reg_ApplyConvolutionWindowAlongZ,1,1);
G=dim3(Grid_reg_ApplyConvolutionWindow,1,1);
_reg_ApplyConvolutionWindowAlongZ_kernel <<< G, B >>> (smoothedImage, kernelSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
#if _VERBOSE
printf("[VERBOSE] reg_ApplyConvolutionWindowAlongZ_kernel: %s - Grid size [%i %i %i] - Block size [%i %i %i]\n",
cudaGetErrorString(cudaGetLastError()),G.x,G.y,G.z,B.x,B.y,B.z);
#endif
break;
}
CUDA_SAFE_CALL(cudaFree(kernel_d));
CUDA_SAFE_CALL(cudaMemcpy(*imageArray_d, smoothedImage, voxelNumber*sizeof(float4), cudaMemcpyDeviceToDevice));
CUDA_SAFE_CALL(cudaFree(smoothedImage));
}
}
}
}
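A hedged usage sketch mirroring the call added in the main registration loop: smooth the node-based gradient on the device along x and y only. Indices 1 to 3 of the mask select the spatial axes, matching the CPU-side reg_gaussianSmoothing, and a NULL mask smooths every axis:

bool smoothXYZ[8] = {false, true, true, false, false, false, false, false};
reg_gaussianSmoothing_gpu( controlPointImage, // provides nx/ny/nz and pixdim
&nodeNMIGradientArray_d, // float4 gradient image on the device
param->gradientSmoothingValue, // sigma in mm (in voxels if negative)
smoothXYZ);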
void reg_smoothImageForCubicSpline_gpu( nifti_image *image,
float4 **imageArray_d,
int *smoothingRadius)
{
const int voxelNumber = image->nx * image->ny * image->nz;
const int3 imageDim = make_int3(image->nx, image->ny, image->nz);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_ImageDim, &imageDim,sizeof(int3)));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_VoxelNumber, &voxelNumber,sizeof(int)));
for(int n=0; n<3; n++){
if(smoothingRadius[n]>0){
int kernelSize = 1+smoothingRadius[n]*2;
float *kernel_h;
CUDA_SAFE_CALL(cudaMallocHost((void **)&kernel_h, kernelSize*sizeof(float)));
float kernelSum=0;
for(int i=-smoothingRadius[n]; i<=smoothingRadius[n]; i++){
float coeff = fabs(2.0f*(float)(i)/smoothingRadius[n]);
if(coeff<1.0f) kernel_h[smoothingRadius[n]+i] = 2.0f/3.0f - coeff*coeff + 0.5f*coeff*coeff*coeff;
else kernel_h[smoothingRadius[n]+i] = -(coeff-2.0f)*(coeff-2.0f)*(coeff-2.0f)/6.0f;
kernelSum += kernel_h[smoothingRadius[n]+i];
}
for(int i=0; i<kernelSize; i++) kernel_h[i] /= kernelSum;
float *kernel_d;
CUDA_SAFE_CALL(cudaMalloc((void **)&kernel_d, kernelSize*sizeof(float)));
CUDA_SAFE_CALL(cudaMemcpy(kernel_d, kernel_h, kernelSize*sizeof(float), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaFreeHost(kernel_h));
CUDA_SAFE_CALL(cudaBindTexture(0, convolutionKernelTexture, kernel_d, kernelSize*sizeof(float)));
float4 *smoothedImage_d;
CUDA_SAFE_CALL(cudaMalloc((void **)&smoothedImage_d,voxelNumber*sizeof(float4)));
CUDA_SAFE_CALL(cudaBindTexture(0, gradientImageTexture, *imageArray_d, voxelNumber*sizeof(float4)));
unsigned int Grid_reg_ApplyConvolutionWindow;
dim3 B,G;
switch(n){
case 0:
Grid_reg_ApplyConvolutionWindow =
(unsigned int)ceil((float)voxelNumber/(float)Block_reg_ApplyConvolutionWindowAlongX);
B=dim3(Block_reg_ApplyConvolutionWindowAlongX,1,1);
G=dim3(Grid_reg_ApplyConvolutionWindow,1,1);
_reg_ApplyConvolutionWindowAlongX_kernel <<< G, B >>> (smoothedImage_d, kernelSize);
CUDA_SAFE_CALL(cudaThreadSynchronize());
#if _VERBOSE
printf("[VERBOSE] reg_ApplyConvolutionWindowAlongX_kernel: %s - Grid size [%i %i %i] - Block size [%i %i %i]\n",
cudaGetErrorString(cudaGetLastError()),G.x,G.y,G.z,B.x,B.y,B.z);