Commit d3ef08fe authored by Pankaj Daga's avatar Pankaj Daga

Change the GPU version so that each thread block processes a whole image block.

parent be0b436e
......@@ -147,16 +147,18 @@ void _reg_set_active_blocks(nifti_image *targetImage, _reg_blockMatchingParam *p
}
for (int i = params->activeBlockNumber; i < totalBlockNumber; ++i){
params->activeBlock[*indexArrayPtr--] = -1;
}
// renumber them to ensure consistency with the GPU version
}
#ifdef _USE_CUDA
count = 0;
for(int i = 0; i < totalBlockNumber; ++i){
if(params->activeBlock[i] != -1){
params->activeBlock[i] = count;
if(params->activeBlock[i] != -1){
params->activeBlock[i] = -1;
params->activeBlock[count] = i;
++count;
}
}
#endif
free(varianceArray);
free(indexArray);
}
......@@ -242,7 +244,6 @@ void real_block_matching_method(nifti_image * target,
unsigned int blockIndex=0;
unsigned int activeBlockIndex=0;
int index;
for(int k=0; k<params->blockNumber[2]; k++){
......@@ -345,9 +346,7 @@ void real_block_matching_method(nifti_image * target,
}
}
if(voxelNumber>BLOCK_SIZE/2){
targetMean /= voxelNumber;
resultMean /= voxelNumber;
......@@ -367,7 +366,7 @@ void real_block_matching_method(nifti_image * target,
localCC = fabs(localCC/sqrt(targetVar*resultVar));
if(localCC>bestCC){
if(localCC>bestCC){
bestCC=localCC;
bestDisplacement[0] = (float)l;
bestDisplacement[1] = (float)m;
......@@ -405,7 +404,7 @@ void real_block_matching_method(nifti_image * target,
free(resultValues);
free(targetValues);
free(targetOverlap);
free(resultOverlap);
free(resultOverlap);
}
// Called internally to determine the parameter type
......@@ -631,12 +630,12 @@ void estimate_affine_transformation(std::vector<_reg_sorted_point> & points,
{
w[k] = 0.0f;
}
// Now we can compute our svd
// Now we can compute our svd
svd(A, num_equations, 12, w, v);
// First we make sure that the really small singular values
// are set to 0. and compute the inverse by taking the reciprocal
// of the entries
// First we make sure that the really small singular values
// are set to 0. and compute the inverse by taking the reciprocal
// of the entries
for (unsigned k = 0; k < 12; ++k)
{
if (w[k] < 0.0001)
......@@ -1350,5 +1349,5 @@ void svd(float ** in, int m, int n, float * w, float ** v)
w[k-1] = x;
}
}
free (rv1);
free (rv1);
}
......@@ -23,6 +23,9 @@
#define BLOCK_SIZE 64
#define OVERLAP_SIZE 3
#define STEP_SIZE 1
#define NUM_BLOCKS_TO_COMPARE 216 // We compare each target block against a 6x6x6 neighborhood of candidate positions.
#define NUM_BLOCKS_TO_COMPARE_2D 36
#define NUM_BLOCKS_TO_COMPARE_1D 6
/**
*
......
......@@ -47,33 +47,42 @@ void block_matching_method_gpu( nifti_image *targetImage,
float4 t_m_c_h = make_float4(xyz_mat->m[2][0],xyz_mat->m[2][1],xyz_mat->m[2][2],xyz_mat->m[2][3]);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(t_m_a, &t_m_a_h,sizeof(float4)));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(t_m_b, &t_m_b_h,sizeof(float4)));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(t_m_c, &t_m_c_h,sizeof(float4)));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(t_m_c, &t_m_c_h,sizeof(float4)));
// We need to allocate some memory to keep track of overlap areas and values for blocks
unsigned memSize = 64 * params->activeBlockNumber;
float * targetValues;CUDA_SAFE_CALL(cudaMalloc((void **)&targetValues, memSize * sizeof(float)));
float * resultValues;CUDA_SAFE_CALL(cudaMalloc((void **)&resultValues, memSize * sizeof(float)));
CUDA_SAFE_CALL(cudaMemset(targetValues, 0, memSize * sizeof(float)));
CUDA_SAFE_CALL(cudaMemset(resultValues, 0, memSize * sizeof(float)));
unsigned memSize = BLOCK_SIZE * params->activeBlockNumber;
float * targetValues;CUDA_SAFE_CALL(cudaMalloc((void **)&targetValues, memSize * sizeof(float)));
memSize = BLOCK_SIZE * params->activeBlockNumber;
float * resultValues;CUDA_SAFE_CALL(cudaMalloc((void **)&resultValues, memSize * sizeof(float)));
unsigned int Grid_block_matching = (unsigned int)ceil((float)params->activeBlockNumber/(float)Block_target_block);
unsigned int Grid_block_matching_2 = 1;
// We have hit the limit in one dimension
if (Grid_block_matching > 65335) {
Grid_block_matching_2 = (unsigned int)ceil((float)Grid_block_matching/65535.0f);
Grid_block_matching = 65335;
}
const unsigned int Grid_block_matching = (unsigned int)ceil((float)numBlocks/(float)Block_target_block);
dim3 B1(Block_target_block,1,1);
dim3 G1(Grid_block_matching,1,1);
dim3 G1(Grid_block_matching,Grid_block_matching_2,1);
// process the target blocks
process_target_blocks_gpu<<<G1, B1>>>( *targetPosition_d,
targetValues);
// Ensure that all the threads have done their job
CUDA_SAFE_CALL(cudaThreadSynchronize());
CUDA_SAFE_CALL(cudaThreadSynchronize());
unsigned int Result_block_matching = params->activeBlockNumber;
unsigned int Result_block_matching_2 = 1;
const unsigned int Result_block_matching = (unsigned int)ceil((float)numBlocks/(float)Block_result_block);
dim3 B2(Block_result_block,1,1);
dim3 G2(Result_block_matching,1,1);
// We have hit the limit in one dimension
if (Result_block_matching > 65335) {
Result_block_matching_2 = (unsigned int)ceil((float)Result_block_matching/65535.0f);
Result_block_matching = 65335;
}
process_result_blocks_gpu<<<G2, B2>>>( *targetPosition_d,
*resultPosition_d,
targetValues,
resultValues);
dim3 B2(Block_result_block,1,1);
dim3 G2(Result_block_matching,Result_block_matching_2,1);
process_result_blocks_gpu<<<G2, B2>>>(*resultPosition_d, targetValues);
// Ensure that all the threads have done their job
CUDA_SAFE_CALL(cudaThreadSynchronize());
......@@ -100,9 +109,9 @@ void optimize_gpu( _reg_blockMatchingParam *blockMatchingParams,
// device to host copy
int memSize = blockMatchingParams->activeBlockNumber * 3 * sizeof(float);
CUDA_SAFE_CALL(cudaMemcpy(blockMatchingParams->targetPosition, *targetPosition_d, memSize, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL(cudaMemcpy(blockMatchingParams->resultPosition, *resultPosition_d, memSize, cudaMemcpyDeviceToHost));
// Cheat and call the CPU version.
optimize(blockMatchingParams, updateAffineMatrix, affine);
CUDA_SAFE_CALL(cudaMemcpy(blockMatchingParams->resultPosition, *resultPosition_d, memSize, cudaMemcpyDeviceToHost));
// Cheat and call the CPU version.
optimize(blockMatchingParams, updateAffineMatrix, affine);
}
#endif
......@@ -37,7 +37,7 @@ void optimize_gpu( _reg_blockMatchingParam *blockMatchingParams,
mat44 *updateAffineMatrix,
float **targetPosition_d,
float **resultPosition_d,
bool affine = true);
bool affine = true);
#endif
......@@ -38,7 +38,7 @@ texture<float, 1, cudaReadModeElementType> resultImageArray_texture;
texture<int, 1, cudaReadModeElementType> activeBlock_texture;
// Apply the transformation matrix
__device__ void apply_affine(const float3 &pt, float * result)
__device__ inline void apply_affine(const float4 &pt, float * result)
{
float4 mat = t_m_a;
result[0] = (mat.x * pt.x) + (mat.y*pt.y) + (mat.z*pt.z) + (mat.w);
......@@ -51,20 +51,19 @@ __device__ void apply_affine(const float3 &pt, float * result)
// CUDA kernel to process the target values
__global__ void process_target_blocks_gpu(float *targetPosition_d,
float *targetValues)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
{
const int tid = (blockIdx.x * blockDim.x + threadIdx.x) + (blockIdx.y * gridDim.x);
const int3 bDim = c_BlockDim;
if (tid < bDim.x * bDim.y * bDim.z){
const int currentBlockIndex = tex1Dfetch(activeBlock_texture,tid);
const int currentBlockIndex = tex1Dfetch(activeBlock_texture,tid);
if (currentBlockIndex >= 0){
// Get the corresponding (i, j, k) indices
int tempIndex = tid;
// Get the corresponding (i, j, k) indices
int tempIndex = currentBlockIndex;
const int k =(int)(tempIndex/(bDim.x * bDim.y));
tempIndex -= k * bDim.x * bDim.y;
const int j =(int)(tempIndex/(bDim.x));
const int i = tempIndex - j * (bDim.x);
const int offset = currentBlockIndex * BLOCK_SIZE;
const int offset = tid * BLOCK_SIZE;
const int targetIndex_start_x = i * BLOCK_WIDTH;
const int targetIndex_start_y = j * BLOCK_WIDTH;
const int targetIndex_start_z = k * BLOCK_WIDTH;
......@@ -113,178 +112,189 @@ __global__ void process_target_blocks_gpu(float *targetPosition_d,
index += rampY[threadIdx.x] * BLOCK_WIDTH;
}
float3 targetPosition;
float4 targetPosition;
targetPosition.x = i * BLOCK_WIDTH;
targetPosition.y = j * BLOCK_WIDTH;
targetPosition.z = k * BLOCK_WIDTH;
apply_affine(targetPosition, &(targetPosition_d[currentBlockIndex * 3]));
targetPosition.z = k * BLOCK_WIDTH;
apply_affine(targetPosition, &(targetPosition_d[tid * 3]));
}
}
}
// CUDA kernel to process the result blocks
__global__ void process_result_blocks_gpu(float *targetPosition_d,
float *resultPosition_d,
float *targetValues,
float *resultValues)
{
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
// CUDA kernel to process the result blocks
__global__ void process_result_blocks_gpu(float *resultPosition_d,
float *targetValues)
{
const int tid = (blockIdx.x * blockDim.x + threadIdx.x) + (blockIdx.y * gridDim.x);
const int3 bDim = c_BlockDim;
if (tid < bDim.x * bDim.y * bDim.z){
const int activeBlockIndex = tex1Dfetch(activeBlock_texture,tid);
if (activeBlockIndex >= 0){
int tempIndex = tid;
const int k =(int)(tempIndex/(bDim.x * bDim.y));
tempIndex -= k * bDim.x * bDim.y;
const int j =(int)(tempIndex/(bDim.x));
const int i = tempIndex - j * (bDim.x);
const int offset = activeBlockIndex * BLOCK_SIZE;
const int targetIndex_start_x = i * BLOCK_WIDTH;
const int targetIndex_start_y = j * BLOCK_WIDTH;
const int targetIndex_start_z = k * BLOCK_WIDTH;
unsigned int index = 0;
const int ctid = (int)(tid / NUM_BLOCKS_TO_COMPARE);
__shared__ float4 localCC [NUM_BLOCKS_TO_COMPARE];
localCC[tid % NUM_BLOCKS_TO_COMPARE] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
__shared__ int updateThreadID;
updateThreadID = -1;
if (ctid < bDim.x * bDim.y * bDim.z) {
const int activeBlockIndex = tex1Dfetch(activeBlock_texture, ctid);
//int tempIndex = ctid;
int tempIndex = activeBlockIndex;
const int k =(int)(tempIndex/(bDim.x * bDim.y));
tempIndex -= k * bDim.x * bDim.y;
const int j =(int)(tempIndex/(bDim.x));
const int i = tempIndex - j * (bDim.x);
const int targetIndex_start_x = i * BLOCK_WIDTH;
const int targetIndex_start_y = j * BLOCK_WIDTH;
const int targetIndex_start_z = k * BLOCK_WIDTH;
if (activeBlockIndex >= 0) {
//const int block_offset = activeBlockIndex * BLOCK_SIZE;
const int block_offset = ctid * BLOCK_SIZE;
const int3 imageSize = c_ImageSize;
tempIndex = tid % NUM_BLOCKS_TO_COMPARE;
int n = (int)tempIndex /NUM_BLOCKS_TO_COMPARE_2D;
tempIndex -= n * NUM_BLOCKS_TO_COMPARE_2D;
int m = (int)tempIndex /NUM_BLOCKS_TO_COMPARE_1D;
int l = tempIndex - m * NUM_BLOCKS_TO_COMPARE_1D;
n -= OVERLAP_SIZE;
m -= OVERLAP_SIZE;
l -= OVERLAP_SIZE;
__shared__ int rampZ[Block_result_block];
__shared__ int rampYLeft[Block_result_block];
__shared__ int rampYRight[Block_result_block];
__shared__ int rampXLeft[Block_result_block];
__shared__ int rampXRight[Block_result_block];
tempIndex = tid % NUM_BLOCKS_TO_COMPARE;
__shared__ int n[Block_result_block];
__shared__ int m[Block_result_block];
__shared__ int l[Block_result_block];
float bestCC = 0.0f;
float3 bestDisplacement = make_float3(0.0f, 0.0f, 0.0f);
int resultIndex_start_z = targetIndex_start_z + n;
int resultIndex_end_z = resultIndex_start_z + BLOCK_WIDTH;
for(n[threadIdx.x] = -OVERLAP_SIZE; n[threadIdx.x] < OVERLAP_SIZE; n[threadIdx.x]+=STEP_SIZE){
int resultIndex_start_z = targetIndex_start_z + n[threadIdx.x];
int resultIndex_end_z = resultIndex_start_z + BLOCK_WIDTH;
rampZ[threadIdx.x] = 0;
if (resultIndex_start_z < 0){
rampZ[threadIdx.x] = -resultIndex_start_z;
resultIndex_start_z = 0;
}
if (resultIndex_end_z > imageSize.z){
resultIndex_end_z = imageSize.z;
}
for(m[threadIdx.x] = -OVERLAP_SIZE; m[threadIdx.x] < OVERLAP_SIZE; m[threadIdx.x]+=STEP_SIZE){
int resultIndex_start_y = targetIndex_start_y + m[threadIdx.x];
int resultIndex_end_y = resultIndex_start_y + BLOCK_WIDTH;
rampYLeft[threadIdx.x] = rampYRight[threadIdx.x] = 0;
if (resultIndex_start_y < 0) {
rampYLeft[threadIdx.x] = -resultIndex_start_y;
resultIndex_start_y = 0;
}
if (resultIndex_end_y > imageSize.y) {
rampYRight[threadIdx.x] = resultIndex_end_y - imageSize.y;
resultIndex_end_y = imageSize.y;
}
int rampZ = 0;
if (resultIndex_start_z < 0){
rampZ = -resultIndex_start_z;
resultIndex_start_z = 0;
}
if (resultIndex_end_z > imageSize.z){
resultIndex_end_z = imageSize.z;
}
for(l[threadIdx.x] = -OVERLAP_SIZE; l[threadIdx.x]< OVERLAP_SIZE; l[threadIdx.x]+=STEP_SIZE){
int resultIndex_start_x = targetIndex_start_x + l[threadIdx.x];
int resultIndex_end_x = resultIndex_start_x + BLOCK_WIDTH;
rampXLeft[threadIdx.x] = rampXRight[threadIdx.x] = 0;
if (resultIndex_start_x < 0) {
rampXLeft[threadIdx.x] = -resultIndex_start_x;
resultIndex_start_x = 0;
}
if (resultIndex_end_x > imageSize.x) {
rampXRight[threadIdx.x] = resultIndex_end_x - imageSize.x;
resultIndex_end_x = imageSize.x;
}
int resultIndex_start_y = targetIndex_start_y + m;
int resultIndex_end_y = resultIndex_start_y + BLOCK_WIDTH;
int rampYLeft = 0;
int rampYRight = 0;
if (resultIndex_start_y < 0) {
rampYLeft = -resultIndex_start_y;
resultIndex_start_y = 0;
}
for (int count = 0; count < BLOCK_SIZE; ++count)
{
resultValues[count + offset] = 0.0f;
}
if (resultIndex_end_y > imageSize.y) {
rampYRight = resultIndex_end_y - imageSize.y;
resultIndex_end_y = imageSize.y;
}
index = 0;
index += rampZ[threadIdx.x] * BLOCK_WIDTH * BLOCK_WIDTH;
for(int z = resultIndex_start_z; z< resultIndex_end_z; ++z){
int indexZ = z * imageSize.y * imageSize.x;
index += rampYLeft[threadIdx.x] * BLOCK_WIDTH;
for(int y = resultIndex_start_y; y < resultIndex_end_y; ++y){
int indexXYZ = indexZ + y * imageSize.x + resultIndex_start_x;
index += rampXLeft[threadIdx.x];
for(int x = resultIndex_start_x; x < resultIndex_end_x; ++x){
resultValues[offset+index] = tex1Dfetch(resultImageArray_texture, indexXYZ);
indexXYZ++;
index++;
}
index += rampXRight[threadIdx.x];
}
index += rampYRight[threadIdx.x] * BLOCK_WIDTH;
}
// Do the cross-correlation computation
float targetMean=0.0f;
float resultMean=0.0f;
float voxelNumber=0.0f;
int resultIndex_start_x = targetIndex_start_x + l;
int resultIndex_end_x = resultIndex_start_x + BLOCK_WIDTH;
int rampXLeft = 0;
int rampXRight = 0;
if (resultIndex_start_x < 0) {
rampXLeft = -resultIndex_start_x;
resultIndex_start_x = 0;
}
if (resultIndex_end_x > imageSize.x) {
rampXRight = resultIndex_end_x - imageSize.x;
resultIndex_end_x = imageSize.x;
}
float target_mean = 0.0f;
float result_mean = 0.0f;
float voxel_number = 0.0f;
float result_var = 0.0f;
float target_var = 0.0f;
float target_temp = 0.0f;
float result_temp = 0.0f;
localCC[tempIndex].w = 0.0f;
for(int a = 0; a < BLOCK_SIZE; ++a){
float tv = targetValues[a + offset];
float rv = resultValues[a + offset];
if (tv != 0.0f && rv != 0.0f) {
targetMean += tv;
resultMean += rv;
voxelNumber++;
}
unsigned int index = rampZ * BLOCK_WIDTH * BLOCK_WIDTH;
for(int z = resultIndex_start_z; z< resultIndex_end_z; ++z){
int indexZ = z * imageSize.y * imageSize.x;
index += rampYLeft * BLOCK_WIDTH;
for(int y = resultIndex_start_y; y < resultIndex_end_y; ++y){
int indexXYZ = indexZ + y * imageSize.x + resultIndex_start_x;
index += rampXLeft;
for(int x = resultIndex_start_x; x < resultIndex_end_x; ++x){
float current_value = tex1Dfetch(resultImageArray_texture, indexXYZ);
float current_target_value = targetValues[block_offset + index];
if (current_value != 0.0f && current_target_value != 0.0f) {
result_mean += current_value;
target_mean += current_target_value;
++voxel_number;
}
indexXYZ++;
index++;
}
index += rampXRight;
}
index += rampYRight * BLOCK_WIDTH;
}
float targetVar=0.0f;
float resultVar=0.0f;
float localCC = 0.0f;
float targetTemp=0.0f;
float resultTemp=0.0f;
if(voxelNumber > 0.0f){
targetMean /= voxelNumber;
resultMean /= voxelNumber;
for(int a = 0; a < BLOCK_SIZE; ++a)
{
float tv = targetValues[a + offset];
float rv = resultValues[a + offset];
if (tv != 0.0f && rv != 0.0f)
{
targetTemp = tv-targetMean;
resultTemp = rv-resultMean;
targetVar += (targetTemp)*(targetTemp);
resultVar += (resultTemp)*(resultTemp);
localCC += (targetTemp)*(resultTemp);
}
}
targetVar = sqrt(targetVar/voxelNumber);
resultVar = sqrt(resultVar/voxelNumber);
if (targetVar > 0.0f && resultVar > 0.0f)
localCC = fabsf(localCC/
(voxelNumber*targetVar*resultVar));
if (voxel_number > 0.0f) {
result_mean /= voxel_number;
target_mean /= voxel_number;
}
if(localCC > bestCC){
bestCC = localCC;
bestDisplacement.x=l[threadIdx.x];
bestDisplacement.y=m[threadIdx.x];
bestDisplacement.z=n[threadIdx.x];
}
}
index = rampZ * BLOCK_WIDTH * BLOCK_WIDTH;
for(int z = resultIndex_start_z; z< resultIndex_end_z; ++z){
int indexZ = z * imageSize.y * imageSize.x;
index += rampYLeft * BLOCK_WIDTH;
for(int y = resultIndex_start_y; y < resultIndex_end_y; ++y){
int indexXYZ = indexZ + y * imageSize.x + resultIndex_start_x;
index += rampXLeft;
for(int x = resultIndex_start_x; x < resultIndex_end_x; ++x){
float current_value = tex1Dfetch(resultImageArray_texture, indexXYZ);
float current_target_value = targetValues[block_offset + index];
if (current_value != 0.0f && current_target_value != 0.0f) {
target_temp = (current_target_value - target_mean);
result_temp = (current_value - result_mean);
result_var += result_temp * result_temp;
target_var += target_temp * target_temp;
localCC[tempIndex].w += target_temp * result_temp;
}
indexXYZ++;
index++;
}
index += rampXRight;
}
index += rampYRight * BLOCK_WIDTH;
}
bestDisplacement.x += i * BLOCK_WIDTH;
bestDisplacement.y += j * BLOCK_WIDTH;
bestDisplacement.z += k * BLOCK_WIDTH;
apply_affine(bestDisplacement, &(resultPosition_d[activeBlockIndex * 3]));
localCC[tempIndex].x = l;
localCC[tempIndex].y = m;
localCC[tempIndex].z = n;
if (voxel_number > 0.0f) {
target_var = sqrt(target_var/voxel_number);
result_var = sqrt(result_var/voxel_number);
if (target_var > 0.0f && result_var > 0.0f)
localCC[tempIndex].w = fabsf(localCC[tempIndex].w/
(voxel_number*target_var*result_var));
}
// Just take ownership of updating the final value
if (updateThreadID == -1)
updateThreadID = tid;
}
__syncthreads();
// Just let one thread do the final update
if (updateThreadID > -1) {
float4 bestCC = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
for (unsigned i = 0; i < NUM_BLOCKS_TO_COMPARE; ++i) {
if (localCC[i].w > bestCC.w) {
bestCC.x = localCC[i].x;
bestCC.y = localCC[i].y;
bestCC.z = localCC[i].z;
bestCC.w = localCC[i].w;
}
}
bestCC.x += targetIndex_start_x;
bestCC.y += targetIndex_start_y;
bestCC.z += targetIndex_start_z;
apply_affine(bestCC, &(resultPosition_d[ctid * 3]));
}
}
}
#endif
......@@ -45,7 +45,7 @@
#define Block_reg_bspline_storeApproxBendingEnergy 192 // 39 regs - 025% occupancy
#define Block_reg_bspline_getApproxBendingEnergyGradient 384 // 19 regs - 050% occupancy
#define Block_target_block 512 // 26 regs - 100% occupancy
#define Block_result_block 192 // 34 regs - 25% occupancy
#define Block_result_block 216 // 31 regs - 25% occupancy
#endif
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment