Hi, I'm porting the CUDA marching cubes algorithm to C++ AMP.
There are some performance problems, so it seems that converting the code 1:1 is not appropriate.
The measured performance of marching cubes on one volume (128*128*128 in size) is 2 ms with CUDA and 340 ms with AMP.
I have attached the source code for one phase, voxel classification, in both the AMP and CUDA versions.
The time taken by this phase is 20 ms with AMP and close to 0 ms (maybe?) with CUDA.
Of course, I believe it can be optimized, but I don't know how to do it.
I guess the reason for the slowdown is the access to the array_view.
The CUDA version uses cudaBindTexture to access the volume data.
In AMP, I couldn't find a similar facility, so I just used an array_view.
By testing, I figured out that accessing d_volume (the array_view holding the volume data) causes the slowdown.
This is the AMP code.
// Converts the 3D grid point p into a linear element offset.
// The layout implied by the strides is: rows of (gridSize.x + 1) elements and
// slices of (gridSize.x + 1) * (gridSize.y + 1) elements. No clamping of p is
// performed here, so the caller is responsible for passing an in-range point.
inline unsigned int sampleVolumeIndex(const uint3& p, const uint3& gridSize) restrict(amp)
{
    const unsigned int rowLength = gridSize.x + 1;
    const unsigned int sliceSize = rowLength * (gridSize.y + 1);
    return (p.z * sliceSize) + (p.y * rowLength) + p.x;
}
// launch_classifyVoxel
// Runs one thread per element of d_voxelVerts (tiled <1, 1, TILE_SIZE>). Each
// thread reads the eight corner samples of its voxel from d_volume, compares
// them with ref_iso_value to build an 8-bit case index, looks up the vertex
// count for that case in d_numVertsTable, and stores the count together with
// an "occupied" flag.
parallel_for_each(d_voxelVerts.extent.tile<1, 1, TILE_SIZE>(), [=] (tiled_index<1, 1, TILE_SIZE> t_idx) restrict(amp)
{
    // Linear voxel number built from the two innermost global indices only;
    // global[0] does not contribute to it.
    unsigned int voxel = (t_idx.global[1] * d_voxelVerts.extent[2]) + t_idx.global[2];
    uint3 base = calcGridPos(voxel, gridsizeShift, gridsizeMask);

    // Field values at the eight corners of the voxel, in the same corner
    // order that the case-index bits below use.
    float corner[8];
    corner[0] = d_volume[sampleVolumeIndex(base, gridsize)];
    corner[1] = d_volume[sampleVolumeIndex(base + uint3(1, 0, 0), gridsize)];
    corner[2] = d_volume[sampleVolumeIndex(base + uint3(1, 1, 0), gridsize)];
    corner[3] = d_volume[sampleVolumeIndex(base + uint3(0, 1, 0), gridsize)];
    corner[4] = d_volume[sampleVolumeIndex(base + uint3(0, 0, 1), gridsize)];
    corner[5] = d_volume[sampleVolumeIndex(base + uint3(1, 0, 1), gridsize)];
    corner[6] = d_volume[sampleVolumeIndex(base + uint3(1, 1, 1), gridsize)];
    corner[7] = d_volume[sampleVolumeIndex(base + uint3(0, 1, 1), gridsize)];

    // Bit k of the case index is set when corner k lies below the iso value.
    unsigned int caseIndex = 0;
    caseIndex |= static_cast<unsigned int>(corner[0] < ref_iso_value);
    caseIndex |= static_cast<unsigned int>(corner[1] < ref_iso_value) << 1;
    caseIndex |= static_cast<unsigned int>(corner[2] < ref_iso_value) << 2;
    caseIndex |= static_cast<unsigned int>(corner[3] < ref_iso_value) << 3;
    caseIndex |= static_cast<unsigned int>(corner[4] < ref_iso_value) << 4;
    caseIndex |= static_cast<unsigned int>(corner[5] < ref_iso_value) << 5;
    caseIndex |= static_cast<unsigned int>(corner[6] < ref_iso_value) << 6;
    caseIndex |= static_cast<unsigned int>(corner[7] < ref_iso_value) << 7;

    // Number of vertices generated for this case.
    unsigned int vertexCount = d_numVertsTable[caseIndex];

    // Threads whose voxel number is past the end produce no output.
    if (voxel >= numVoxels) {
        return;
    }
    d_voxelVerts[t_idx.global] = vertexCount;
    d_voxelOccupied[t_idx.global] = (vertexCount > 0);
});
}
This is the CUDA code.
// Sample the volume data set at grid point p.
//
// Each coordinate of p is first clamped to at most gridSize - 1, so the "+1"
// neighbour lookups issued for voxels on the upper faces of the grid re-read
// the edge sample instead of indexing past the end of the volume.
//
// The clamped point is flattened with x as the fastest-varying axis
// (index = z * sizeX * sizeY + y * sizeX + x) and the value is fetched through
// the volumeTex texture reference with tex1Dfetch.
//
// data     - not used by the active code path; only the commented-out
//            alternative below reads it directly (presumably the same volume
//            that is bound to volumeTex -- confirm at the call site).
// p        - integer grid coordinates of the sample.
// gridSize - number of samples along each axis.
// returns  - the field value fetched from volumeTex at the flattened index.
__device__
float sampleVolume(uchar *data, uint3 p, uint3 gridSize)
{
    p.x = min(p.x, gridSize.x - 1);
    p.y = min(p.y, gridSize.y - 1);
    p.z = min(p.z, gridSize.z - 1);
    // Parentheses balanced: slice offset + row offset + column.
    uint i = (p.z*gridSize.x*gridSize.y) + (p.y*gridSize.x) + p.x;
    //return (float) data[i] / 255.0f;
    return tex1Dfetch(volumeTex, i);
}
// Classify voxels: for each voxel, compare the field value at its eight
// corners with isoValue, look up how many vertices that corner configuration
// produces, and record the count plus an "occupied" flag.
//
// Launch layout: 1D blocks arranged in a 2D grid; one thread per voxel. The
// thread index is flattened from (blockIdx.y, blockIdx.x, threadIdx.x).
//
// voxelVerts    - output, vertex count per voxel.
// voxelOccupied - output, 1 if the voxel produces any vertices, else 0.
// volume        - forwarded to sampleVolume.
// gridSize, gridSizeShift, gridSizeMask - grid dimensions and the shift/mask
//                 values handed to calcGridPos to decode the voxel index.
// numVoxels     - number of voxels; threads at or beyond it write nothing.
// voxelSize     - not used by this kernel.
// isoValue      - threshold; a corner contributes its bit when its field
//                 value is below this.
__global__ void
classifyVoxel(uint* voxelVerts, uint *voxelOccupied, uchar *volume, uint3 gridSize, uint3 gridSizeShift, uint3 gridSizeMask, uint numVoxels, float3 voxelSize, float isoValue)
{
    // Flatten the 2D grid of 1D blocks into a single linear thread index.
    uint linearBlock = __mul24(blockIdx.y, gridDim.x) + blockIdx.x;
    uint voxel = __mul24(linearBlock, blockDim.x) + threadIdx.x;

    uint3 base = calcGridPos(voxel, gridSizeShift, gridSizeMask);

    // Field values at the eight neighbouring grid vertices, in the same corner
    // order that the case-index bits below use.
    float corner[8];
    corner[0] = sampleVolume(volume, base, gridSize);
    corner[1] = sampleVolume(volume, base + make_uint3(1, 0, 0), gridSize);
    corner[2] = sampleVolume(volume, base + make_uint3(1, 1, 0), gridSize);
    corner[3] = sampleVolume(volume, base + make_uint3(0, 1, 0), gridSize);
    corner[4] = sampleVolume(volume, base + make_uint3(0, 0, 1), gridSize);
    corner[5] = sampleVolume(volume, base + make_uint3(1, 0, 1), gridSize);
    corner[6] = sampleVolume(volume, base + make_uint3(1, 1, 1), gridSize);
    corner[7] = sampleVolume(volume, base + make_uint3(0, 1, 1), gridSize);

    // Bit k of the case index is set when corner k lies below the iso value.
    uint caseIndex = 0;
    #pragma unroll
    for (int bit = 0; bit < 8; ++bit) {
        caseIndex |= uint(corner[bit] < isoValue) << bit;
    }

    // Number of vertices for this case, read from the numVertsTex texture.
    uint vertexCount = tex1Dfetch(numVertsTex, caseIndex);

    // Threads past the last voxel produce no output.
    if (voxel >= numVoxels) {
        return;
    }
    voxelVerts[voxel] = vertexCount;
    voxelOccupied[voxel] = (vertexCount > 0);
}