
Do CUDA warp vote functions make the code slower?

I have written a CUDA kernel that computes the convex hull (convex envelope) of a set of points in 2D, but it is much slower than the CPU code!

I am using warp vote functions and __syncthreads() quite a number of times. Does that make the code slower?

Thanks

Adding the code:

__global__ void find_edges_on_device(TYPE * h_x, TYPE * h_y, int *h_edges){
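// Each thread examines one candidate hull edge (i, j): i comes from the global
// x index and j from the global y index of the 2D launch grid.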

int tidX = threadIdx.x;
int tidY = threadIdx.y;
int tid = tidY*blockSizeX + tidX;
int i = threadIdx.x+blockIdx.x*blockDim.x;
int j = threadIdx.y+blockIdx.y*blockDim.y;

int hxi = h_x[i];
int hxj = h_x[j];
int hyi = h_y[i];
int hyj = h_y[j];

long scalarProduct = 0;
TYPE nx;
TYPE ny;

bool isValid = true;

__shared__ int shared_X[blockSizeX*blockSizeY];
__shared__ int shared_Y[blockSizeX*blockSizeY];
__shared__ bool iswarpvalid[32];
__shared__ bool isBlockValid;
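// iswarpvalid / isBlockValid let whole warps and blocks stop scanning early
// once the warp votes show that no thread can still have a valid edge.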

if (tid==0)
{
    isBlockValid=true;
}
if (tid<(blockSizeX*blockSizeY-1)/32+1)
{
    iswarpvalid[tid]=true;
}
else if (tid<32)
{
    iswarpvalid[tid]=false;
}

//all the others points should be on the same side of the edge i,j
//normal to the edge (unnormalized)
nx = - ( hyj- hyi);
ny = hxj- hxi;
int k=0;
while ((k==i)||(k==j))
{
    k++;
} //k will be 0, 1, or 2, but different from i and j, to avoid picking an endpoint of the edge
scalarProduct=nx* (h_x[k]-hxi)+ny* (h_y[k]-hyi);
if (scalarProduct<0)
{
    nx*=-1;
    ny*=-1;
}

for(int count = 0; count < (NPOINTS/(blockSizeX*blockSizeY)) + 1; count++ ){
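    // Stage the next tile of blockSizeX*blockSizeY points into shared memory;
    // slots past the end of the input are padded with -1 and skipped later.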

    int globalIndex = tidY*blockSizeX + tidX + count*blockSizeX*blockSizeY;

    if (NPOINTS <= globalIndex){
        shared_X[tidY*blockSizeX + tidX] = -1;
        shared_Y[tidY*blockSizeX + tidX] = -1;
    }
    else {
        shared_X[tidY*blockSizeX + tidX]= h_x[globalIndex];
        shared_Y[tidY*blockSizeX + tidX]= h_y[globalIndex];
    }
    __syncthreads();

    //we have now at least one point with scalarProduct>0
    //all the other points should comply with the same condition for
    //the edge to be valid
    //loop on all the points 

    if(i < j){
        for (int k=0; k < blockSizeX*blockSizeY; k++)
        {   
            if((count * blockSizeX*blockSizeY + k < NPOINTS )&&(isValid)) {
                scalarProduct=nx* (shared_X[k]-hxi)+ny* (shared_Y[k]-hyi);
                if(__all(scalarProduct < 0)){   // point k invalidates every edge handled by this warp
                    iswarpvalid[(tidY*blockSizeX + tidX)/32] = false;
                    break;
                }
                else if(0 > (scalarProduct) ){
                    isValid = false;
                    break;
                }
            }
        }
    }

    __syncthreads();
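    // The first warp reduces the per-warp flags: if no warp can still produce
    // a valid edge, the whole block stops scanning further tiles.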
    if (tid<32)
    {
        isBlockValid=__any(iswarpvalid[tid]);
    }
    __syncthreads();
    if(!isBlockValid) break;
}

if ((i<j) && (true == isValid )){
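            // Record j as a neighbour of i and i as a neighbour of j: atomicCAS claims
            // the first slot if it still holds -1 (h_edges is assumed to be initialised
            // to -1 on the host), otherwise the second slot is filled.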
            int tmp_i = i;
            int tmp_j = j;

            if( -1 != atomicCAS(&h_edges[2*i], -1, tmp_j) )
                h_edges[2*i+1]=j;

            if( -1 != atomicCAS(&h_edges[2*j], -1, tmp_i) )
                h_edges[2*j+1]=i;

}
}
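For reference, a minimal host-side launch/timing sketch (hypothetical; it assumes TYPE, NPOINTS, blockSizeX and blockSizeY are defined in the same .cu file as the kernel above, that NPOINTS is a multiple of the block dimensions since the kernel does not bounds-check i and j, and that h_edges is pre-filled with -1 as the atomicCAS calls expect):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main()
{
    // Host copies of the point set and the edge list (-1 marks an empty slot).
    static TYPE x[NPOINTS], y[NPOINTS];
    static int  edges[2 * NPOINTS];
    for (int p = 0; p < NPOINTS; ++p) {
        x[p] = rand() % 1000;
        y[p] = rand() % 1000;
        edges[2 * p] = edges[2 * p + 1] = -1;
    }

    TYPE *d_x, *d_y;
    int  *d_edges;
    cudaMalloc(&d_x, NPOINTS * sizeof(TYPE));
    cudaMalloc(&d_y, NPOINTS * sizeof(TYPE));
    cudaMalloc(&d_edges, 2 * NPOINTS * sizeof(int));
    cudaMemcpy(d_x, x, NPOINTS * sizeof(TYPE), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, NPOINTS * sizeof(TYPE), cudaMemcpyHostToDevice);
    cudaMemcpy(d_edges, edges, 2 * NPOINTS * sizeof(int), cudaMemcpyHostToDevice);

    // One thread per candidate edge (i, j): a 2D grid covering NPOINTS x NPOINTS indices.
    dim3 block(blockSizeX, blockSizeY);
    dim3 grid(NPOINTS / blockSizeX, NPOINTS / blockSizeY);

    // Time the kernel on its own with CUDA events.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    find_edges_on_device<<<grid, block>>>(d_x, d_y, d_edges);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("kernel time: %.3f ms\n", ms);

    cudaMemcpy(edges, d_edges, 2 * NPOINTS * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_x); cudaFree(d_y); cudaFree(d_edges);
    cudaEventDestroy(start); cudaEventDestroy(stop);
    return 0;
}

Timing the kernel this way separates the GPU time from the memory transfers, which is useful when comparing against the CPU version.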


The answers you're looking for can be found in the NVIDIA CUDA C Programming Guide.

Section 5.4.3 states that:

Throughput for __syncthreads() is 8 operations per clock cycle for devices of compute capability 1.x and 16 operations per clock cycle for devices of compute capability 2.x.

Warp vote functions are covered in Section B.12 of the programming guide and in Table 109 of the PTX ISA manual. The latter indicates that two instructions are required to perform a warp vote. However, I could not find any clock-cycle figure for warp vote functions in the reference documentation.
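In the absence of a documented figure, one way to get a feel for the cost is to measure it. Below is a minimal micro-benchmark sketch (hypothetical kernel and variable names) that times the same loop with and without a warp vote plus a block-wide barrier, using CUDA events. It uses the old __all() intrinsic to match the compute 1.x/2.x context of the question; on current toolkits and GPUs, __all_sync(0xffffffff, pred) replaces it.

#include <cstdio>
#include <cuda_runtime.h>

// Same per-thread arithmetic, once with and once without a warp vote and a barrier,
// so their overhead can be compared in isolation.
__global__ void with_vote_and_sync(int *out, int iters, int mask)
{
    int acc = 0;
    for (int i = 0; i < iters; ++i) {
        acc += __all((i & mask) == 0);   // one warp vote per iteration
        __syncthreads();                 // one block-wide barrier per iteration
    }
    out[blockIdx.x * blockDim.x + threadIdx.x] = acc;
}

__global__ void without_vote_and_sync(int *out, int iters, int mask)
{
    int acc = 0;
    for (int i = 0; i < iters; ++i) {
        acc += ((i & mask) == 0);        // same predicate, no vote, no barrier
    }
    out[blockIdx.x * blockDim.x + threadIdx.x] = acc;
}

static float elapsed_ms(cudaEvent_t start, cudaEvent_t stop)
{
    float ms = 0.0f;
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    return ms;
}

int main()
{
    const int blocks = 64, threads = 256, iters = 100000;
    int *d_out;
    cudaMalloc(&d_out, blocks * threads * sizeof(int));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    with_vote_and_sync<<<blocks, threads>>>(d_out, iters, 3);
    cudaEventRecord(stop);
    printf("with    vote + sync: %8.3f ms\n", elapsed_ms(start, stop));

    cudaEventRecord(start);
    without_vote_and_sync<<<blocks, threads>>>(d_out, iters, 3);
    cudaEventRecord(stop);
    printf("without vote + sync: %8.3f ms\n", elapsed_ms(start, stop));

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d_out);
    return 0;
}

Comparing the two timings gives a rough per-iteration cost for the vote plus the barrier on the GPU in question.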
