cuda nbody simulation - shared memory problem

2023-04-01 20:54 问答作者：

Based on the example from Nvidia GPU computing SDK I created two kernels for the nbody simulation. The first kernel which doesn't take advantage of shared memory is ~15% faster than the second kernel which uses shared memory. Why the kernel with a shared memory is slower ?

Kernel paramters: 8192 bodies, threads per block = 128, blocks per grid = 64. Device: GeForce GTX 560 Ti.

First kernel:

#define N 8192
#define EPS2 0.001f
__device__ float4 vel[N];

__device__ float3 force(float4 bi, float4 bj, float3 ai)
{
     float3开发者_运维技巧 r;

     r.x = bj.x - bi.x;
     r.y = bj.y - bi.y;
     r.z = bj.z - bi.z; 

    float distSqr = r.x * r.x + r.y * r.y + r.z * r.z + EPS2;
    float distSixth = distSqr * distSqr * distSqr;
    float invDistCube = 1.0f/sqrtf(distSixth); 
    float s = bj.w * invDistCube;

    ai.x += r.x * s;  
    ai.y += r.y * s;  
    ai.z += r.z * s; 

    return ai;
}

__global__ void points(float4 *pos, float dt)
{ 
     int k = blockIdx.x * blockDim.x + threadIdx.x;

     if(k >= N) return;

     float4 bi, bj, v;
     float3 ai;

     v = vel[k];
     bi = pos[k];
     ai = make_float3(0,0,0);

     for(int i = 0; i < N; i++)
     {
          bj = pos[i];
          ai = force(bi, bj, ai);
     }

     v.x += ai.x * dt;
     v.y += ai.y * dt;
     v.z += ai.z * dt;

     bi.x += v.x * dt;
     bi.y += v.y * dt;
     bi.z += v.z * dt;

     pos[k]=bi;
     vel[k]=v;
 }

Second kernel:

#define N 8192
#define EPS2 0.001f
#define THREADS_PER_BLOCK 128
__device__ float4 vel[N];
__shared__ float4 shPosition[THREADS_PER_BLOCK];

__device__ float3 force(float4 bi, float4 bj, float3 ai)
{
     float3 r;

     r.x = bj.x - bi.x;
     r.y = bj.y - bi.y;
     r.z = bj.z - bi.z; 

     float distSqr = r.x * r.x + r.y * r.y + r.z * r.z + EPS2;
     float distSixth = distSqr * distSqr * distSqr;
     float invDistCube = 1.0f/sqrtf(distSixth); 
     float s = bj.w * invDistCube;

     ai.x += r.x * s;  
     ai.y += r.y * s;  
     ai.z += r.z * s; 

     return ai;
}

__device__ float3  accumulate_tile(float4 myPosition, float3 accel)  
{  
     int i;  
     for (i = 0; i < THREADS_PER_BLOCK; i++) 
     {  
         accel = force(myPosition, shPosition[i], accel);  
     }  
     return accel;  
}  

__global__ void points(float4 *pos, float dt)
{ 
     int k = blockIdx.x * blockDim.x + threadIdx.x;

     if(k >= N) return;

     float4 bi, v;
     float3 ai;

     v = vel[k];
     bi = pos[k];
     ai = make_float3(0.0f, 0.0f, 0.0f);

     int i,tile;

     for(tile=0; tile < N / THREADS_PER_BLOCK; tile++)
     {
          i = tile *  blockDim.x + threadIdx.x;
          shPosition[threadIdx.x] = pos[i];
          __syncthreads();
          ai = accumulate_tile(bi, ai);
          __syncthreads();
     }

     v.x += ai.x * dt;
     v.y += ai.y * dt;
     v.z += ai.z * dt;

     bi.x += v.x * dt;
     bi.y += v.y * dt;
     bi.z += v.z * dt;

    pos[k]=bi;
    vel[k]=v;
}

The only really useful answer will be obtained by careful profiling, and that is only something that you are in a position to do. NVIDIA ship useful profiling tools for both Linux and Windows, now might be the time to use them.

Having said that, the register consumption of the shared memory version is considerably larger than the non-shared memory version (37 versus 29 when compiled to the sm_20 target with the CUDA 4.0 release compiler). It might be a simple difference in occupancy which is causing the change in performance you are seeing.

Actually non-shared version of kernel does use shared memory in form of L1 cache. From the code we can see that threads hit the same areas of global memory so it get's cached and reused. When we add better occupancy and lack of additional instructions (synchronization etc) we get faster kernel.

继续阅读：gpgpu

cuda nbody simulation - shared memory problem

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

王昌瑞《潜梦追凶》剧组庆生新锐演员未来可期？

Is it allowed to ask users to enter credit card details for own payment method?

Escaping "<" in Perl-generated XML

imessage会显示已读吗？

微信重新建群怎么建？