Passing structs to CUDA kernels
I'm new to CUDA C, and am trying to pass a typedef'd struct into a kernel. My method worked fine when I tried it with a struct containing only ints, but when I switch to floats I get meaningless numbers back as results. I assume this has to do with alignment, and I tried including __align__
along with my type declaration, but to no avail. Can someone give me an example of how this is done, or provide an alternative approach? I'm trying to set it up so that I can easily add or remove fields without changing anything other than the struct and the kernel. My code:
// Two-component float record that is handed to the kernel.
// The __align__(8) qualifier requests 8-byte alignment for the type.
typedef struct __align__(8)
{
    float a;  // first component
    float b;  // second component
} point;
// Stores the constants 1.1 and 2.2 into the a and b fields of one element of p.
// Each thread handles the element at its flat global index
// (blockIdx.x * blockDim.x + threadIdx.x). There is no bounds guard, so p must
// hold at least gridDim.x * blockDim.x elements.
__global__ void testKernel(point *p)
{
    const int slot = threadIdx.x + blockDim.x * blockIdx.x;
    point *target = p + slot;
    target->a = 1.1;
    target->b = 2.2;
}
// Host driver as posted in the question: sizes a 16-point problem, allocates
// host and device buffers, runs testKernel, copies the result back and prints
// every point.
// NOTE(review): this is the original, misbehaving version. The defects are
// flagged in place below rather than fixed.
int main(void)
{
// set number of points
// 16 points in blocks of 4 threads gives 4 blocks, i.e. one thread per point.
int numPoints = 16,
gpuBlockSize = 4,
pointSize = sizeof(point),
numBytes = numPoints * pointSize,
gpuGridSize = numPoints / gpuBlockSize;
// allocate memory
// NOTE(review): both new[] results are overwritten just below (by malloc and
// by cudaMalloc) without ever being deleted, so these two host arrays leak.
point *cpuPointArray = new point[numPoints],
*gpuPointArray = new point[numPoints];
cpuPointArray = (point*)malloc(numBytes);
cudaMalloc((void**)&gpuPointArray, numBytes);
// launch kernel
// NOTE(review): no CUDA return code is checked anywhere in this function, and
// cudaGetLastError() is not queried after the launch.
testKernel<<<gpuGridSize,gpuBlockSize>>>(gpuPointArray);
// retrieve the results
cudaMemcpy(cpuPointArray, gpuPointArray, numBytes, cudaMemcpyDeviceToHost);
printf("testKernel results:\n");
for(int i = 0; i < numPoints; ++i)
{
// NOTE(review): %d expects an int, but a and b are floats (promoted to double
// when passed through printf's variadic arguments). The mismatch is undefined
// behavior and is what produces the meaningless numbers; %f is the matching
// conversion.
printf("point.a: %d, point.b: %d\n",cpuPointArray[i].a,cpuPointArray[i].b);
}
// deallocate memory
free(cpuPointArray);
cudaFree(gpuPointArray);
return 0;
}
Since there doesn't appear to be any decent documentation on how to do this, I thought I'd post the final, revised code here. It turns out that the __align__
part was unnecessary as well; the actual problem was the use of %d in the printf when trying to print floats. (%d tells printf to read an int, so the float arguments were being misinterpreted.)
#include <stdlib.h>
#include <stdio.h>
/* Plain pair of floats that is passed to the kernel by pointer. */
typedef struct
{
    float a;  /* first component */
    float b;  /* second component */
} point;
// Fills one element of p per thread: a = 1.1f, b = 2.2f.
//
// Launch layout: 1D grid of 1D blocks; each thread writes the element at its
// flat global index blockIdx.x * blockDim.x + threadIdx.x.
// Precondition: p must point to device memory holding at least
// gridDim.x * blockDim.x points. The kernel takes no element count and so
// cannot guard against out-of-range indices itself.
__global__ void testKernel(point *p)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Float literals: the fields are float, so avoid double constants.
    p[i].a = 1.1f;
    p[i].b = 2.2f;
}
// Prints a diagnostic on stderr for a failed CUDA runtime call.
// Returns 1 when status is an error, 0 when it is cudaSuccess.
static int reportCudaError(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return 0;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return 1;
}

// Allocates a host and a device array of points, runs testKernel over the
// device array, copies the result back and prints every point.
// Returns 0 on success and 1 if any allocation or CUDA call failed.
int main(void)
{
    // set number of points
    // testKernel has no bounds guard and the grid size uses truncating
    // division, so numPoints must stay a multiple of gpuBlockSize; otherwise
    // the trailing points would never be written.
    const int numPoints = 16,
              gpuBlockSize = 4,
              gpuGridSize = numPoints / gpuBlockSize;
    const size_t numBytes = numPoints * sizeof(point);

    // allocate memory
    point *cpuPointArray = (point*)malloc(numBytes);
    if (cpuPointArray == NULL)
    {
        fprintf(stderr, "malloc of %lu bytes failed\n", (unsigned long)numBytes);
        return 1;
    }
    point *gpuPointArray = NULL;
    if (reportCudaError(cudaMalloc((void**)&gpuPointArray, numBytes), "cudaMalloc"))
    {
        free(cpuPointArray);
        return 1;
    }

    // launch kernel
    testKernel<<<gpuGridSize,gpuBlockSize>>>(gpuPointArray);
    // A launch returns no status; launch errors are read via cudaGetLastError().
    int failed = reportCudaError(cudaGetLastError(), "testKernel launch");

    // retrieve the results
    // The blocking cudaMemcpy also reports errors raised while the kernel ran.
    if (!failed)
    {
        failed = reportCudaError(
            cudaMemcpy(cpuPointArray, gpuPointArray, numBytes, cudaMemcpyDeviceToHost),
            "cudaMemcpy");
    }
    if (!failed)
    {
        printf("testKernel results:\n");
        for (int i = 0; i < numPoints; ++i)
        {
            printf("point.a: %f, point.b: %f\n", cpuPointArray[i].a, cpuPointArray[i].b);
        }
    }

    // deallocate memory
    free(cpuPointArray);
    failed |= reportCudaError(cudaFree(gpuPointArray), "cudaFree");
    return failed;
}
Have a look at how it's done in the vector_types.h header that comes in your CUDA include directory. That should already give you some pointers.
However, the main problem here is the %d in your printf calls. You're trying to print floats now, not integers, so those really should use %f instead.
精彩评论