Problem with CUDA operation overlapping example
all
I referred to simpleMultiCopy.cu
in CUDA SDK 4.0
and wrote one, see code below.
simpleMultiCopy.cu
is an example of operation overlapping in a loop. And mine is similar, it will send a slice of data to GPU
to compute each iteration in a loop where I perform the overlapping operation.
This is just a test/demo, don't care the logic of the kernel(increment_kernel
), it was used just to delay some time. The main logic lies in processWithStreams
function.
But this program works incorrectly with this out put:
i: 0, current_stream: 0, next_stream: 1
i: 1, current_stream: 1, next_stream: 0
Cuda error in file 'ttt.cu' in line 132 : unspecified launch failure.
line 132 is:
CUDA_SAFE_CALL( cudaMemcpyAsync(
d_data_in[next_stream],
h_data_in[next_stream],
memsize,
cudaMemcpyHostToDevice,
stream[next_stream]) ); //this is line 132
I don't have much ideas about how CUDA
works, so please help.
Any help will be appreciate.
Code:
#include <stdio.h>
#include <cutil_inline.h>
float processWithStreams(int streams_used);
#define STREAM_COUNT 2
int N = 1 << 24;
int *h_data_source;
int *h_data_sink;
int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];
int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];
cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];
cudaEvent_t start, stop;
dim3 block(512);
dim3 grid;
int memsize;
__global__ void increment_kernel(int *g_data, int inc_value)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
//g_data[idx] = g_data[idx] + inc_value;
int i = blockDim.x * gridDim.x;
for(; i > 0; i /= 2)
{
if(idx > i)
g_data[idx]++;
}
}
int main(int argc, char *argv[])
{
if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
cutilDeviceInit(argc, argv);
else
cudaSetDevice( cutGetMaxGflopsDeviceId());
h_data_source = (int *)malloc(sizeof(int) * N);
memset(h_data_source, 0, sizeof(int) * N);
int i;
memsize = 1024 * 1024 * sizeof(int);
for(i = 0; i < STREAM_COUNT; i++)
{
CUDA_SAFE_CALL( cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault) );
CUDA_SAFE_CALL( cudaMalloc(&d_data_in[i], memsize) );
CUDA_SAFE_CALL( cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault) );
CUDA_SAFE_CALL( cudaMalloc(&d_data_out[i], memsize) );
CUDA_SAFE_CALL( cudaStreamCreate(&stream[i]) );
CUDA_SAFE_CALL( cudaEventCreate(&cycleDone[i]) );
cudaEventRecord(cycleDone[i], stream[i]);
}
CUDA_SAFE_CALL( cudaEventCreate(&start) );
CUDA_SAFE_CALL( cudaEventCreate(&stop) );
grid.x = N / block.x;
开发者_StackOverflow中文版 grid.y = 1;
float time1 = processWithStreams(STREAM_COUNT);
printf("time: %f\n", time1);
free( h_data_source );
free( h_data_sink );
for( i = 0; i < STREAM_COUNT; ++i ) {
cudaFreeHost(h_data_in[i]);
cudaFree(d_data_in[i]);
cudaStreamDestroy(stream[i]);
cudaEventDestroy(cycleDone[i]);
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaThreadExit();
cutilExit(argc, argv);
return 0;
}
float processWithStreams(int streams_used) {
int current_stream = 0;
float time;
cudaEventRecord(start, 0);
for( int i=0; i < N / 1024 / 1024; ++i ) {
int next_stream = (current_stream + 1 ) % streams_used;
printf("i: %d, current_stream: %d, next_stream: %d\n", i, current_stream, next_stream);
// Ensure that processing and copying of the last cycle has finished
cudaEventSynchronize(cycleDone[next_stream]);
// Process current frame
increment_kernel<<<grid, block, 0, stream[current_stream]>>>(
d_data_in[current_stream], 1);
// Upload next frame
CUDA_SAFE_CALL( cudaMemcpyAsync(
d_data_in[next_stream],
h_data_in[next_stream],
memsize,
cudaMemcpyHostToDevice,
stream[next_stream]) );
CUDA_SAFE_CALL( cudaEventRecord(
cycleDone[next_stream],
stream[next_stream]) );
// Download current frame
CUDA_SAFE_CALL( cudaMemcpyAsync(
h_data_out[current_stream],
d_data_out[current_stream],
memsize,
cudaMemcpyDeviceToHost,
stream[current_stream]) );
CUDA_SAFE_CALL( cudaEventRecord(
cycleDone[current_stream],
stream[current_stream]) );
current_stream = next_stream;
}
cudaEventRecord(stop, 0);
cudaEventElapsedTime(&time, start, stop);
return time;
}
The problem is in your kernel. One thing that happens when checking errors in CUDA is that errors that occurred previously and were not checked will be reported next time you check for an error. That line is the first time you check for errors after the kernel launch which returned the error your are seeing.
The error unspecified launch failure is usually associated with out of bounds accesses to memory if I recall correctly.
You are launching your kernel with 32768 blocks and 512 threads per block. Calculating the idx value for the last thread of the last block we have 32767 * 512 + 511 = 16777215. In the first iteration idx < i and in the following ones you are trying to read and write to position 16777215 of g_data when you only allocated space for 1024 * 1024 integers.
edit: just noticed, why the tag operator overloading?
精彩评论