开发者

Problem with CUDA operation overlapping example

all

I referred to simpleMultiCopy.cu in CUDA SDK 4.0 and wrote one, see code below.

simpleMultiCopy.cu is an example of operation overlapping in a loop. And mine is similar, it will send a slice of data to GPU to compute each iteration in a loop where I perform the overlapping operation.

This is just a test/demo, don't care the logic of the kernel(increment_kernel), it was used just to delay some time. The main logic lies in processWithStreams function. But this program works incorrectly with this out put:

i: 0, current_stream: 0, next_stream: 1
i: 1, current_stream: 1, next_stream: 0
Cuda error in file 'ttt.cu' in line 132 : unspecified launch failure.

line 132 is:

CUDA_SAFE_CALL( cudaMemcpyAsync(
            d_data_in[next_stream], 
            h_data_in[next_stream], 
            memsize, 
            cudaMemcpyHostToDevice, 
            stream[next_stream]) ); //this is line 132

I don't have much ideas about how CUDA works, so please help.

Any help will be appreciate.


Code:

#include <stdio.h>
#include <cutil_inline.h>

float processWithStreams(int streams_used);
#define STREAM_COUNT    2

int N = 1 << 24;

int *h_data_source;
int *h_data_sink;

int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];

int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];

cudaEvent_t start, stop;

dim3 block(512);
dim3 grid;

int memsize;

__global__ void increment_kernel(int *g_data, int inc_value)
{ 
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   //g_data[idx] = g_data[idx] + inc_value;

   int i = blockDim.x * gridDim.x;
   for(; i > 0; i /= 2)
   {
        if(idx > i)
            g_data[idx]++;
   }
}


int main(int argc, char *argv[])
{
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId());

    h_data_source = (int *)malloc(sizeof(int) * N);
    memset(h_data_source, 0, sizeof(int) * N);

    int i;
    memsize = 1024 * 1024 * sizeof(int);
    for(i = 0; i < STREAM_COUNT; i++)
    {
        CUDA_SAFE_CALL( cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault) );
        CUDA_SAFE_CALL( cudaMalloc(&d_data_in[i], memsize) );

        CUDA_SAFE_CALL( cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault) );
        CUDA_SAFE_CALL( cudaMalloc(&d_data_out[i], memsize) );

        CUDA_SAFE_CALL( cudaStreamCreate(&stream[i]) );
        CUDA_SAFE_CALL( cudaEventCreate(&cycleDone[i]) ); 

        cudaEventRecord(cycleDone[i], stream[i]);
    }

    CUDA_SAFE_CALL( cudaEventCreate(&start) );
    CUDA_SAFE_CALL( cudaEventCreate(&stop) );

    grid.x = N / block.x;
 开发者_StackOverflow中文版   grid.y = 1;



    float time1 = processWithStreams(STREAM_COUNT);
    printf("time: %f\n", time1);



    free( h_data_source );
    free( h_data_sink );

    for( i = 0; i < STREAM_COUNT; ++i ) {

        cudaFreeHost(h_data_in[i]);
        cudaFree(d_data_in[i]);

        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cudaThreadExit();
    cutilExit(argc, argv);

    return 0;
}

float processWithStreams(int streams_used) {
    int current_stream = 0;
    float time;

    cudaEventRecord(start, 0);
    for( int i=0; i < N / 1024 / 1024; ++i ) {
        int next_stream = (current_stream + 1 ) % streams_used;
        printf("i: %d, current_stream: %d, next_stream: %d\n", i, current_stream, next_stream);

        // Ensure that processing and copying of the last cycle has finished
        cudaEventSynchronize(cycleDone[next_stream]);

        // Process current frame
        increment_kernel<<<grid, block, 0, stream[current_stream]>>>(
            d_data_in[current_stream], 1);

        // Upload next frame
        CUDA_SAFE_CALL( cudaMemcpyAsync(
            d_data_in[next_stream], 
            h_data_in[next_stream], 
            memsize, 
            cudaMemcpyHostToDevice, 
            stream[next_stream]) );

        CUDA_SAFE_CALL( cudaEventRecord(
            cycleDone[next_stream], 
            stream[next_stream]) );

        // Download current frame
        CUDA_SAFE_CALL( cudaMemcpyAsync(
            h_data_out[current_stream], 
            d_data_out[current_stream], 
            memsize, 
            cudaMemcpyDeviceToHost, 
            stream[current_stream]) );

        CUDA_SAFE_CALL( cudaEventRecord(
            cycleDone[current_stream], 
            stream[current_stream]) );

        current_stream = next_stream;
    }
    cudaEventRecord(stop, 0);    
    cudaEventElapsedTime(&time, start, stop);
    return time;
}


The problem is in your kernel. One thing that happens when checking errors in CUDA is that errors that occurred previously and were not checked will be reported next time you check for an error. That line is the first time you check for errors after the kernel launch which returned the error your are seeing.

The error unspecified launch failure is usually associated with out of bounds accesses to memory if I recall correctly.

You are launching your kernel with 32768 blocks and 512 threads per block. Calculating the idx value for the last thread of the last block we have 32767 * 512 + 511 = 16777215. In the first iteration idx < i and in the following ones you are trying to read and write to position 16777215 of g_data when you only allocated space for 1024 * 1024 integers.

edit: just noticed, why the tag operator overloading?

0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜