Problem with CUDA operation overlapping example

2023-03-11 16:46 问答作者：

all

I referred to simpleMultiCopy.cu in CUDA SDK 4.0 and wrote one, see code below.

simpleMultiCopy.cu is an example of operation overlapping in a loop. And mine is similar, it will send a slice of data to GPU to compute each iteration in a loop where I perform the overlapping operation.

This is just a test/demo, don't care the logic of the kernel(increment_kernel), it was used just to delay some time. The main logic lies in processWithStreams function. But this program works incorrectly with this out put:

i: 0, current_stream: 0, next_stream: 1
i: 1, current_stream: 1, next_stream: 0
Cuda error in file 'ttt.cu' in line 132 : unspecified launch failure.

line 132 is:

CUDA_SAFE_CALL( cudaMemcpyAsync(
            d_data_in[next_stream], 
            h_data_in[next_stream], 
            memsize, 
            cudaMemcpyHostToDevice, 
            stream[next_stream]) ); //this is line 132

I don't have much ideas about how CUDA works, so please help.

Any help will be appreciate.

Code:

#include <stdio.h>
#include <cutil_inline.h>

float processWithStreams(int streams_used);
#define STREAM_COUNT    2

int N = 1 << 24;

int *h_data_source;
int *h_data_sink;

int *h_data_in[STREAM_COUNT];
int *d_data_in[STREAM_COUNT];

int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];

cudaEvent_t start, stop;

dim3 block(512);
dim3 grid;

int memsize;

__global__ void increment_kernel(int *g_data, int inc_value)
{ 
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   //g_data[idx] = g_data[idx] + inc_value;

   int i = blockDim.x * gridDim.x;
   for(; i > 0; i /= 2)
   {
        if(idx > i)
            g_data[idx]++;
   }
}


int main(int argc, char *argv[])
{
    if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice( cutGetMaxGflopsDeviceId());

    h_data_source = (int *)malloc(sizeof(int) * N);
    memset(h_data_source, 0, sizeof(int) * N);

    int i;
    memsize = 1024 * 1024 * sizeof(int);
    for(i = 0; i < STREAM_COUNT; i++)
    {
        CUDA_SAFE_CALL( cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault) );
        CUDA_SAFE_CALL( cudaMalloc(&d_data_in[i], memsize) );

        CUDA_SAFE_CALL( cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault) );
        CUDA_SAFE_CALL( cudaMalloc(&d_data_out[i], memsize) );

        CUDA_SAFE_CALL( cudaStreamCreate(&stream[i]) );
        CUDA_SAFE_CALL( cudaEventCreate(&cycleDone[i]) ); 

        cudaEventRecord(cycleDone[i], stream[i]);
    }

    CUDA_SAFE_CALL( cudaEventCreate(&start) );
    CUDA_SAFE_CALL( cudaEventCreate(&stop) );

    grid.x = N / block.x;
 开发者_StackOverflow中文版   grid.y = 1;



    float time1 = processWithStreams(STREAM_COUNT);
    printf("time: %f\n", time1);



    free( h_data_source );
    free( h_data_sink );

    for( i = 0; i < STREAM_COUNT; ++i ) {

        cudaFreeHost(h_data_in[i]);
        cudaFree(d_data_in[i]);

        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cudaThreadExit();
    cutilExit(argc, argv);

    return 0;
}

float processWithStreams(int streams_used) {
    int current_stream = 0;
    float time;

    cudaEventRecord(start, 0);
    for( int i=0; i < N / 1024 / 1024; ++i ) {
        int next_stream = (current_stream + 1 ) % streams_used;
        printf("i: %d, current_stream: %d, next_stream: %d\n", i, current_stream, next_stream);

        // Ensure that processing and copying of the last cycle has finished
        cudaEventSynchronize(cycleDone[next_stream]);

        // Process current frame
        increment_kernel<<<grid, block, 0, stream[current_stream]>>>(
            d_data_in[current_stream], 1);

        // Upload next frame
        CUDA_SAFE_CALL( cudaMemcpyAsync(
            d_data_in[next_stream], 
            h_data_in[next_stream], 
            memsize, 
            cudaMemcpyHostToDevice, 
            stream[next_stream]) );

        CUDA_SAFE_CALL( cudaEventRecord(
            cycleDone[next_stream], 
            stream[next_stream]) );

        // Download current frame
        CUDA_SAFE_CALL( cudaMemcpyAsync(
            h_data_out[current_stream], 
            d_data_out[current_stream], 
            memsize, 
            cudaMemcpyDeviceToHost, 
            stream[current_stream]) );

        CUDA_SAFE_CALL( cudaEventRecord(
            cycleDone[current_stream], 
            stream[current_stream]) );

        current_stream = next_stream;
    }
    cudaEventRecord(stop, 0);    
    cudaEventElapsedTime(&time, start, stop);
    return time;
}

The problem is in your kernel. One thing that happens when checking errors in CUDA is that errors that occurred previously and were not checked will be reported next time you check for an error. That line is the first time you check for errors after the kernel launch which returned the error your are seeing.

The error unspecified launch failure is usually associated with out of bounds accesses to memory if I recall correctly.

You are launching your kernel with 32768 blocks and 512 threads per block. Calculating the idx value for the last thread of the last block we have 32767 * 512 + 511 = 16777215. In the first iteration idx < i and in the following ones you are trying to read and write to position 16777215 of g_data when you only allocated space for 1024 * 1024 integers.

edit: just noticed, why the tag operator overloading?

Problem with CUDA operation overlapping example

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

王昌瑞《潜梦追凶》剧组庆生新锐演员未来可期？

Is it allowed to ask users to enter credit card details for own payment method?

Escaping "<" in Perl-generated XML

imessage会显示已读吗？

微信重新建群怎么建？