开发者

OpenCL cleanup causes segfault

I constructed my own little OpenCL example using different sources on the net. The actual kernel works and I get the output I want, but the cleanup functions, which I found in one of the examples, cause segfaults. What did I do wrong?

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl

/*
 * CL_CHECK(expr): evaluate, exactly once, an expression that yields an
 * OpenCL status code. Anything other than CL_SUCCESS prints the text of the
 * expression together with the numeric code on stderr and calls abort().
 * Wrapped in do { } while (0) so it behaves as one statement after if/else.
 */
#define CL_CHECK(_expr)                                                         \
   do {                                                                         \
     cl_int _err = (_expr);                                                     \
     if (_err != CL_SUCCESS) {                                                  \
       fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
       abort();                                                                 \
     }                                                                          \
   } while (0)

/*
 * CL_CHECK_ERR(expr): wrapper for calls that return a value and report
 * their status through an out-parameter.
 *
 * The macro declares a local `cl_int _err`, evaluates expr once, and then
 * tests _err, so expr is expected to store its status into that variable,
 * typically by passing `&_err` as the call's error-code argument:
 *
 *     cl_mem buf = CL_CHECK_ERR(clCreateBuffer(ctx, flags, size, NULL, &_err));
 *
 * _err starts out as CL_INVALID_VALUE, so an expression that never writes
 * it is reported as a failure. On failure the expression text and the code
 * are printed to stderr and abort() is called; otherwise the macro yields
 * the value of expr.
 *
 * Uses a GNU statement expression. The type is obtained with __typeof__
 * rather than typeof because plain `typeof` is not a keyword when compiling
 * in strict ISO modes such as -std=c99 or -std=c11.
 */
#define CL_CHECK_ERR(_expr)                                                     \
   (__extension__ ({                                                            \
     cl_int _err = CL_INVALID_VALUE;                                            \
     __typeof__(_expr) _ret = (_expr);                                          \
     if (_err != CL_SUCCESS) {                                                  \
       fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
       abort();                                                                 \
     }                                                                          \
     _ret;                                                                      \
   }))

/*
 * OpenCL C source of the VectorAdd kernel, stored as a list of string
 * fragments that init_opencl() passes to clCreateProgramWithSource().
 * The two fragments holding "//" kernel comments end in '\n' so that the
 * comment stops before the next fragment begins.
 */
const char *OpenCLSource[] = {
    /* kernel signature: c receives the result, a and b are the inputs */
    "__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
    "{",
    "      // Index of the elements to add \n",
    /* one work-item per element, indexed along dimension 0 */
    "      unsigned int n = get_global_id(0);",
    "      // Sum the n’th element of vectors a and b and store in c \n",
    "      c[n] = a[n] + b[n];",
    "}"
};

cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){

    // Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
    cl_int _err;
    *GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ;
    printf("\n1-%i\n",_err);
    // Get the list of GPU devices associated with this context
    size_t ParmDataBytes;
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
    cl_device_id* GPUDevices;
    GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));
    // Create a command-queue on the first GPU device
    *GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err);
    printf("\n2-%i\n",_err);
    // Create OpenCL program with source code
    *OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err);
    printf("\n3-%i\n",_err);

    CL_CHECK(clBuildProgram(*OpenCLProgram, 0, 
              NULL, NULL, NULL, NULL));


     cl_int errcode;
    *cl_forward1 = clCreateKernel(*OpenCLProgram, 
               "VectorAdd", &errcode);
             开发者_如何学Python  printf("\n7-%i\n",errcode);

    return GPUDevices;
}


int main(int argc, char** argv)
{
    cl_context GPUContext;
    cl_command_queue GPUCommandQueue;
    cl_program OpenCLProgram;
    cl_kernel OpenCLVectorAdd;
    cl_device_id* GPUDevices;

    GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);

    // Two integer source vectors in Host memory
    int n=5 ;
    int x[5]={1,2,4,6,8};
    int y[5]={1,2,4,6,8};
    int output[n];
    int size_x = n*sizeof(x);
    int size_y = n*sizeof(y);

    int size_output = n*sizeof(output); // this changes for the second forward1
    cl_int _err;
    // Allocate GPU memory for source vectors AND initialize from CPU memory
    cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                    CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
                     printf("\n4-%i\n",_err);
    cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                    CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
                     printf("\n5-%i\n",_err);


    // Allocate output memory on GPU
    cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
                                          size_output, NULL, &_err);
                                           printf("\n6-%i\n",_err);

     // In the next step we associate the GPU memory with the Kernel arguments
    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl);
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl);
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl);


    // 7. Launch OpenCL kernel
    size_t localWorkSize[1], globalWorkSize[1];
    //localWorkSize = ;
    globalWorkSize[0] = n;

    // Launch the Kernel on the GPU
    CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
    // Copy the output in GPU memory back to CPU memory

    //float* h_C = (float*) malloc(size_output);
    CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue, 
              total_cl, CL_TRUE, 0, size_output, 
                output, 0, NULL, NULL));
    for (int i=0; i<n;i++){
        printf("\n%i",output[i]);
    }

    // Cleanup (each of the following lines causes a seg fault
    // ******************************
    CL_CHECK(free(GPUDevices)); 
    CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
    CL_CHECK(clReleaseProgram(OpenCLProgram));
    CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
    CL_CHECK(clReleaseContext(GPUContext));
    CL_CHECK(clReleaseMemObject(total_cl));
    CL_CHECK(clReleaseMemObject(x_cl));
    CL_CHECK(clReleaseMemObject(y_cl));
    /* ****************

    return 0;
}

Merci!


For people who arrive here in the future:

As Brafford suggested, this is resolved by adding clFinish(GPUCommandQueue) after clEnqueueNDRangeKernel as well as clEnqueueReadBuffer.

Apparently, trying to clean up any object (e.g. releasing a queue) that is still executing yields a segmentation fault.


I corrected and changed several small things. So this code should work now.

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <CL/cl.h> //opencl

/*
 * CL_CHECK(expr): run an expression that produces an OpenCL status code,
 * evaluating it a single time. If the code differs from CL_SUCCESS, the
 * expression text and the numeric code go to stderr and the process
 * aborts. The do { } while (0) wrapper makes the macro a single statement.
 */
#define CL_CHECK(_expr)                                                         \
   do {                                                                         \
     cl_int _err = (_expr);                                                     \
     if (_err != CL_SUCCESS) {                                                  \
       fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
       abort();                                                                 \
     }                                                                          \
   } while (0)

/*
 * CL_CHECK_ERR(expr): for calls that return a value and deliver their
 * status through an out-parameter.
 *
 * A local `cl_int _err` is declared, expr is evaluated once, and _err is
 * tested afterwards; expr is therefore expected to write its status into
 * that variable, typically by passing `&_err` as the error-code argument:
 *
 *     cl_kernel k = CL_CHECK_ERR(clCreateKernel(program, "name", &_err));
 *
 * Because _err is initialised to CL_INVALID_VALUE, an expression that never
 * stores into it counts as failed. A failure prints the expression text and
 * the code to stderr and aborts; on success the macro evaluates to the
 * value of expr.
 *
 * Implemented as a GNU statement expression. __typeof__ is used instead of
 * typeof so the macro also compiles in strict ISO modes (-std=c99,
 * -std=c11), where plain `typeof` is not recognised as a keyword.
 */
#define CL_CHECK_ERR(_expr)                                                     \
   (__extension__ ({                                                            \
     cl_int _err = CL_INVALID_VALUE;                                            \
     __typeof__(_expr) _ret = (_expr);                                          \
     if (_err != CL_SUCCESS) {                                                  \
       fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
       abort();                                                                 \
     }                                                                          \
     _ret;                                                                      \
   }))

/*
 * Source text of the VectorAdd kernel as an array of string fragments;
 * init_opencl() hands the array to clCreateProgramWithSource().
 * Fragments that contain a "//" kernel comment are terminated with '\n'
 * so the comment ends before the following fragment.
 */
const char *OpenCLSource[] = {
    /* kernel signature: c is the destination, a and b are the operands */
    "__kernel void VectorAdd(__global int* c, __global int* a,__global int* b)",
    "{",
    "      // Index of the elements to add \n",
    /* each work-item handles the element at its global id in dimension 0 */
    "      unsigned int n = get_global_id(0);",
    "      // Sum the n’th element of vectors a and b and store in c \n",
    "      c[n] = a[n] + b[n];",
    "}"
};

cl_device_id* init_opencl(cl_context *GPUContext,cl_command_queue *GPUCommandQueue, cl_kernel* cl_forward1,cl_program* OpenCLProgram){

    // Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
    cl_int _err;
    *GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &_err) ;
    printf("\nclCreateContextFromType:%i\n",_err);
    // Get the list of GPU devices associated with this context
    size_t ParmDataBytes;
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes));
    cl_device_id* GPUDevices;
    GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
    CL_CHECK(clGetContextInfo(*GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL));
    // Create a command-queue on the first GPU device
    *GPUCommandQueue = clCreateCommandQueue(*GPUContext, GPUDevices[0], 0, &_err);
    printf("\nclCreateCommandQueue:%i\n",_err);
    // Create OpenCL program with source code
    *OpenCLProgram = clCreateProgramWithSource(*GPUContext, 7, OpenCLSource, NULL, &_err);
    printf("\nclCreateProgramWithSource:%i\n",_err);

    CL_CHECK(clBuildProgram(*OpenCLProgram, 0, 
              NULL, NULL, NULL, NULL));


     cl_int errcode;
    *cl_forward1 = clCreateKernel(*OpenCLProgram, 
               "VectorAdd", &errcode);
               printf("\nclCreateKernel:%i\n",errcode);

    return GPUDevices;
}


int main(int argc, char** argv)
{
    cl_context GPUContext;
    cl_command_queue GPUCommandQueue;
    cl_program OpenCLProgram;
    cl_kernel OpenCLVectorAdd;
    cl_device_id* GPUDevices;

    GPUDevices=init_opencl(&GPUContext,&GPUCommandQueue,&OpenCLVectorAdd,&OpenCLProgram);

    int n=5 ;
    int x[5]={1,2,4,6,8};
    int y[5]={1,2,4,6,8};
    int output[n];
    int size_x = n*sizeof(x);
    int size_y = n*sizeof(y);
    int size_output = n*sizeof(output);

    cl_int _err;

    // Allocate GPU memory for source vectors AND initialize from CPU memory
    cl_mem x_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                    CL_MEM_COPY_HOST_PTR, size_x, x, &_err);
                    printf("\nclCreateBuffer:%i\n",_err);
    cl_mem y_cl = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY |
                    CL_MEM_COPY_HOST_PTR, size_y, y, &_err);
                    printf("\nclCreateBuffer:%i\n",_err);


    // Allocate output memory on GPU
    cl_mem total_cl = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY,
                                          size_output, NULL, &_err);
                                           printf("\nclCreateBuffer:%i\n",_err);

     // In the next step we associate the GPU memory with the Kernel arguments
    clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem),(void*)&total_cl);
    clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&x_cl);
    clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&y_cl);


    size_t globalWorkSize[1];
    globalWorkSize[0] = n;

    // Launch the Kernel on the GPU
    CL_CHECK(clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL));
    clFinish(GPUCommandQueue);
    // Copy the output in GPU memory back to CPU memory

    int* h_c = (int*) malloc(size_output);
    CL_CHECK(clEnqueueReadBuffer(GPUCommandQueue, 
              total_cl, CL_TRUE, 0, size_output, 
                h_c, 0, NULL, NULL));
    clFinish(GPUCommandQueue);
    for (int i=0; i<n;i++){
        printf("\noutput[%i]=%i",i,h_c[i]);
    }

    // Cleanup
    free(GPUDevices); 
    CL_CHECK(clReleaseKernel(OpenCLVectorAdd));
    CL_CHECK(clReleaseProgram(OpenCLProgram));
    CL_CHECK(clReleaseCommandQueue(GPUCommandQueue));
    CL_CHECK(clReleaseContext(GPUContext));
    CL_CHECK(clReleaseMemObject(x_cl));
    CL_CHECK(clReleaseMemObject(total_cl));
    CL_CHECK(clReleaseMemObject(y_cl));

    return 0;
}
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜