Sorting 3 arrays by key in CUDA (using Thrust perhaps)

2023-03-18 07:32 问答作者：

I have 3 arrays of the same size (more than 300.000 elements). One array of float numbers and two arrays of indices. So, for each number I have 2 IDs.

All the 3 arrays are already in GPU global memory. I want to sort all the numbers with theirs IDs accordingly.

Is there any way I can use Thrust library to do this task? Is there any better way than Thrust library?

Of course, I prefer not to copy them to and from host memory a couple of times. By the way, they're arrays not vectors.

Thanks for your help in advance.

Tentative solution, but this is extremely slow. It takes almost 4 seconds and my array size is in order of 300000

thrust::device_ptr<开发者_JAVA技巧float> keys(afterSum);
thrust::device_ptr<int> vals0(d_index);
thrust::device_ptr<int> vals1(blockId); 

thrust::device_vector<int> sortedIndex(numElements);
thrust::device_vector<int> sortedBlockId(numElements);

thrust::counting_iterator<int> iter(0);
thrust::device_vector<int> indices(numElements);
thrust::copy(iter, iter + indices.size(), indices.begin()); 

thrust::sort_by_key(keys, keys + numElements , indices.begin());    

thrust::gather(indices.begin(), indices.end(), vals0, sortedIndex.begin());
thrust::gather(indices.begin(), indices.end(), vals1, sortedBlockId.begin());

thrust::host_vector<int> h_sortedIndex=sortedIndex;
thrust::host_vector<int> h_sortedBlockId=sortedBlockId;

Of course you can use Thrust. First, you need to wrap your raw CUDA device pointers with thrust::device_ptr. Assuming your float values are in the array pkeys, and the IDs are in the arrays pvals0 and pvals1, and numElements is the length of the arrays, something like this should work:

#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);

thrust::device_ptr<float> keys(pkeys);
thrust::device_ptr<int> vals0(pvals0);
thrust::device_ptr<int> vals1(pvals1);

// allocate space for the output
thrust::device_vector<int> sortedVals0(numElements);
thrust::device_vector<int> sortedVals1(numElements);

// initialize indices vector to [0,1,2,..]
thrust::counting_iterator<int> iter(0);
thrust::device_vector<int> indices(numElements);
thrust::copy(iter, iter + indices.size(), indices.begin());

// first sort the keys and indices by the keys
thrust::sort_by_key(keys.begin(), keys.end(), indices.begin());

// Now reorder the ID arrays using the sorted indices
thrust::gather(indices.begin(), indices.end(), vals0.begin(), sortedVals0.begin());
thrust::gather(indices.begin(), indices.end(), vals1.begin(), sortedVals1.begin());

cudaEventRecord(stop);
cudaEventSynchronize(stop);
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("Took %f milliseconds for %d elements\n", milliseconds, numElements);

I have compared the two approaches proposed above, namely, that using thrust::zip_iterator and that using thrust::gather. I have tested them in the case of sorting two arrays by key or three arrays, as requested by the poster. In all the two cases, the approach using thrust::gather has shown to be faster.

THE CASE OF 2 ARRAYS

#include <time.h>       // --- time
#include <stdlib.h>     // --- srand, rand

#include <thrust\host_vector.h>
#include <thrust\device_vector.h>
#include <thrust\sort.h>
#include <thrust\iterator\zip_iterator.h>

#include "TimingGPU.cuh"

//#define VERBOSE
//#define COMPACT

int main() {

    const int N = 1048576;
    //const int N = 10;

    TimingGPU timerGPU;

    // --- Initialize random seed
    srand(time(NULL));

    thrust::host_vector<int> h_code(N);
    thrust::host_vector<double> h_x(N);
    thrust::host_vector<double> h_y(N);

    for (int k = 0; k < N; k++) {       
        // --- Generate random numbers between 0 and 9
        h_code[k] = rand() % 10 + 1;
        h_x[k] = ((double)rand() / (RAND_MAX));
        h_y[k] = ((double)rand() / (RAND_MAX));
    }

    thrust::device_vector<int> d_code(h_code);

    thrust::device_vector<double> d_x(h_x);
    thrust::device_vector<double> d_y(h_y);

#ifdef VERBOSE
    printf("Before\n");
    for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif

    timerGPU.StartCounter();
#ifdef COMPACT
    thrust::sort_by_key(d_code.begin(), d_code.end(), thrust::make_zip_iterator(thrust::make_tuple(d_x.begin(), d_y.begin())));
#else

    // --- Initialize indices vector to [0,1,2,..]
    thrust::counting_iterator<int> iter(0);
    thrust::device_vector<int> indices(N);
    thrust::copy(iter, iter + indices.size(), indices.begin());

    // --- First, sort the keys and indices by the keys
    thrust::sort_by_key(d_code.begin(), d_code.end(), indices.begin());

    // Now reorder the ID arrays using the sorted indices
    thrust::gather(indices.begin(), indices.end(), d_x.begin(), d_x.begin());
    thrust::gather(indices.begin(), indices.end(), d_y.begin(), d_y.begin());
#endif

    printf("Timing GPU = %f\n", timerGPU.GetCounter());

#ifdef VERBOSE
    h_code = d_code;
    h_x = d_x;
    h_y = d_y;

    printf("After\n");
    for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif
}

THE CASE OF 3 ARRAYS

#include <time.h>       // --- time
#include <stdlib.h>     // --- srand, rand

#include <thrust\host_vector.h>
#include <thrust\device_vector.h>
#include <thrust\sort.h>
#include <thrust\iterator\zip_iterator.h>

#include "TimingGPU.cuh"

//#define VERBOSE
//#define COMPACT

int main() {

    const int N = 1048576;
    //const int N = 10;

    TimingGPU timerGPU;

    // --- Initialize random seed
    srand(time(NULL));

    thrust::host_vector<int> h_code(N);
    thrust::host_vector<double> h_x(N);
    thrust::host_vector<double> h_y(N);
    thrust::host_vector<double> h_z(N);

    for (int k = 0; k < N; k++) {
        // --- Generate random numbers between 0 and 9
        h_code[k] = rand() % 10 + 1;
        h_x[k] = ((double)rand() / (RAND_MAX));
        h_y[k] = ((double)rand() / (RAND_MAX));
        h_z[k] = ((double)rand() / (RAND_MAX));
    }

    thrust::device_vector<int> d_code(h_code);

    thrust::device_vector<double> d_x(h_x);
    thrust::device_vector<double> d_y(h_y);
    thrust::device_vector<double> d_z(h_z);

#ifdef VERBOSE
    printf("Before\n");
    for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif

    timerGPU.StartCounter();
#ifdef COMPACT
    thrust::sort_by_key(d_code.begin(), d_code.end(), thrust::make_zip_iterator(thrust::make_tuple(d_x.begin(), d_y.begin(), d_z.begin())));
#else

    // --- Initialize indices vector to [0,1,2,..]
    thrust::counting_iterator<int> iter(0);
    thrust::device_vector<int> indices(N);
    thrust::copy(iter, iter + indices.size(), indices.begin());

    // --- First, sort the keys and indices by the keys
    thrust::sort_by_key(d_code.begin(), d_code.end(), indices.begin());

    // Now reorder the ID arrays using the sorted indices
    thrust::gather(indices.begin(), indices.end(), d_x.begin(), d_x.begin());
    thrust::gather(indices.begin(), indices.end(), d_y.begin(), d_y.begin());
    thrust::gather(indices.begin(), indices.end(), d_z.begin(), d_z.begin());
#endif

    printf("Timing GPU = %f\n", timerGPU.GetCounter());

#ifdef VERBOSE
    h_code = d_code;
    h_x = d_x;
    h_y = d_y;

    printf("After\n");
    for (int k = 0; k < N; k++) printf("code = %i; x = %f; y = %f\n", h_code[k], h_x[k], h_y[k]);
#endif
}

Timing in the case of 2 arrays for N = 1048576

zip_iterator  = 7.34ms
gather        = 4.27ms

Timing in the case of 3 arrays for N = 1048576

zip_iterator  = 9.64ms
gather        = 4.22ms

Tests performed on an NVIDIA GTX 960 card.

I would use zip_iterator to perform one sort_by_key on both indice vectors at the same time.

This would look like this:

    typedef typename thrust::tuple<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> IteratorTuple;
    typedef typename thrust::zip_iterator<IteratorTuple> ZipIterator;   

    // here I suppose your 3 arrays are pointed to by device_ptr as suggested by @harrism
    thrust::device_vector<float> key(pKey, pKey + numElements);
    thrust::device_vector<int> val0(pVal0, pVal0 + numElements);
    thrust::device_vector<int> val1(pVal1, pVal1 + numElements);

    ZipIterator iterBegin(thrust::make_tuple(val0.begin(), val1.begin()));  
    thrust::sort_by_key(key.begin(), key.end(), iterBegin);

继续阅读：gpu sorting thrust

Sorting 3 arrays by key in CUDA (using Thrust perhaps)

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？