CUDA Thrust performance

2023-03-23 10:22 问答作者：

I have a 640*480 vector which contains a set of numbers. I wish to find the minimum and maximum value of each row of the vector.

for(int i = 0; i < R; i++)
    {
        Begin = m_valBuffer.begin()  + (i*C);
        End = Begin+C;

        rMinmax = minmax_element(Begin, End);
     }

However, this is extremely slow. Is there any way I could speed this up?

The current load on the GPU when running this is only 34% so there must be a way to improve this?

This example shows how to compute the sum of each row using the reduce_by_key algorithm. You can easily adapt that example to compute the min or max of each row. To compute the min and max of each row simultaneously you'll need to use this strategy. Specifically, you'll need to use a transform_iterator on the input data and convert each value x into a tuple (x,x) before applying the minmax_binary_op reduction operator.

Here's a complete example:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/extrema.h>
#include <thrust/random.h>
#include <iostream>
#include <iomanip>

// minmax_pair holds the running extrema of a sequence: the
// smallest and the largest value encountered up to this point.
// It is a plain aggregate so it can be copied freely between
// reduction steps.
template <class T>
struct minmax_pair
{
  T min_val;  // smallest value seen so far
  T max_val;  // largest value seen so far
};

// minmax_unary_op is a functor that takes in a value x and
// returns a minmax_pair whose minimum and maximum values
// are initialized to x.
template <typename T>
struct minmax_unary_op
  : public thrust::unary_function< T, minmax_pair<T> >
{
  __host__ __device__
  minmax_pair<T> operator()(const T& x) const
  {
    minmax_pair<T> result;
    result.min_val = x;
    result.max_val = x;
    return result;
  }
};

// minmax_binary_op is a functor that accepts two minmax_pair 
// structs and returns a new minmax_pair whose minimum and 
// maximum values are the min() and max() respectively of 
// the minimums and maximums of the input pairs
template <typename T>
struct minmax_binary_op
  : public thrust::binary_function< minmax_pair<T>,
                                    minmax_pair<T>,
                                    minmax_pair<T> >
{
  __host__ __device__
  minmax_pair<T> operator()(const minmax_pair<T>& x, const minmax_pair<T>& y) const 
  {
    minmax_pair<T> result;
    result.min_val = thrust::min(x.min_val, y.min_val);
    result.max_val = thrust::max(x.max_val, y.max_val);
    return result;
  }
};

// convert a linear index to a row index
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T>
{
    T C; // number of columns

    __host__ __device__
    linear_index_to_row_index(T C) : C(C) {}

    __host__ __device__
    T operator()(T i)
    {
        return i / C;
    }
};

int main(void)
{
    int R = 5;     // number of rows
    int C = 8;     // number of columns
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(0, 99);

    // initialize data
    thrust::device_vector<int> array(R * C);
    for (size_t i = 0; i < array.size(); i++)
        array[i] = dist(rng);

    // allocate storage for per-row results and indices
    thrust::device_vector< minmax_pair<int> > row_results(R);
    thrust::device_vector< int              > row_indices(R);

    // compute row sums by summing values with equal row indices
    thrust::reduce_by_key
      (thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)),
       thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)) + (R*C),
       thrust::make_transform_iterator(array.begin(), minmax_unary_op<int>()),
       row_indices.begin(),
       row_results.begin(),
       thrust::equal_to<int>(),
       minmax_binary_op<int>());

    // print data 
    for(int i = 0; i < R; i++)
    {
      minmax_pair<int> result = row_results[i];
        std::cout << "[";
        for(int j = 0; j < C; j++)
            std::cout << std::setw(3) << array[i * C + j] << " ";
        std::cout << "] = " << "(" << result.min_val << "," << result.max_val << ")\n";
    }

    return 0;
}

Sample output:

[  0   8  60  89  96  18  51  39 ] = (0,96)
[ 26  74   8  56  58  80  59  51 ] = (8,80)
[ 87  99  72  96  29  42  89  65 ] = (29,99)
[ 90  96  16  85  90  29  93  41 ] = (16,96)
[ 30  51  39  78  68  54  59   9 ] = (9,78)

继续阅读：thrust

CUDA Thrust performance

更多精彩内容

精彩评论

最新问答

央视是哪个频道？

请问买过的朋友，舒提啦旅行箱实际使用体验如何？？

检查不孕不育需要的费用？

海信ULED电视画质有什么不同的地方?？

钉子可以挂的住画框幕布吗？

问答排行榜

河神2九牛入海钓河妖是第几集河妖什么来历可活吞牛？

性激素六项检查的最佳时间是多久？多少钱？？

Easiest way to get words of one line from istream into a vector?

《梦在燃烧 (《三国演义》动画片主题曲)》MP3歌词-汤子星？

抽烟只抽炫赫门？