开发者

CUDA Thrust performance

I have a 640*480 vector that contains a set of numbers, and I wish to find the minimum and maximum value in each row of the vector.

// Scan the rows one at a time: iteration i runs minmax_element over the
// C consecutive elements of m_valBuffer that start at offset i*C.
for(int i = 0; i < R; i++)
{
    Begin = m_valBuffer.begin() + (i*C);
    End = Begin + C;

    rMinmax = minmax_element(Begin, End);
}

However, this is extremely slow. Is there any way I could speed this up?

  • The GPU load while running this is only 34%, so there must be a way to improve this.


Thrust's sum_rows example shows how to compute the sum of each row using the reduce_by_key algorithm, and you can easily adapt that example to compute the min or max of each row. To compute the min and max of each row simultaneously, you'll need to combine it with the strategy from Thrust's minmax example. Specifically, you'll need to use a transform_iterator on the input data to convert each value x into a pair (x,x) before applying the minmax_binary_op reduction operator.

Here's a complete example:

#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/extrema.h>
#include <thrust/random.h>
#include <iostream>
#include <iomanip>

// minmax_pair is a plain aggregate holding a running range:
//   min_val - smallest value seen so far
//   max_val - largest value seen so far
// Member order matters for brace initialization: { min, max }.
template <class T>
struct minmax_pair
{
  T min_val;  // lower bound of the values folded in so far
  T max_val;  // upper bound of the values folded in so far
};

// minmax_unary_op lifts a single value into the reduction domain:
// given x it yields the degenerate range { x, x }, i.e. a minmax_pair
// whose minimum and maximum are both x.
template <typename T>
struct minmax_unary_op
  : public thrust::unary_function< T, minmax_pair<T> >
{
  __host__ __device__
  minmax_pair<T> operator()(const T& x) const
  {
    // aggregate order is { min_val, max_val }
    minmax_pair<T> seed = { x, x };
    return seed;
  }
};

// minmax_binary_op merges two partial ranges into one.
// The merged minimum is the smaller of the two minimums and the
// merged maximum is the larger of the two maximums.
template <typename T>
struct minmax_binary_op
  : public thrust::binary_function< minmax_pair<T>,
                                    minmax_pair<T>,
                                    minmax_pair<T> >
{
  __host__ __device__
  minmax_pair<T> operator()(const minmax_pair<T>& x, const minmax_pair<T>& y) const
  {
    // aggregate order is { min_val, max_val }
    minmax_pair<T> merged = { thrust::min(x.min_val, y.min_val),
                              thrust::max(x.max_val, y.max_val) };
    return merged;
  }
};

// linear_index_to_row_index maps a linear element index i to i / C,
// where C is the number of columns given at construction. For a
// row-major R x C layout this is the index of the row containing
// element i.
//
// C must be non-zero: the functor divides by it.
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T>
{
    T C; // number of columns

    __host__ __device__
    linear_index_to_row_index(T C) : C(C) {}

    // const-qualified (like the other functors in this file) because the
    // call does not modify the functor, so it can also be invoked
    // through a const object or const reference.
    __host__ __device__
    T operator()(T i) const
    {
        return i / C;
    }
};

int main(void)
{
    int R = 5;     // number of rows
    int C = 8;     // number of columns
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(0, 99);

    // initialize data
    thrust::device_vector<int> array(R * C);
    for (size_t i = 0; i < array.size(); i++)
        array[i] = dist(rng);

    // allocate storage for per-row results and indices
    thrust::device_vector< minmax_pair<int> > row_results(R);
    thrust::device_vector< int              > row_indices(R);

    // compute row sums by summing values with equal row indices
    thrust::reduce_by_key
      (thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)),
       thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)) + (R*C),
       thrust::make_transform_iterator(array.begin(), minmax_unary_op<int>()),
       row_indices.begin(),
       row_results.begin(),
       thrust::equal_to<int>(),
       minmax_binary_op<int>());

    // print data 
    for(int i = 0; i < R; i++)
    {
      minmax_pair<int> result = row_results[i];
        std::cout << "[";
        for(int j = 0; j < C; j++)
            std::cout << std::setw(3) << array[i * C + j] << " ";
        std::cout << "] = " << "(" << result.min_val << "," << result.max_val << ")\n";
    }

    return 0;
}

Sample output:

[  0   8  60  89  96  18  51  39 ] = (0,96)
[ 26  74   8  56  58  80  59  51 ] = (8,80)
[ 87  99  72  96  29  42  89  65 ] = (29,99)
[ 90  96  16  85  90  29  93  41 ] = (16,96)
[ 30  51  39  78  68  54  59   9 ] = (9,78)
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜