CUDA Thrust performance
I have a 640*480 vector containing a set of numbers, and I wish to find the minimum and maximum value of each row of the vector.
// Walk the flat buffer one row at a time and search each row for its extrema.
for(int i = 0; i < R; i++)
    {
        // Row i spans the C consecutive elements in [Begin, End).
        Begin = m_valBuffer.begin()  + (i*C);
        End = Begin+C;
        // One minmax_element call per row; note that rMinmax is overwritten
        // on every pass, so only the last row's result survives the loop.
        rMinmax = minmax_element(Begin, End);
    }
However, this is extremely slow. Is there any way I could speed this up?
The load on the GPU while running this is only 34%, so there must be a way to improve it.
This example shows how to compute the sum of each row using 
the reduce_by_key algorithm.  You can easily adapt that example to compute 
the min or max of each row.  To compute the min and max of each row 
simultaneously, you'll need to use this strategy.  Specifically, 
you'll need to use a transform_iterator on the input data to convert each 
value x into a pair (x,x) before applying the minmax_binary_op reduction operator.
Here's a complete example:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/extrema.h>
#include <thrust/random.h>
#include <iostream>
#include <iomanip>
// minmax_pair is a plain aggregate carrying the running extrema
// of a reduction: the smallest and the largest value seen so far.
template <class T>
struct minmax_pair
{
  T min_val;  // smallest value encountered so far
  T max_val;  // largest value encountered so far
};
// minmax_unary_op lifts a single value x into a minmax_pair.
// Both the minimum and the maximum of the returned pair are x,
// which makes it the starting point for a min/max reduction.
template <typename T>
struct minmax_unary_op
  : public thrust::unary_function< T, minmax_pair<T> >
{
  __host__ __device__
  minmax_pair<T> operator()(const T& x) const
  {
    // a one-element range has x as both its minimum and its maximum
    minmax_pair<T> seed = { x, x };
    return seed;
  }
};
// minmax_binary_op merges two minmax_pair structs into one.
// The merged minimum is the smaller of the two input minimums and
// the merged maximum is the larger of the two input maximums.
template <typename T>
struct minmax_binary_op
  : public thrust::binary_function< minmax_pair<T>,
                                    minmax_pair<T>,
                                    minmax_pair<T> >
{
  __host__ __device__
  minmax_pair<T> operator()(const minmax_pair<T>& x, const minmax_pair<T>& y) const 
  {
    const T lowest  = thrust::min(x.min_val, y.min_val);
    const T highest = thrust::max(x.max_val, y.max_val);

    minmax_pair<T> merged;
    merged.min_val = lowest;
    merged.max_val = highest;
    return merged;
  }
};
// Converts a linear index into a row index for data laid out
// row by row with C elements per row: index i maps to i / C.
// C must be nonzero; for integral T the quotient is truncated.
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T>
{
    T C; // number of columns

    // Stores the number of columns used as the divisor.
    __host__ __device__
    linear_index_to_row_index(T C) : C(C) {}

    // Returns the row that linear index i belongs to.
    // Declared const because it only reads C, matching the other
    // functors in this example and allowing calls on const instances.
    __host__ __device__
    T operator()(T i) const
    {
        return i / C;
    }
};
int main(void)
{
    int R = 5;     // number of rows
    int C = 8;     // number of columns
    thrust::default_random_engine rng;
    thrust::uniform_int_distribution<int> dist(0, 99);
    // initialize data
    thrust::device_vector<int> array(R * C);
    for (size_t i = 0; i < array.size(); i++)
        array[i] = dist(rng);
    // allocate storage for per-row results and indices
    thrust::device_vector< minmax_pair<int> > row_results(R);
    thrust::device_vector< int              > row_indices(R);
    // compute row sums by summing values with equal row indices
    thrust::reduce_by_key
      (thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)),
       thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)) + (R*C),
       thrust::make_transform_iterator(array.begin(), minmax_unary_op<int>()),
       row_indices.begin(),
       row_results.begin(),
       thrust::equal_to<int>(),
       minmax_binary_op<int>());
    // print data 
    for(int i = 0; i < R; i++)
    {
      minmax_pair<int> result = row_results[i];
        std::cout << "[";
        for(int j = 0; j < C; j++)
            std::cout << std::setw(3) << array[i * C + j] << " ";
        std::cout << "] = " << "(" << result.min_val << "," << result.max_val << ")\n";
    }
    return 0;
}
Sample output:
[  0   8  60  89  96  18  51  39 ] = (0,96)
[ 26  74   8  56  58  80  59  51 ] = (8,80)
[ 87  99  72  96  29  42  89  65 ] = (29,99)
[ 90  96  16  85  90  29  93  41 ] = (16,96)
[ 30  51  39  78  68  54  59   9 ] = (9,78)
 
         加载中,请稍侯......
 加载中,请稍侯......
      
精彩评论