CUDA Thrust performance
I have a 640*480 vector which contains a set of numbers, I wish to find the min and max number of each row of the vector.
for(int i = 0; i < R; i++)
{
Begin = m_valBuffer.begin() + (i*C);
End = Begin+C;
开发者_如何学运维 rMinmax= minmax_element(Begin, End);
}
However this is extremely slow, is there any way I could speed this up?
- The current load on the GPU when running this is only 34% so there must be a way to improve this?
This example shows how to compute the sum of each row using
the reduce_by_key
algorithm. You can easily adapt that example to compute
the min or max of each row. To compute the min and max of each row
simultaneously you'll need to use this strategy. Specifically,
you'll need to use a transform_iterator
on the input data and convert each
value x
into a tuple (x,x)
before applying the minmax_binary_op
reduction operator.
Here's a complete example:
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/extrema.h>
#include <thrust/random.h>
#include <iostream>
#include <iomanip>
// minmax_pair stores the minimum and maximum
// values that have been encountered so far
template <typename T>
struct minmax_pair
{
T min_val;
T max_val;
};
// minmax_unary_op is a functor that takes in a value x and
// returns a minmax_pair whose minimum and maximum values
// are initialized to x.
template <typename T>
struct minmax_unary_op
: public thrust::unary_function< T, minmax_pair<T> >
{
__host__ __device__
minmax_pair<T> operator()(const T& x) const
{
minmax_pair<T> result;
result.min_val = x;
result.max_val = x;
return result;
}
};
// minmax_binary_op is a functor that accepts two minmax_pair
// structs and returns a new minmax_pair whose minimum and
// maximum values are the min() and max() respectively of
// the minimums and maximums of the input pairs
template <typename T>
struct minmax_binary_op
: public thrust::binary_function< minmax_pair<T>,
minmax_pair<T>,
minmax_pair<T> >
{
__host__ __device__
minmax_pair<T> operator()(const minmax_pair<T>& x, const minmax_pair<T>& y) const
{
minmax_pair<T> result;
result.min_val = thrust::min(x.min_val, y.min_val);
result.max_val = thrust::max(x.max_val, y.max_val);
return result;
}
};
// convert a linear index to a row index
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T>
{
T C; // number of columns
__host__ __device__
linear_index_to_row_index(T C) : C(C) {}
__host__ __device__
T operator()(T i)
{
return i / C;
}
};
int main(void)
{
int R = 5; // number of rows
int C = 8; // number of columns
thrust::default_random_engine rng;
thrust::uniform_int_distribution<int> dist(0, 99);
// initialize data
thrust::device_vector<int> array(R * C);
for (size_t i = 0; i < array.size(); i++)
array[i] = dist(rng);
// allocate storage for per-row results and indices
thrust::device_vector< minmax_pair<int> > row_results(R);
thrust::device_vector< int > row_indices(R);
// compute row sums by summing values with equal row indices
thrust::reduce_by_key
(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)),
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(C)) + (R*C),
thrust::make_transform_iterator(array.begin(), minmax_unary_op<int>()),
row_indices.begin(),
row_results.begin(),
thrust::equal_to<int>(),
minmax_binary_op<int>());
// print data
for(int i = 0; i < R; i++)
{
minmax_pair<int> result = row_results[i];
std::cout << "[";
for(int j = 0; j < C; j++)
std::cout << std::setw(3) << array[i * C + j] << " ";
std::cout << "] = " << "(" << result.min_val << "," << result.max_val << ")\n";
}
return 0;
}
Sample output:
[ 0 8 60 89 96 18 51 39 ] = (0,96)
[ 26 74 8 56 58 80 59 51 ] = (8,80)
[ 87 99 72 96 29 42 89 65 ] = (29,99)
[ 90 96 16 85 90 29 93 41 ] = (16,96)
[ 30 51 39 78 68 54 59 9 ] = (9,78)
精彩评论