Boost::multi_array -- referencing too slow
I have to pass arrays to other functions, through referencing or pointer, I don't care as long as it works fast. That's why I started to use the boost library. I did it in the following way:
using namespace开发者_如何学C boost;
typedef multi_array<long double, 4> array_type;
typedef multi_array<long double, 2> twod_array_type;
typedef multi_array<long double, 1> vec_type;
as functions:
void pde_3d_7_stencil_discretization(array_type& A, vec_type& b, vec_type& x,const int& xdim, const int& ydim,const int& zdim)
void gmressolver3d(array_type& A, vec_type& x, vec_type& rhs,const int& KrylovDim,const int& xdim,const int& ydim,const int& zdim,const int& COP, const int& threeDStencil)
and in the main function:
array_type A(extents[threeDimStencil][COP][COP][xdim*ydim*zdim]);
vec_type b(extents[xdim*ydim*zdim*COP]);
vec_type x(extents[xdim*ydim*zdim*COP]);
pde_3d_7_stencil_discretization(A,b,x,xdim,ydim,zdim);
gmressolver3d(A,x,b,KrylovDim,xdim,ydim,zdim,COP,threeDimStencil);
Obviously, I'm doing something wrong, because the code works really slower than the static version, which doesn't involve any references/pointers, just passing arrays from one function to another.
What can I do to accelerate this?
Thank you for any kind of help..
edit: I'm posting what these codes do, a sequence from GMRES solver: All arrays in it were initialized also using Boost, such as:
vec_type pp(extents[zdim*xdim*ydim*COP]);
vec_type ppp(extents[zdim*xdim*ydim*COP]);
vec_type w(extents[zdim*xdim*ydim*COP]);
vec_type y(extents[KrylovDim]);
vec_type vv(extents[zdim*xdim*ydim*COP]);
vec_type b(extents[KrylovDim+1]);
vec_type ro(extents[zdim*xdim*ydim*COP]);
vec_type out1(extents[xdim*zdim*ydim*COP]);
vec_type m_jac(extents[xdim*zdim*ydim*COP]);
twod_array_type h(extents[KrylovDim+1][KrylovDim]);
twod_array_type v(extents[zdim*xdim*ydim*COP][KrylovDim]);
twod_array_type hess(extents[KrylovDim+1][KrylovDim]);
array_type maa(extents[threeDStencil][COP][COP][zdim*xdim*ydim]);
array_type maaa(extents[threeDStencil][COP][COP][zdim*xdim*ydim]);
for (i=0;i<m+1;i++){
b[i] = 0;
for(k=0;k<m;k++){
h[i][k] = 0.0;
}
}
for (i=0;i<n;i++){
v[i][0] = ro[i]/r;
}
for(j=0;j<m;j++){
b[0] = r;
vector_zero_fill(n,ppp);
for(i=0;i<n;i++){
vv[i]=v[i][j];
}
//********************MATRIX FREE********************
matrix_vector_product_heptadiagonal_discret(A,vv,pp,xdim,ydim,zdim);
//two_vector_dot_product(n,pp,m_jac);
// if(isPrec)
// forback(A,pp);
//********************MATRIX FREE********************
//pretty fast**
for(i=0;i<=j;i++){
for(k=0;k<n;k++){
h[i][j] = h[i][j] + pp[k]*v[k][i];
}
}
for(i=0;i<=j;i++){
for(k=0;k<n;k++){
ppp[k] = ppp[k] + h[i][j]*v[k][i];
}
}
p=0.0;
for(i=0;i<n;i++){
w[i] = pp[i] - ppp[i];
p = p + pow(w[i],2);
}
h[j+1][j] = sqrt(p);
for(i=0;i<=j+1;i++){
for(k=0;k<=j;k++){
hess[i][k] = h[i][k];
}
}
for(i=0;i<j+1;i++){
c = hess[i][i]/sqrt(pow(hess[i][i],2)+pow(hess[i+1][i],2));
s = hess[i+1][i]/sqrt(pow(hess[i][i],2)+pow(hess[i+1][i],2));
for (k=0;k<=j;k++){
inner1=c*hess[i][k]+s*hess[i+1][k];
inner2=(-s)*hess[i][k]+c*hess[i+1][k];
hess[i][k] = inner1;
hess[i+1][k] = inner2;
}
b[i+1] = -s*b[i];
b[i] = c*b[i];
}
Where you zero-initialize your multi_arays, you can try using std::memset
instead. For example
std::memset(b.data(), 0, size_of_b_in_bytes);
There are a few places in your code where you index the same multi_array
element more than once. For example, instead of
h[i][j] = h[i][j] + pp[k]*v[k][i]
try
h[i][j] += pp[k]*v[k][i]
Normally, the optimizer would automatically make such substitutions for you, but maybe it can't with multi_array
.
I also spotted two for loops that can be merged into one to avoid indexing the same multi_array element multiple times:
/*
for(i=0; i<=j; i++)
{
for(k=0; k<n; k++)
{
h[i][j] = h[i][j] + pp[k]*v[k][i];
}
}
for(i=0; i<=j; i++)
{
for(k=0; k<n; k++)
{
ppp[k] = ppp[k] + h[i][j]*v[k][i];
}
}
*/
for(i=0; i<=j; i++)
{
for(k=0; k<n; k++)
{
long double& h_elem = h[i][j];
long double v_elem = v[k][i];
h_elem += pp[k]*v_elem;
ppp[k] += h_elem*v_elem;
}
}
There might be more like these. Note the use of references and variables to "remember" an element and to avoid having to recompute its position in the multi_array.
In the last for loop from your code, you can avoid lots of recomputing of multi_array indices by using temporary variables and references:
/*
for(i=0;i<j+1;i++){
c = hess[i][i]/sqrt(pow(hess[i][i],2)+pow(hess[i+1][i],2));
s = hess[i+1][i]/sqrt(pow(hess[i][i],2)+pow(hess[i+1][i],2));
for (k=0;k<=j;k++){
inner1=c*hess[i][k]+s*hess[i+1][k];
inner2=(-s)*hess[i][k]+c*hess[i+1][k];
hess[i][k] = inner1;
hess[i+1][k] = inner2;
}
b[i+1] = -s*b[i];
b[i] = c*b[i];
}
*/
for(i=0;i<j+1;i++){
long double hess_i_i = hess[i][i];
long double hess_ip1_i = hess[i+1][i];
long double temp = sqrt(pow(hess_i_i,2)+pow(hess_ip1_i,2));
c = hess_i_i/temp;
s = hess_ip1_i/temp;
for (k=0;k<=j;k++){
long double& hess_i_k = hess[i][k];
long double& hess_ip1_k = hess[i+1][k];
inner1=c*hess_i_k+s*hess_ip1_k;
inner2=(-s)*hess_i_k+c*hess_ip1_k;
hess_i_k = inner1;
hess_ip1_k = inner2;
}
long double b_i& = b[i];
b[i+1] = -s*b_i;
b_i = c*b_i;
}
Double check my work -- it's certain I've made a mistake somewhere. Note that I've stored the sqrt(pow(hess_i_i,2)+pow(hess_ip1_i,2))
in a variable so that it's not needlessly computed twice.
I doubt these minor tweaks will bring the runtime down to 5 seconds. The problem with multi_array
is that the array dimensions are only known at runtime. Support for row-major/column-major ordering probably also induces some overhead.
With C-style multi-dimensional arrays, dimensions are known at compile time, so the compiler can produce "tighter" code.
By using Boost multi_arrays you're basically trading off speed for flexibilty and convenience.
See rodrigob's answer here. Also, using Blaze DynamicMatrix with the same compiler optimization can give almost an extra factor 2 improvement.
精彩评论