Matrix inversion in OpenCL
I am trying to accelerate some computations using OpenCL, and part of the algorithm consists of inverting a matrix. Is there any open-source library or freely available code to compute the LU factorization (LAPACK dgetrf and dgetri) of a matrix, or general matrix inversion, written in OpenCL or CUDA? The matrix is real and square but doesn't have any other special properties besides that. So far, I've managed to find only basic BLAS matrix-vector operation implementations on the GPU.
The matrix is rather small, only about 60-100 rows and cols, so it could be computed faster on cpu, but it's used kinda in the middle of the algorithm, so I would have to transfer it to host, calculate the inverse, and then transfer the result back on the device where it's then used in much larger computations.
Look at ViennaCL: http://viennacl.sourceforge.net/
I don't have an implementation in OpenCL, but both "Numerical Recipes" and Gilbert Strang's "Introduction to Applied Mathematics" have wonderful explanations that would be easy to code. "NR" has C code that you could adapt.
calculate the inverse
This is incorrect. You aren't calculating an inverse with LU decomposition, you're decomposing the matrix. If you wanted the inverse, you'd have to do forward back substitution with a series of unit vectors. It's a small but important difference.
I know this is kind of late, but if you are trying to do any matrix calculations on a matrix that is that small (60-100 rows), then the computations are going to be much faster on a CPU rather than a GPU because of the time it takes to copy the information from main memory to the GPU's memory. If you are wanting to do this, then I would suggest looking into using a parallel language such as OpenMP or MPI as these would allow you to parallelize your code to speed up the calculations on the CPU.
I compute inverses of matrices up to 2k x 2k on the CPU using multithreading with the Eigen library, and it is now 3.5-3.65 times faster (depending on matrix size) than using one thread. I used an Intel Xeon 3.5 GHz E5-1620 v3 processor and 16 GB of RAM. (Unfortunately I deleted the old version, so I can't add exact values, but if it has some priority I could rewrite the software.)
This is the matrix inversion algorithm I used for comparison. (It is correct, as I verified by running many tests against Excel results):
/*
Uses 2D arrays.
Main routines are:
init_2Dvector() that initializes any 2d vector (can be uchar, char, int, float or double)
multiply_2Dvector()
inverse()
*/
#include<iostream>
#include <vector>
#include <stdlib.h>
#include <time.h>
using namespace std;
/*
void print_2Dvector(vector<vector<double> >& vec)
{
size_t xmax, ymax;
ymax = vec.size();
xmax = vec[0].size();
int x, y;
for (y = 0; y < ymax; y++)
{
for (x = 0; x < xmax; x++)
cout << vec[y][x] << " \t";
cout << endl;
}
}*/
//Prints a 2D matrix to stdout, one row per line, formatting every element
//with the given printf-style format string (default "%lg \t").
//FIX: 'char *format' bound a string literal to a non-const pointer, which is
//ill-formed since C++11 — the parameter is now const char* (backward compatible).
//Loop counters are size_t to avoid signed/unsigned comparison warnings.
void print_2Dvector(const vector<vector<double> >& vec, const char *format = "%lg \t")
{
    size_t ymax = vec.size();
    size_t xmax = vec[0].size();
    for (size_t y = 0; y < ymax; y++)
    {
        for (size_t x = 0; x < xmax; x++)
            printf(format, vec[y][x]);
        cout << endl;
    }
}
//Resizes any 2D vector (uchar, char, int, float, double, ...) to
//y_dim rows by x_dim columns. Like vector::resize, existing elements
//that still fit in the new shape are preserved; new cells are value-initialized.
template<typename T>
void init_2Dvector(vector<vector<T> >& vec, size_t y_dim, size_t x_dim)
{
    vec.resize(y_dim);
    for (auto& row : vec)
        row.resize(x_dim);
}
//Returns the matrix product vec1*vec2. Neither input is modified.
//vec1 is (ymax x xmax); vec2 must be (xmax x p); the result is (ymax x p).
//On a dimension mismatch an error is reported and {} (an empty matrix) is returned.
//FIXES vs. original: the dimension test also demanded the RESULT be square
//(ymax == vec2[0].size()), which wrongly rejected valid products — only
//cols(vec1) == rows(vec2) is required; the debug "dimensions of vec3" print
//was removed; inputs are taken by const reference instead of being copied.
vector< vector<double> > multiply_2Dvector(const vector< vector<double> >& vec1, const vector< vector<double> >& vec2)
{
    size_t ymax = vec1.size();
    size_t xmax = vec1[0].size();
    if (xmax != vec2.size())
    {
        cout << "ERROR on dim2_multiply() dimensions of vec2 not corresponding with vec1 ones" << endl; getchar(); return{};//returns a null
    }
    size_t pmax = vec2[0].size();//columns of the result
    vector< vector<double> > vec3(ymax, vector<double>(pmax));
    for (size_t y = 0; y < ymax; y++)
        for (size_t x = 0; x < pmax; x++)
        {
            double acc = 0.0;//dot product of row y of vec1 with column x of vec2
            for (size_t t = 0; t < xmax; t++)
                acc += vec1[y][t] * vec2[t][x];
            vec3[y][x] = acc;
        }
    return vec3;
}
//Returns the inverse of x via Gauss-Jordan elimination; x is taken by value,
//so the caller's matrix is never modified. Returns {} (empty) if x is not
//square or is (numerically) singular.
//FIX vs. original: the elimination loop used '*ptry++ = *ptry - coef*(*ptry2++)',
//which reads and increments the same pointer in one expression — unsequenced
//(undefined behavior) before C++17. Plain indexing below is equivalent and
//well-defined. Counters are size_t to match dim; Spanish comments translated.
vector< vector<double> > inverse(vector< vector<double> > x)
{
    if (x.size() != x[0].size())
    {
        cout << "ERROR on inverse() not square array" << endl; getchar(); return{};//returns a null
    }
    size_t dim = x.size();
    //1. Start y as the identity matrix; every row operation applied to x is
    //mirrored on y, so when x becomes the identity, y is the inverse.
    vector< vector<double> > y(dim, vector<double>(dim, 0.0));
    for (size_t i = 0; i < dim; i++)
        y[i][i] = 1.0;
    for (size_t ord = 0; ord < dim; ord++)
    {
        //2. Make the diagonal element x[ord][ord] equal to 1.
        if (fabs(x[ord][ord]) < 1e-15)//pivot is ~0: add a lower row whose element in this column is non-zero
        {
            size_t i2;
            for (i2 = ord + 1; i2 < dim; i2++)
                if (fabs(x[i2][ord]) > 1e-15) break;
            if (i2 >= dim)
                return{};//error: singular matrix, returns null
            for (size_t i = 0; i < dim; i++)//add row i2 (same operation on both matrices)
            {
                x[ord][i] += x[i2][i];
                y[ord][i] += y[i2][i];
            }
        }
        double diagon = 1.0 / x[ord][ord];//scale row ord so the pivot becomes 1
        for (size_t i = 0; i < dim; i++)
        {
            y[ord][i] *= diagon;
            x[ord][i] *= diagon;
        }
        //3. Zero out column ord of every other row.
        for (size_t i = 0; i < dim; i++)
        {
            if (i == ord) continue;
            double coef = x[i][ord];//element to eliminate
            if (fabs(coef) < 1e-15) continue;//already zero, skip the row
            for (size_t j = 0; j < dim; j++)
            {
                y[i][j] -= coef * y[ord][j];
                x[i][j] -= coef * x[ord][j];
            }
        }
    }//end ord
    return y;
}
//Demo: inverts a fixed 7x7 matrix, then multiplies the original by the
//inverse — the printed product should be (approximately) the identity.
void test_5_inverse()
{
    vector< vector<double> > vec1 = {
        {0,-5,0,7,33,11,-1},
        {72,0,-11,7,9,33,5 },
        {-13,31,-5,15,29,30,24 },
        {-24,9,8,-23,31,-12,4 },
        {-3,-22,4,-24,-5,27,-10 },
        {-10,-21,-16,-32,-11,20,14 },
        {5,30,13,-32,29,-13,-13 }
    };
    vector< vector<double> > inv = inverse(vec1);
    vector< vector<double> > product = multiply_2Dvector(vec1, inv);
    cout << "initial array (must be unmodified):" << endl;
    print_2Dvector(vec1);
    cout << "Must be diagon array:" << endl;
    print_2Dvector(product, " %8.3lf");
    cout << endl;
}
//Timing demo: fills a dim x dim matrix with pseudo-random values in
//(-10000, 10000), inverts it, and reports the elapsed CPU time.
void test_6_inverse(int dim)
{
    vector< vector<double> > vec1(dim, vector<double>(dim));
    for (auto& row : vec1)           //row-major fill, same rand() order as before
        for (auto& cell : row)
            cell = (-1.0 + 2.0 * rand() / RAND_MAX) * 10000;
    double ini = (double)clock();
    vector< vector<double> > vec2 = inverse(vec1);
    double end = (double)clock();
    cout << "Time inverse =" << (end - ini) / CLOCKS_PER_SEC << endl;
    vector< vector<double> > vec3 = multiply_2Dvector(vec1, vec2);
    cout << "initial array (must be unmodified):" << endl;
    //print_2Dvector(vec1);
    cout << "Must be diagon array:" << endl;
    //print_2Dvector(vec3, " %8.3lf");
    cout << endl;
}
//Entry point: runs the inversion demos, then waits for a key press so a
//console window stays open.
//FIXES vs. original: returned 1 (conventionally a failure exit code) — now
//returns 0; removed an unused local vector and its init_2Dvector() call.
int main()
{
    test_5_inverse();
    test_6_inverse(1300);
    cout << endl << "=== END ===" << endl; getchar();
    return 0;
}
Check CULA
http://www.culatools.com/ http://www.culatools.com/versions/basic
The original question (now 7 years old) actually was solved 4 years later in a paper describing matrix inversion in CUDA based on Gauss-Jordan. It attempts to distribute the calculations across different threads, and gives detailed performance indications for matrices up to 2048 in size.
While not OpenCL, the general ideas will translate from CUDA quite easily.
Featured comments