getting collective abort in my possion equation code
/***every function is working correct but after only first iteration is giving collective abort anyone can tell what is or coulde be the reason***/
#include<stdio.h>
#include<stdlib.h>
#include"mpi.h"
const double tolerance = 0.00001;
const int maxit = 10000;
void MPE_decomp1d(int n, int size, int id, int *s, int *e)
{
/*****calculating start and end row for every process*****/
*s = (n/size)*id + ((n%size)>0)*(id>(n%size)?n%size:id);
*e = *s + (n/size)+((n%size)>id);
}
void onedinit(double **a, double **b, double **f, const int nx, const int s, const int e)
{
int i, j;
int ls, le;
ls = s - (s!=0);
le = e + (e!=nx);
/***setting all the intial values to zero****/
for (i = ls; i < le; i++)
{
for (j = 0; j < nx; j++)
{
a[i][j] = b[i][j] = f[i][j] = 0.0;
}
}
//***************************Defining Boundary Condition***********************************//
/***setting left boundary to 1***/
for (i = ls; i < le; i++) a[i][0] = b[i][0] = 1;
/***setting value f(0, i) is 2***/
if (s==0) for (i = 0; i < nx; i++) a[0][i] = b[0][i] = 2.0;
}
void exchng1(double **a, const int nx, const int s, const int e, MPI_Comm comm1d, int nbrbottom, int nbrtop)
{
int rank, coord;
MPI_Status status;
MPI_Comm_rank(comm1d, &rank);
MPI_Cart_coords(comm1d, rank, 1, &coord);
/*****************if process id is odd then first send and if 开发者_高级运维even then first recive to avoid deadlock**********/
if (coord&1)
{
if (nbrbottom != -1) MPI_Send(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, comm1d);
if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 1, comm1d, &status);
if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 0, comm1d);
if (nbrbottom != -1) MPI_Recv(a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
}
else
{
if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
if (nbrbottom != -1) MPI_Send(a[e-s-(s==0)], nx, MPI_DOUBLE, nbrbottom, 1, comm1d);
if (nbrbottom != -1) MPI_Recv(a[e-s+(s!=0)], nx, MPI_DOUBLE, nbrbottom, 0, comm1d, &status);
if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 1, comm1d);
}
}
void sweep1d(double **a, double **f, int nx, const int s, const int e, double **b)
{
int i, j;
int rows;
rows = e - s - (s==0) - (e==0);
nx -= 1;
double h = 1.0 / (double)nx;
for (i = 1; i <= rows; i++) for (j = 1; j < nx; j++)
b[i][j] = 0.25 * (a[i-1][j] + a[i][j+1] + a[i][j-1] + a[i+1][j]) - h*h*f[i][j];
return;
}
double diff(double **a, double **b, const int nx, int s, int e)
{
double sum = 0.0;
int i, j;
int st, ed;
st = (s!=0);
ed = e-s+(s!=0);
for (i = st; i < ed; i++) for (j = 0; j < nx; j++)
sum += (a[i][j] - b[i][j])*(a[i][j] - b[i][j]);
return sum;
}
int main(int argc, char *argv[])
{
int nx, ny;
int myid, root, numprocs, period=0;
int nbrbottom, nbrtop, s, e, it;
double diffnorm, dwork;
double t1, t2;
double **a, **b, **f;
root = 0;
MPI_Comm comm1d;
MPI_Init(&argc, &argv);;
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
if(!myid)
{
/******for this piece of code nx and ny are assumed to be same please*******/
printf("Enter the number of cells in X & Y direction\n");
scanf("%d %d", &nx, &ny);
nx += 1;
ny += 1;
ny = nx; //forced to follow our assumption;
}
MPI_Bcast(&nx, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&ny, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Cart_create(MPI_COMM_WORLD, 1, &numprocs, &period, 1, &comm1d);
MPI_Comm_rank(comm1d, &myid);
MPI_Cart_shift(comm1d, 0, 1, &nbrtop, &nbrbottom);
MPE_decomp1d(ny, numprocs, myid, &s, &e);
int ls, le, rows;
int i, j;
ls = s - (s!=0);
le = e + (e!=nx);
rows = le - ls;
a = (double**)malloc(rows*sizeof(double*));
b = (double**)malloc(rows*sizeof(double*));
f = (double**)malloc(rows*sizeof(double*));
for (i = ls; i < le; i++)
{
a[i] = (double*)malloc(nx*sizeof(double));
b[i] = (double*)malloc(nx*sizeof(double));
f[i] = (double*)malloc(nx*sizeof(double));
}
onedinit(a, b, f, nx, s, e);
diffnorm = 0.0;
it = 0;
do
{
// printf("%danshu\n", myid);
exchng1(a, nx, s, e, comm1d, nbrbottom, nbrtop);
sweep1d(a, f, nx, s, e, b);
exchng1(b, nx, s, e, comm1d, nbrbottom, nbrtop);
sweep1d(b, f, nx, s, e, a);
dwork = diff(a, b, nx, s, e);
/************printing matrix a after every iteration******/
for (i = 0; i < rows; i++)
{
for (j = 0; j < nx; j++) printf("%lf ", a[i][j]);
printf("\n");
}
MPI_Barrier(comm1d);
//printf("%lfhehe\n", dwork);
MPI_Allreduce(&dwork, &diffnorm, 1, MPI_DOUBLE, MPI_SUM, comm1d);
//printf("%dhere\n", myid);
}
while (++it < maxit && diffnorm > tolerance);
MPI_Finalize();
return 0;
}
So just dumping 130 lines of code in SO and asking why it doesn't work is probably not the best way to get good answers - especially when the only actual sentance you write is "every function is working"... if that were the case, you wouldn't have a problem. You need to narrow things down to some more specific case and get a more specific question.
In this particular case, I've seen lots of code like this in the past while teaching, so it's feasible to see some of what's going on.
First off, you can't do stuff like this:
ls = s - (s!=0);
le = e + (e!=nx);
rows = le - ls;
a = (double**)malloc(rows*sizeof(double*));
/*...*/
for (i = ls; i < le; i++)
{
a[i] = (double*)malloc(nx*sizeof(double));
/*...*/
}
If you have 100 rows broken up into 4 processors, and you're (say) MPI Task 2, then your s
is 50 and e
is 75, and so ls
would be 49 and le
would be 76, and so you're trying to access a[49..76]
even though you've only allocated a
of size 27! That particular error comes up all over the code, and needs to be fixed. You want to be accessing a[0..rows-1]
.
Incidentally, I haven't even checked to see if MPE_decomp1d
actually does the right thing. We all go through the phase where we think it's cute in C to put things in one line by using logical expressions multiplied by ternary operators, etc, but seriously, it makes your code unnecessarily tedious to disentangle when someone else has to fix it -- whether it's SOers or yourself 2 months later.
In exchng1
, you're doing unnecessary work. You don't need to check to see if nbrbottom
or nbrtop
are valid; if they aren't, MPI_Cart_shift
returns MPI_PROC_NULL
to which sending or receiving is a no-op. So sending/receiving from those ranks is harmless, which is a great design decision, because it avoids lots of corner cases in the logic.
Similarly, to avoid deadlock you can use MPI_Sendrecv
rather than individual Send
s and Recv
s. That plus the above means that instead of this:
if (coord&1)
{
if (nbrbottom != -1) MPI_Send(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, comm1d);
if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 1, comm1d, &status);
if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 0, comm1d);
if (nbrbottom != -1) MPI_Recv(a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
}
else
{
if (nbrtop != -1) MPI_Recv(a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
if (nbrbottom != -1) MPI_Send(a[e-s-(s==0)], nx, MPI_DOUBLE, nbrbottom, 1, comm1d);
if (nbrbottom != -1) MPI_Recv(a[e-s+(s!=0)], nx, MPI_DOUBLE, nbrbottom, 0, comm1d, &status);
if (nbrtop != -1) MPI_Send(a[1], nx, MPI_DOUBLE, nbrtop, 1, comm1d);
}
you can do this:
MPI_Sendrecv(a[e-s], nx, MPI_DOUBLE, nbrbottom, 0, a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
MPI_Sendrecv(a[1], nx, MPI_DOUBLE, nbrtop, 1, a[e-s+1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
-- way simpler, right?
There's still some problem in the exchange, though; receiving into a[e-s+1]
isn't right, although as I've mentioned, I can't be bothered decrypting MPE_decomp1d
to figure out why. Presumably you want to be receiving into a[rows-1]
.
Finally, the MPI_Barrier()
is slow and completely unnecesssary; there's enough synchronization in the guardcell exchanges (to say nothing of the Allreduce) that you don't need it.
When all those changes are made, the code runs without memory access problems; you'll have to check that it gives the right answers.
#include<stdio.h>
#include<stdlib.h>
#include"mpi.h"
const double tolerance = 0.00001;
const int maxit = 10000;
void MPE_decomp1d(int n, int size, int id, int *rows)
{
int s, e;
s = (n/size)*id + ((n%size)>0)*(id>(n%size)?n%size:id);
e = s + (n/size)+((n%size)>id);
*rows = e - s - (s==0) - (e==0);
}
void onedinit(double **a, double **b, double **f, const int nx, const int rows, const int id, const int nprocs)
{
int i, j;
for (i = 0; i < rows; i++)
{
for (j = 0; j < nx; j++)
{
a[i][j] = b[i][j] = f[i][j] = 0.0;
}
}
for (i = 0; i < rows; i++) a[i][0] = b[i][0] = 1;
if (id == 0)
for (i = 0; i < nx; i++) a[0][i] = b[0][i] = 2.0;
}
void exchng1(double **a, const int nx, const int rows, MPI_Comm comm1d, int nbrbottom, int nbrtop)
{
int rank, coord;
MPI_Status status;
MPI_Comm_rank(comm1d, &rank);
MPI_Cart_coords(comm1d, rank, 1, &coord);
/* send data downwards */
MPI_Sendrecv(a[rows-2], nx, MPI_DOUBLE, nbrbottom, 0, a[0], nx, MPI_DOUBLE, nbrtop, 0, comm1d, &status);
/* send data upwards */
MPI_Sendrecv(a[1], nx, MPI_DOUBLE, nbrtop, 1, a[rows-1], nx, MPI_DOUBLE, nbrbottom, 1, comm1d, &status);
}
void sweep1d(double **a, double **f, const int nx, const int rows, double **b)
{
int i, j;
double h = 1.0 / (double)nx;
for (i = 1; i < rows-1; i++) for (j = 1; j < nx-1; j++)
b[i][j] =
0.25 * ( a[i-1][j] + a[i][j+1] + a[i][j-1] + a[i+1][j]) - h*h*f[i][j];
return;
}
double diff(double **a, double **b, const int nx, const int rows)
{
double sum = 0.0;
int i, j;
for (i = 0; i < rows; i++) for (j = 0; j < nx; j++)
sum += (a[i][j] - b[i][j])*(a[i][j] - b[i][j]);
return sum;
}
int main(int argc, char *argv[])
{
int nx, ny;
int myid, root, numprocs, period=0;
int nbrbottom, nbrtop, it;
double diffnorm, dwork;
double **a, **b, **f;
root = 0;
MPI_Comm comm1d;
MPI_Init(&argc, &argv);;
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
if(!myid)
{
/******for this piece of code nx and ny are assumed to be same please*******/
printf("Enter the number of cells in X & Y direction\n");
scanf("%d %d", &nx, &ny);
nx += 1;
ny += 1;
ny = nx; //forced to follow our assumption;
}
MPI_Bcast(&nx, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&ny, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Cart_create(MPI_COMM_WORLD, 1, &numprocs, &period, 1, &comm1d);
MPI_Comm_rank(comm1d, &myid);
MPI_Cart_shift(comm1d, 0, 1, &nbrtop, &nbrbottom);
int rows;
MPE_decomp1d(ny, numprocs, myid, &rows);
int i, j;
a = (double**)malloc(rows*sizeof(double*));
b = (double**)malloc(rows*sizeof(double*));
f = (double**)malloc(rows*sizeof(double*));
for (i = 0; i < rows; i++)
{
a[i] = (double*)malloc(nx*sizeof(double));
b[i] = (double*)malloc(nx*sizeof(double));
f[i] = (double*)malloc(nx*sizeof(double));
}
onedinit(a, b, f, nx, rows, myid, numprocs);
diffnorm = 0.0;
it = 0;
do
{
exchng1(a, nx, rows, comm1d, nbrbottom, nbrtop);
sweep1d(a, f, nx, rows, b);
exchng1(b, nx, rows, comm1d, nbrbottom, nbrtop);
sweep1d(b, f, nx, rows, a);
dwork = diff(a, b, nx, rows);
/************printing matrix a after every iteration******/
for (i = 0; i < rows; i++)
{
for (j = 0; j < nx; j++) printf("%lf ", a[i][j]);
printf("\n");
}
//printf("%lfhehe\n", dwork);
MPI_Allreduce(&dwork, &diffnorm, 1, MPI_DOUBLE, MPI_SUM, comm1d);
//printf("%dhere\n", myid);
}
while (++it < maxit && diffnorm > tolerance);
MPI_Finalize();
return 0;
}
精彩评论