
I am new to MPI. I'm trying to approximately solve a PDE on a 1000 by 1000 array. Except for the first and last rows, every element is updated at each iteration to the average of its 8 neighbors. Why do different numbers of processors give different results?

My code runs, but with different numbers of processors the results differ slightly in the third decimal place. Is my communication losing precision? I partition the big array by rows, since C/C++ stores arrays row-wise.

Here is the code.

#include <iostream> 
#include <mpi.h> 
#include <cmath> 
#include <stdio.h> 
#include <stdlib.h> 

int main(int argc, const char * argv[]) 
{ 

    // Initialize the MPI environment 
    MPI_Init(NULL,NULL); 
    int p; 
    MPI_Comm_size(MPI_COMM_WORLD, &p); 
    int id; 
    MPI_Comm_rank(MPI_COMM_WORLD, &id); 

    const double pi = 3.1415926; 

    int n; 
    n=atoi(argv[argc-1]); 

    //calculate the starting and ending row indices 
    int m=floor((n-2)/p); 
    int r=n-2-m*p; 

    //dividing rows 1 through n-2 among the processors, since in C/C++ arrays are stored row-wise 
    //therefore n-2=(m+1)*r+m*(p-r) 
    //the first r processors get m+1 rows, the rest get m rows 

    //starting row, ending row in the original A, width of matrix 
    int start_row, end_row, width; 
    if (id<=r-1){ 
     start_row=1+id*(m+1); 
     end_row=start_row+m; 
     width=m+1; 
    } 
    else { 
     start_row=1+r*(m+1)+(id-r)*m; 
     end_row=start_row+m-1; 
     width=m; 
    } 

    //printf("mpi debug 1"); 
    printf("on processor %d, starting row is %d, ending row is %d \n",id,start_row,end_row); 

    //id of the processor before and after 
    //id_before is not significant for id==0 
    //id_after is not significant for id==p-1 
    int id_before, id_after; 
    if (id==0) 
     id_before=p-1; 
    else 
     id_before=id-1; 

    if (id==p-1) 
     id_after=0; 
    else 
     id_after=id+1; 

    //printf("debug000"); 


    //initialize the local matrix 
    //**** better way to initialize? 
    double a[width][n], b[width][n]; 
    for (int i=0; i<width; i++) 
     for (int j=0; j<n; j++) 
      a[i][j]=0.5; 

    //two 1d arrays to store the halo cells 
    double halo_before[n], halo_after[n]; 

    if (id==0){ 
     for (int j=0; j<n; j++){ 
      halo_before[j]=0.0; 
     } 
    } 

    if (id==p-1){ 
     for (int j=0; j<n; j++) { 
      halo_after[j]=5*sin(M_PI*((double)j/n)*((double)j/n)); 
     } 
    } 

    MPI_Barrier(MPI_COMM_WORLD); 

    //std::cout << " the sin function is" << 5*sin(M_PI*((double)1/1)*((double)1/2)) << "\n" <<std::endl; 

    //set id=0 to be the root processor and call MPI_Wtime there 
    double start_time, end_time; 
    if (id==0) 
     start_time=MPI_Wtime(); 

    MPI_Status status_before, status_after; 
    MPI_Request request_before, request_after; 

    ///////////////////////////////////////////////////////// 
    //to debug, print out arrays 
    //std::cout<< " the array on processor " << id <<" to start is \n" << std::endl; 

    //for (int i=0; i<width; i++){ 
    // for (int j=0; j<n; j++){ 
    //  std::cout << a[i][j] << " "; 
    //  if (j==n-1) 
    //   std::cout << "\n" <<std::endl; 
    // } 
    //} 

    //to debug print out halos 
    //std::cout << "halo_before on processor " << id << " to start with is\n" << std::endl; 
    //for (int i=0;i<n;i++){ 
    // std::cout << halo_before[i] << " "; 
    // if (i==n-1) 
    //  std::cout <<"\n" <<std::endl; 
    //} 

    //std::cout << "halo_after on processor " << id << " to start with is\n" << std::endl; 
    //for (int i=0;i<n;i++){ 
    // std::cout << halo_after[i] << " "; 
    // if (i==n-1) 
    //  std::cout <<"\n" <<std::endl; 
    //} 
    ////////////////////////////////////////////////////// 

    //begin iteration 
    for (int t=0; t<500; t++){ 
     //unblocking send 

     //send first row to id_before: 
     //how should I use tag? 
     if (id>0){ 
      MPI_Isend(&a[0][0], n, MPI_DOUBLE, id_before, t , MPI_COMM_WORLD, &request_before); 
     } 
     if (id<p-1){ 
      //send the last row to id_after 
      MPI_Isend(&a[width-1][0], n, MPI_DOUBLE, id_after, t, MPI_COMM_WORLD, &request_after); 
     } 

     //printf("dubug0"); 

     //update the entries that do not depend on halos 
     //local row=1 to row=width-2 
     //add if (width>3)?? 
     int j_b, j_a; 
     for (int i=1; i<width-1; i++){ 
      for (int j=0; j<n; j++){ 
       j_b=(n+j-1)%n; 
       j_a=(j+1)%n; 
       b[i][j]=(a[i-1][j_b]+a[i-1][j]+a[i-1][j_a]+a[i][j_b]+a[i][j_a]+a[i+1][j_b]+a[i+1][j]+a[i+1][j_a])/8; 
      } 
     } 

     //printf("dubug1"); 

     //blocking receive 
     //may consider unblocking receive 
     //receive from id_before and store in halo_before 
     //not sure about status 
     if (id>0){ 
      MPI_Recv(&halo_before[0], n, MPI_DOUBLE, id_before ,t , MPI_COMM_WORLD, MPI_STATUS_IGNORE); 
     } 
     //receive from id_after and store in halo_after 
     if (id<p-1){ 
      MPI_Recv(&halo_after[0], n, MPI_DOUBLE, id_after,t , MPI_COMM_WORLD, MPI_STATUS_IGNORE); 
     } 

     //to debug print out halos 
     //std::cout << "halo_before on processor " << id << " at iteration " << t<< " is\n" <<std::endl; 
     //for (int i=0;i<n;i++){ 
     // std::cout << halo_before[i] << " "; 
     // if (i==n-1) 
     //  std::cout <<"\n" <<std::endl; 
     //} 

     //std::cout << "halo_after on processor " << id << " at iteration " << t<< " is\n" <<std::endl; 
     //for (int i=0;i<n;i++){ 
     // std::cout << halo_after[i] << " "; 
     // if (i==n-1) 
     //  std::cout <<"\n" <<std::endl; 
     //} 


     //update entries that depend on halos 
     //bugs here, what if width==1??? 
     if (width==1){ 
      for (int j=0; j<n; j++){ 
       j_a=(n+j-1)%n; 
       j_b=(j+1)%n; 
       b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+halo_after[j_b]+halo_after[j]+halo_after[j_a])/8; 
      } 

     } 
     else{ 
     for (int j=0; j<n; j++){ 
      j_a=(n+j-1)%n; 
      j_b=(j+1)%n; 
      b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+a[1][j_b]+a[1][j]+a[1][j_a])/8; 
      b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8; 
     } 
     } 

     //copy to b 
     //but make sure the send have been completed 

     if (id>0) 
      MPI_Wait(&request_before,MPI_STATUS_IGNORE); 

     if (id<p-1) 
      MPI_Wait(&request_after,MPI_STATUS_IGNORE); 


     for (int i=0; i<width; i++) 
      for (int j=0; j<n; j++) 
       a[i][j]=b[i][j]; 

     //to debug, print out arrays 
     //std::cout<< " the array on processor " << id <<" at iteration " << t <<" is \n"<< std::endl; 

     //for (int i=0; i<width; i++){ 
     // for (int j=0; j<n; j++){ 
     // std::cout << a[i][j] << " "; 
     // if (j==n-1) 
     //  std::cout << "\n" <<std::endl; 
     // } 
     //} 


    } 

    //calculate the sum 
    double sum=0.0; 
    for (int i=0; i<width; i++) 
     sum += a[i][i+start_row]; 

    double total_sum; 
    //send to root processor 
    MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); 
    MPI_Barrier(MPI_COMM_WORLD); 

    if (id==0){ 
     end_time=MPI_Wtime(); 
     //double sum_receive[p]; 
     //double sum_calc; 
     //for (int i=0; i<p; i++){ 
     // MPI_Recv(&sum_receive[i], 1, MPI_DOUBLE, i, i, MPI_COMM_WORLD, MPI_STATUS_IGNORE); 
     // sum_calc += sum_receive[i]; 
     //} 

     printf("time elapse is %f \n", end_time-start_time); 
     printf("at root processor %d, the calculated sum is %f, \n", id, total_sum+5*sin(M_PI*((double)(n-1)/n)*((double)(n-1)/n))); 
    } 


    MPI_Finalize(); 

    return 0; 


} 

Answer

There is a simple typo in one line of your code. This:

 b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8; 

should be this (note the third term: j_a, not a second j_b):

 b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_a]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8; 

Since this is the update of the last row in each domain, exactly how much error this causes depends on where the domain boundaries fall, that is, on how many processors you have.

Now, the reason an error like this was essentially inevitable in the code as posted is that the same computation (updating b from a) occurs, with slight variations, no fewer than three times. That is a ticking time bomb: eventually one copy gets changed and falls out of sync with the others, or a mistake creeps into one of the copies.

There are many ways to cut down that duplication, which both clarifies the code and helps avoid this class of error. The best is to build the halos into the a and b arrays themselves, adding one extra row before and one after to hold the halo data; then you don't have to worry about whether width is 1, and you don't have to treat the first and last rows separately. Also, define a function that updates one row (or one element) of b from a, and call it instead of repeating the code.

Here is an example of the code after that cleanup, with the pieces encapsulated into routines, the halo regions built into the arrays, n and width treated consistently, and so on:

#include <mpi.h> 
#include <math.h> 
#include <stdio.h> 
#include <stdlib.h> 

int min2i(int a, int b) { 
    int result = a; 
    if (b < a) result = b; 
    return result; 
} 

void decomposition(const int n, const int nprocs, const int id, 
        int *start_row, int *width, int *id_before, int *id_after) { 
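    /* distribute all n rows among the ranks: the first r ranks get m+1 
     rows, the rest get m. Edge ranks get MPI_PROC_NULL neighbors, so 
     their halo sends/receives become no-ops and the fixed boundary 
     values stored in their halo rows are never overwritten. */ 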
    const int nrows = n; 
    const int m = nrows/nprocs; 
    const int r = nrows % nprocs; 

    *width = m; 
    if (id < r) (*width)++; 

    *start_row = 1 + id*m + min2i(id,r); 

    *id_before = (id > 0 ? id-1 : MPI_PROC_NULL); 
    *id_after = (id < nprocs-1 ? id+1 : MPI_PROC_NULL); 
} 

void startBC(const int n, const int width, double a[][n+2], double b[][n+2], 
      const int id_before, const int id_after, const int t, MPI_Request *req) { 
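    /* post non-blocking sends of this rank's first and last interior 
     rows to its neighbors; tags 2*t and 2*t+1 distinguish the two 
     directions within an iteration */ 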
    MPI_Isend(&a[1][1],  n, MPI_DOUBLE, id_before, 2*t , MPI_COMM_WORLD, &req[0]); 
    MPI_Isend(&a[width][1], n, MPI_DOUBLE, id_after , 2*t+1, MPI_COMM_WORLD, &req[1]); 
} 

void finishBC(const int n, const int width, double a[][n+2], double b[][n+2], 
       const int id_before, const int id_after, const int t, MPI_Request *req) { 
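    /* receive the neighbors' rows into halo rows 0 and width+1, wrap 
     the columns around periodically, then wait for the sends posted 
     in startBC to complete */ 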
    MPI_Status stats[2]; 

    MPI_Recv(&a[0][1],  n, MPI_DOUBLE, id_before, 2*t+1, MPI_COMM_WORLD, &stats[0]); 
    MPI_Recv(&a[width+1][1], n, MPI_DOUBLE, id_after, 2*t , MPI_COMM_WORLD, &stats[1]); 

    for (int i=0; i<width+2; i++) { 
     a[i][0] = a[i][n]; 
     a[i][n+1] = a[i][1]; 
    } 

    MPI_Waitall(2, req, stats); 
} 

void updateRow(const int n, const int width, double a[][n+2], double b[][n+2], const int row) { 
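    /* eight-neighbor average for one row of b, reading from a; the 
     halo columns 0 and n+1 make the periodic wrap in j implicit */ 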
    for (int j=1; j<=n; j++) 
     b[row][j]=(a[row-1][j-1] + a[row-1][j] + a[row-1][j+1] 
        +a[ row ][j-1]    + a[ row ][j+1] 
        +a[row+1][j-1] + a[row+1][j] + a[row+1][j+1])/8; 
} 

int main(int argc, const char * argv[]) 
{ 
    int p, id; 
    MPI_Init(NULL,NULL); 
    MPI_Comm_size(MPI_COMM_WORLD, &p); 
    MPI_Comm_rank(MPI_COMM_WORLD, &id); 

    const double pi = 3.1415926; 

    int n=atoi(argv[argc-1]); 
    int width, start_row, id_before, id_after; 

    decomposition(n, p, id, &start_row, &width, &id_before, &id_after); 

    double a[width+2][n+2], b[width+2][n+2]; 
    for (int i=0; i<width+2; i++) 
     for (int j=0; j<n+2; j++) 
      a[i][j]=0.5; 

    if (id==p-1) 
     for (int j=0; j<n+2; j++) 
      a[width+1][j]=5*sin(pi*((double)(j-1)/n)*((double)(j-1)/n)); 

    if (id==0) 
     for (int j=0; j<n+2; j++) 
      a[0][j]=0.; 

    double start_time, end_time; 
    if (id==0) 
     start_time=MPI_Wtime(); 

    MPI_Request reqs[2]; 

    //begin iteration 
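    //note: only 2 iterations here (the question ran 500); each step 
    //posts the sends, updates the interior rows while the messages 
    //are in flight, then receives the halos and does the two boundary rows 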
    for (int t=0; t<2; t++){ 
     startBC(n, width, a, b, id_before, id_after, t, reqs); 
     /* interior rows */ 
     for (int row=2; row<width; row++) 
      updateRow(n, width, a, b, row); 

     finishBC(n, width, a, b, id_before, id_after, t, reqs); 

     /* boundary rows */ 
     updateRow(n, width, a, b, 1); 
     updateRow(n, width, a, b, width); 

     for (int i=1; i<width+1; i++) 
      for (int j=1; j<n+1; j++) 
       a[i][j]=b[i][j]; 
    } 

    //calculate the sum 
    double sum=0.0; 
    for (int i=1; i<width+1; i++) 
     sum += a[i][i+start_row-1]; 

    double total_sum; 
    MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); 

    if (id==0){ 
     end_time=MPI_Wtime(); 

     printf("time elapse is %f \n", end_time-start_time); 
     printf("at root processor %d, the calculated sum is %f, \n", id, total_sum+5*sin(pi*((double)(n-1)/n)*((double)(n-1)/n))); 
    } 


    MPI_Finalize(); 

    return 0; 
}
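
For completeness, here is one way to build and run this (a minimal sketch; stencil.c is an assumed file name, and mpicc/mpirun are the usual wrappers shipped with MPI implementations such as MPICH or Open MPI; -std=c99 is needed for the variable-length arrays, -lm for sin):

 mpicc -std=c99 -O2 -o stencil stencil.c -lm 
 mpirun -np 4 ./stencil 1000 

With the typo fixed, runs with different -np values should agree, up to roundoff from the summation order in MPI_Reduce.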