openmp并行的时间表构造给予不同的答案，只有很少的程序运行

我想使用openmp工作共享结构。共享的代码是我更大的openmp代码出现问题的更简单的例子。我将值赋给一个整数矩阵，打印矩阵元素值，将它们初始化为0，并在't'循环中重复它。我计算了赋值（通过并行执行）失败的次数，通过整数'p'。如果代码是正确的，p应该是0，但是它为不同的运行提供了不同的答案，所以工作构造失败了。我必须运行它大约12次，然后才得到第一个错误的p值作为输出（1，2，3等）。openmp并行的时间表构造给予不同的答案，只有很少的程序运行

代码中的屏障指令并非真的必要，我得到的值不同没有它，并认为明确的障碍会有所帮助，但我错了。这是代码：

#define NRA 10     /* number of rows in matrix A */ 
    #define NCA 10     /* number of columns in matrix A */ 

    int main() 
    { 
     int i, j, ir, p = 0, t; 
     int *a; 
     a = (int*) malloc(sizeof(int)*NRA*NCA); 

     omp_set_num_threads(5); 

     for(t=0;t<100000;t++) 
     { 
      #pragma omp barrier 
      #pragma omp parallel for schedule (static,2) collapse(2) 
      for(i=0;i<NRA;i++) 
      { 
       for(j=0;j<NCA;j++) 
       { 
        ir=j*NRA+i; 
        a[ir] = 1; 
       } 
      } 

      #pragma omp single 
      { 
       for(i=0;i<NRA;i++) 
       { 
        for(j=0;j<NCA;j++) 
        { 
         ir=j*NRA+i; 
         if(a[ir] != 1) 
         { 
          p += 1; 
         } 
        } 
       } 
      } 

      #pragma omp parallel for schedule (static,2) collapse(2) 
      for(i=0;i<NRA;i++) 
      { 
       for(j=0;j<NCA;j++) 
       { 
        ir=j*NRA+i; 
        a[ir] = 0; 
       } 
      } 

      # pragma omp barrier 
     }//end t 

     printf("p is %d\n",p); 
    }

这是更大的代码，我不认为比赛状态是一个问题，因为我对外宣称的共享并行循环的所有变量和其他变量本地并行循环中。任何的意见都将会有帮助！

#define NRA 10     /* number of rows in matrix A */ 
    #define NCA 10     /* number of columns in matrix A */ 
    #define NCB 10     /* number of columns in matrix B */ 

    void matrixcalc (double *ad, double *bd, double *cd, int chunkd); 
    void printresults (double *cd, int chunkd); 
    void printrep (double *cd, int chunkd); 

    int main() 
    { 
     int nthreads, chunk, p = 0; 
     double *a,*b,*c; 
     a = (double*)malloc(NRA*NCA*sizeof(double)); 
     if(a==NULL) 
      printf("ho\n"); 
     b = (double*)malloc(NCA*NCB*sizeof(double)); 
     c = (double*)malloc(NRA*NCB*sizeof(double)); 

     omp_set_num_threads(5); 

     chunk = 2;     /* set loop iteration chunk size */ 
     int ir3, i1, j1; 

     /*** Spawn a parallel region explicitly scoping all variables ***/ 
     int t, tmax = 100000; 
     for(t=0;t<tmax;t++) 
     { 
      #pragma omp parallel shared(a,b,c,nthreads,chunk,t,tmax) 
      { 
       int tid = omp_get_thread_num(); 
       int i, j, ir; 
       if (tid == 0) 
       { 
        nthreads = omp_get_num_threads(); 
        // printf("Starting matrix multiple example with %d threads\n",nthreads); 
        // printf("Initializing matrices...\n"); 
       } 

       /*** Initialize matrices ***/ 
       #pragma omp for schedule (static, chunk) collapse(2) 
       for (i=0; i<NRA; i++) 
       { 
        for (j=0; j<NCA; j++) 
        { 
         ir =j*NRA+i; 
         a[ir]= 1.0; 
        } 
       } 
       #pragma omp for schedule (static, chunk) collapse(2) 
       for (i=0; i<NCA; i++) 
       { 
        for (j=0; j<NCB; j++) 
        { 
         ir = j*NCA+i; 
         b[ir] = 1.0; 
        } 
       } 
       #pragma omp for schedule (static, chunk) collapse(2) 
       for (i=0; i<NRA; i++) 
       {  
        for (j=0; j<NCB; j++) 
        { 
         ir=j*NRA+i; 
         c[ir]= 0.0; 
        } 
       } 
       /*** Do matrix multiply sharing iterations on outer loop ***/ 
       /*** Display who does which iterations for demonstration purposes ***/ 

       matrixcalc(a,b,c,chunk); 
       if(t!=tmax-1) 
       { 
        #pragma omp for schedule (static, chunk) collapse(2) 
        for(i=0;i<NRA;i++) 
        {  
         for(j=0;j<NCB;j++) 
         { 
          ir=j*NRA+i; 
          c[ir]=0.0; 
         } 
        } 
       } 
      }//end parallel region 

      for(i1=0;i1<NRA;i1++) 
      { 
       for(j1=0;j1<NCB;j1++) 
       { 
        ir3=j1*NRA+i1; 
        if(c[ir3]!=12.20000&&c[ir3]!=0.0) 
        { 
         printf("%lf\n",c[ir3]); 
         p+=1; 
        } 
       } 
      } 

     }//end t 
     printf("finalp\t%d\n",p); 
     for(i1=0;i1<NRA;i1++) 
     { 
      for(j1=0;j1<NCB;j1++) 
      { 
       ir3=j1*NRA+i1; 
       printf("%lf\t",c[ir3]); 
      } 
      printf("\n"); 
     } 
    } 

    void matrixcalc (double *a, double *b, double *c, int chunk) 
    { 
     int i,j,k,ir,ir1,ir2; 

     //printf("Thread %d starting matrix multiply...%d\n",tid,chunk); 
     double r = 1.0; 
     #pragma omp for schedule (static, chunk) collapse(3) 

     for (i=0; i<NRA; i++)  
     { 
      for(j=0; j<NCB; j++) 
      {  
       for (k=0; k<NCA; k++) 
       { 
        ir=j*NRA+i; 
        ir1=k*NRA+i; 
        ir2=j*NCA+k; 
        c[ir] += a[ir1] * b[ir2]; 
       } 
      } 
     } 
     #pragma omp for schedule (static, chunk) collapse(2) 
     for(i=0;i<NRA;i++) 
     { 
      for(j=0;j<NCB;j++) 
      { 
       ir=j*NRA+i; 
       c[ir]+=r*2.0; 
      } 
     } 
     #pragma omp single 
     { 
      double h; 
      h = 0.1; 
      h = 2.0*h; 
      for(i=0;i<NRA;i++) 
      { 
       for(j=0;j<NCB;j++) 
       { 
        ir=j*NRA+i; 
        c[ir]+=h; 
       } 
      } 
     }

来源

2017-07-17 PooriR

这些障碍是完全不必要的，因为它们在平行部分之外（不创建并行）。另外，'omp single'和'omp for'在块的末尾产生一个隐含的障碍，所以在任何情况下它都是多余的。考虑在't'循环之外使用并行（唯一）构造，然后在其余构造中使用普通的'omp for'和'omp single'。另外，改变'i'和'j'循环的顺序以顺序访问'a []'元素（改进缓存局部性）。 –

我了解这些障碍，并且我注意到了其他建议（尽管我不明白为什么！）。我试着在t循环之外放置并行构造，并且代码花费了更长的时间来提供输出，我不得不杀死程序。 – PooriR

关于你的错误 - 我不打算全面回顾诸如实质性改变的问题，但是来自并行区域内对'matrixcalc（a，b，c，chunk）'的调用需要麻烦。 – Zulan

该问题是ir上的竞争条件。由于它是在循环外定义的，因此隐含地为shared。你可以强制它为private，但最好是尽可能在本地声明变量。这使得关于OpenMP的代码很容易推理：

#pragma omp parallel for schedule (static,2) collapse(2) 
for(int i=0;i<NRA;i++) 
{ 
    for(int j=0;j<NCA;j++) 
    { 
     int ir = j*NRA+i; 
     a[ir] = 1; 
    } 
}

正如评论由豪尔赫·白龙，也有你的代码的其他问题相对于冗余的障碍和效率。

来源

2017-07-17 10:19:46 Zulan

谢谢！在类似但更大的代码中，我宣称ir，i和j是私有的，所以没有可立即发现的竞争条件，但是这对我在此处放置的代码起作用。 – PooriR

在更新中，您有@Zulan指出的类似问题。在这种情况下，它发生在'ir3'变量中。 –

我相信'ir3'只能用在并行区域之外。但我只能重复自己，默认情况下，**尽可能在本地声明变量**，并在同一行上初始化它们。你可以节省很多麻烦。 – Zulan

openmp并行的时间表构造给予不同的答案，只有很少的程序运行

回答

相关问题