
So I have some neural network simulator code that works correctly on the CPU, and with a single block of 32 threads the parallel version matches the serial version to at least 6 decimal places on both of my CUDA cards under Win7. With 1 block and 64 threads, however, the Wt values come out slightly different, typically agreeing to no more than 3 decimal places, and when I try to eliminate the race condition by embedding __syncthreads() inside the loops, the Wt values show up as NaN when copied back to the CPU. Different numbers of threads, different answers.

Can someone give me a hint as to what I might be doing wrong? I have included the parallel code below; knlBackProp is called with lSampleQtyReq = 10000, o = 1, and Option = 'R':

// device-global variables to facilitate data transfer 
__device__ __constant__ __align__(8) struct rohanContext devSes; 
__device__ __constant__ struct rohanLearningSet devLearn; 
__device__ __align__(16) struct rohanNetwork devNet; 

__device__ double devdReturn[1024*1024]; 
__device__ double devdRMSE=0; 
__device__ int devlReturn[1024*1024]; 
__device__ int devlTrainable=0; 

extern"C" 
int knlBackProp(struct rohanContext& rSes, long lSampleQtyReq, long o, char Option) 
{mIDfunc /*! divides error in yielded values and back-propagates corrections among weights */ 
// Option S - single sample correction only 
// Option E - keep existing weights, count trainable samples only 
// Option R - perform corrections for all trainable samples 
    int lTotal=0; 

    cudaMemcpyToSymbol("devlTrainable", &lTotal, sizeof(int)); // init return value on both sides 
     mCheckCudaWorked 
    cudaEvent_t start, stop; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 

      cudaEventRecord(start, 0); 
     mtkBackPropMT<<< rSes.iBpropBlocks , rSes.iBpropThreads >>>(lSampleQtyReq, o, Option); 
      cudaEventRecord(stop, 0); 
      mCheckCudaWorked 

    cudaMemcpyFromSymbol(&lTotal, "devlTrainable", sizeof(long)); // retrieve return value 
     mCheckCudaWorked 
    cudaEventSynchronize(stop); 
     float elapsedTime; 
     cudaEventElapsedTime(&elapsedTime, start, stop); 
    conPrintf("DEVICE: Time to complete BackProp kernel: %3.1f ms\n", elapsedTime); 
     cudaEventDestroy(start); 
     cudaEventDestroy(stop); 

    return lTotal; 
} 


__global__ void mtkBackPropMT(long lSampleQtyReq, long o, char Option) 
{/*! divides error in yielded values and back-propagates corrections among weights */ 
// Option S - single sample correction only 
// Option E - keep existing weights, count trainable samples only 
// Option R - perform corrections for all trainable samples 

    if(Option=='E' || Option=='e'){ // 
     devlTrainable=0; // reset global mem trainable counter 
     subkBackPropEoptMT(lSampleQtyReq, o); 
    } 

    if(Option=='S' || Option=='s'){ 
     devlTrainable=0; // reset global mem trainable counter 
     subkBackPropSoptMT(lSampleQtyReq, false, devNet, devNet.Signals, devNet.Zs, devNet.Wt, devNet.Deltas, devLearn.gpuXInputs, devLearn.gpuYEval, devLearn.gpudYEval); 
    } 

    if(Option=='R' || Option=='r'){ // 
     devlTrainable=0; // reset global mem trainable counter 
     subkBackPropRoptMT(lSampleQtyReq, o); 
    } 
} 


__device__ void subkBackPropRoptMT(long lSampleQtyReq, long o) 
{/*! flags and counts trainable samples and back-propagates a correction for each */ 
    long OUTROWLEN=devLearn.iOutputQty+1; // prepare array index and width 
    //long tIx = threadIdx.x + devSes.iEvalThreads * blockIdx.x; // tIx is thread index over the kernel 
    long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel 
    //long lTotalThreads = devSes.iBpropThreads * devSes.iBpropBlocks; // total number of threads 
    double maxSquared = devSes.dMAX * devSes.dMAX ; // needed to compare to stored delta-squared values 

    devlTrainable=0; // clear global mem accumulator; out of bound samples will remain at this value 
    for (long s=0; s<lSampleQtyReq; ++s){ // iterate over samples 
     if(devLearn.gpudSE1024[IDX2C(o, s, OUTROWLEN)] > maxSquared){ // if the MAX criterion is exceeded 
      if(tIx==0)++devlTrainable; // increment the counter 
      subkBackPropSoptMT(s, true, devNet, devNet.Signals, devNet.Zs, devNet.Wt, devNet.Deltas, devLearn.gpuXInputs, devLearn.gpuYEval, devLearn.gpudYEval); 
     } 
    } 
} 


__device__ void subkBackPropSoptMT(long s, int o, rohanNetwork& Net, cuDoubleComplex * Signals, cuDoubleComplex * Zs, cuDoubleComplex * Wt, cuDoubleComplex * Deltas, cuDoubleComplex * XInputs, cuDoubleComplex * YEval, double * dYEval) 
{/*! propagates adjustment of weights backwards through the preceding layers from the chosen network output. */ 
    // s is sample's index 
    // o is an optional method selection parameter; print/don't print as of 2/29/12 
    long index, kindex; // for warpwise loops 
    long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel 
    long lTotalThreads = gridDim.x * blockDim.x; // total number of threads 
    const cuDoubleComplex cdcZero = { 0, 0 }; 

    /* clear all temp values BP0 */ 
    for (long offset=0; (index =offset+tIx)< MAXNEURONS ; offset+=lTotalThreads){ // index stands for i 
     Deltas[index]=cdcZero; 
     Signals[index]=cdcZero; 
     Zs[index]=cdcZero; 
    } 
    /* re-evaluate sample to load temp values. BPI */ 
    subkEvalSampleBetaMT(devSes, s, Net, (s==0), Signals, Zs, Wt, XInputs, YEval, dYEval); 
    /* begin error calculation. BPII */ 
    cuDoubleComplex Deltastar /* measured error at the chosen network output. */ ; 
    /* calc top layer deltas. */ 
    long TOP=Net.iLayerQty-1; 
    int ROWLEN=Net.iNeuronQTY[TOP]; 
    //for(int i=0; i<Net.iNeuronQTY[TOP]; ++i){ 
    for (long offset=0; (index =offset+tIx)< Net.iNeuronQTY[TOP] ; offset+=lTotalThreads){ // index stands for i 
     // delta-star = D - Y = Desired output minus actual output from evaluation 
     // D is the cplx coords of the sector of the desired answer  Y is the complex result of evaluation of the given sample, unactivated. */ 
     Deltastar = CxSubtractCxUT( 
         devLearn.gpuDOutputs[ IDX2C(index, s, ROWLEN) ], 
         Signals[Net.iNeuronOfst[TOP]+index]); 
     /* divide the correction; delta = alpha * delta-star/n+1 (but alpha is always 1 for now). */ 
     //Deltas[Net.iNeuronOfst[TOP]+index] = CxDivideRlUT(Deltastar, Net.iDendrtQTY[TOP]); 
     Deltas[Net.iNeuronOfst[TOP]+index] = CxMultiplyRlUT(Deltastar, Net.dINV_S[TOP]); 
    } 
    __syncthreads(); 
    /* Now distribute the correction to lower layers if any. BPII.1 */ 
    if (Net.iLayerQty>2){ /* remember layer 0 = inputs, layer 1 = bottom row, layer {2..iLayerQty-2} = middle row, layer iLayerQty-1 = top row. */ 
     for (int L=Net.iLayerQty-1; L>1; --L){ 
      long LAY = L; /* setup access to layers. */ 
      long TRIB = L-1; /* trib for tributary.*/ 
      int iTributQTY=Net.iNeuronQTY[TRIB]; 
      //int Sj=Net.iDendrtQTY[TRIB]; if (TRIB==1) Sj=1; // Sj=1 for first hidden layer 
      for (int i=1; i<Net.iNeuronQTY[LAY]; ++i) { // skip 0th neuron as its weights are either 1 (div identity) or 0 (div forbidden) and don't change anyway 
       // k index must begin at 1, neuron zero not valid for correction 
       //for (int k=1; k<iTributQTY; ++k) { /* the contribution to ith neuron's kth tributary's delta = i's delta/i's weight k. */ 
       for (long offset=1; (kindex =offset+tIx)< iTributQTY ; offset+=lTotalThreads){ // kindex stands for k 
            Deltas[Net.iNeuronOfst[TRIB]+kindex] 
        = CxAddCxUT (Deltas[Net.iNeuronOfst[TRIB]+kindex] , 
         CxDivideCxUT( 
          Deltas[Net.iNeuronOfst[LAY]+i] , 
          Wt[IDX2C(Net.iWeightOfst[LAY]+kindex, i, iTributQTY)])); 
       } 
      } 
      for (long offset=1; (kindex =offset+tIx)< iTributQTY ; offset+=lTotalThreads){ // kindex stands for k 
       //cuDoubleComplex preDiv=Deltas[Net.iNeuronOfst[TRIB]+kindex]; // diagnostic purpose only, remove if removing other diags 
       //Deltas[Net.iNeuronOfst[TRIB]+kindex] 
       // = CxDivideRlUT( 
       //  Deltas[Net.iNeuronOfst[TRIB]+kindex] , 
       //  Sj); 
       Deltas[Net.iNeuronOfst[TRIB]+kindex] 
        = CxMultiplyRlUT( 
         Deltas[Net.iNeuronOfst[TRIB]+kindex] , 
         Net.dINV_S[TRIB]); 
      } 
     } 
    } 
    __syncthreads(); 
    /* error distribution completed */ 
    /* and now update the weights BP III */ 
    /* adj weights on first hidden layer. */ 
     int FHID = 1; 
     int SIG = 0; 
     int iSignalQTY=Net.iNeuronQTY[SIG]; //rSes.rLearn->iInputQty+1; 
     int iHidWidth=Net.iNeuronQTY[FHID]; 
    for (int k=1; k<iHidWidth; ++k){ 
     //for (int i=0; i<iSignalQTY; ++i){ 
     for (long offset=0; (index =offset+tIx)< iSignalQTY ; offset+=lTotalThreads){ // index stands for i 
      /* dW=d*xbar/s1/|z|= neuron's delta * input's conjugate/(dendrites+1 * abs of input i). */ 
         Wt[IDX2C(Net.iWeightOfst[FHID]+index, k, iSignalQTY)] 
      =CxAddCxUT(Wt[IDX2C(Net.iWeightOfst[FHID]+index, k, iSignalQTY)] , 
       CxDivideRlUT( 
        CxMultiplyCxUT( 
         Deltas[Net.iNeuronOfst[FHID]+k] , 
         CxConjugateUT(Signals[Net.iNeuronOfst[SIG]+index]) 
        ) , 
        CxAbsUT(Zs[Net.iNeuronOfst[FHID]+k]) // N+1 denominator factor is considered redundant - JAW & IA 2/27/12 
       ) 
      ); 
     } 
    } 
    __syncthreads(); 
    /* re-evaluate sample to update temp values. */ 
    subkEvalSampleBetaMT(devSes, s, Net, false, Signals, Zs, Wt, XInputs, YEval, dYEval); 
    if (Net.iLayerQty>2){ 
     /* now use those outputs' conjugates and the deltas to adjust middle layers. BP III.1 */ 
     for (int L=2; L<Net.iLayerQty-1; ++L){ 
      /* setup access to layers. */ 
      long LAY = L; 
      long TRIB = L-1; 
      //int iLayWidth=Net.iNeuronQTY[LAY]; 
      int iTribWidth=Net.iNeuronQTY[TRIB]; 
      for (int k=1; k<Net.iNeuronQTY[LAY]; ++k){ 
       //for (int i=0; i<Net.iNeuronQTY[TRIB]; ++i){ 
       for (long offset=0; (index =offset+tIx)< Net.iNeuronQTY[TRIB] ; offset+=lTotalThreads){ // index stands for i 
        /* the adjustment added to kth neuron's ith trib's weight = k's delta * complex conjugate of i's signal/(abs of k's previous-wt product-sum * dendrites+1) . */ 
           Wt[IDX2C(Net.iWeightOfst[LAY]+index, k, iTribWidth)] 
        =CxAddCxUT(Wt[IDX2C(Net.iWeightOfst[LAY]+index, k, iTribWidth)] , 
         CxDivideRlUT( 
          CxMultiplyCxUT( 
           Deltas[Net.iNeuronOfst[LAY]+k] , 
           CxConjugateUT(Signals[Net.iNeuronOfst[TRIB]+index]) 
          ) , 
          ( 
           CxAbsUT(Zs[Net.iNeuronOfst[LAY]+k]) // N+1 denominator factor is considered redundant - JAW & IA 2/27/12 
          ) 
         ) 
        ); 
       } 
      } 
      /* layer is complete. */ 
      subkEvalSampleBetaMT(devSes, s, Net, true, Signals, Zs, Wt, XInputs, YEval, dYEval); 
     } 
    } 
    __syncthreads(); 

    /* correct output layer BP III.3 */ 
    long SUB = TOP-1; 
    //int iTopWidth=Net.iNeuronQTY[TOP]; 
    int iSubWidth=Net.iNeuronQTY[SUB]; 

    for (int k=1; k<Net.iNeuronQTY[TOP]; ++k){ 
     //for (int i=0; i<Net.iNeuronQTY[SUB]; ++i){ 
     for (long offset=0; (index =offset+tIx)< Net.iNeuronQTY[SUB] ; offset+=lTotalThreads){ // index stands for i 
      /* For last layer only, adjustment to kth neuron's ith weight = k's delta * complex conjugate of i's signal/(dendrites+1) . */ 
         Wt[IDX2C(Net.iWeightOfst[TOP]+index, k, iSubWidth)] 
      =CxAddCxUT(Wt[IDX2C(Net.iWeightOfst[TOP]+index, k, iSubWidth)] , 
       CxMultiplyCxUT( 
        Deltas[Net.iNeuronOfst[TOP]+k] , 
        CxConjugateUT(Signals[Net.iNeuronOfst[SUB]+index]) 
       ) 
      ); // N+1 denominator factor is considered redundant - JAW & IA 2/27/12 
     } 
    } 
    /* backprop is complete. */ 
} 


__device__ void subkEvalSampleBetaMT(rohanContext& Ses, long s, rohanNetwork& Net, int o, cuDoubleComplex * Signals, cuDoubleComplex * Zs, cuDoubleComplex * Wt, cuDoubleComplex * XInputs, cuDoubleComplex * YEval, double * dYEval) 
{// Beta uses fixed length fields instead of nested pointer layers 
    // delta squared is not updated, since they'll be updated when RMSE is checked at the end of a pass through the learning set 
    long index, kindex; // for warpwise loops 
    long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel 
    long lTotalThreads = gridDim.x * blockDim.x; // total number of threads 
    const cuDoubleComplex cdcZero = { 0, 0 }; 
    /*! layer zero (inputs) is special. */ 
    long INROWLEN=Net.iNeuronQTY[0];//rSes.rLearn->iInputQty+1; 
    //for (int i=0; i<INROWLEN; ++i){ 
    for (long offset=0; (index =offset+tIx)< INROWLEN ; offset+=lTotalThreads){ // index stands for i 
     Signals[Net.iNeuronOfst[0]+index]= XInputs[IDX2C(index, s, INROWLEN)]; 
    } 
    /*! middle and top layers. */ 
    for (int L=1; L<Net.iLayerQty; ++L){ 
     //struct rohanLayer& lay = Net.rLayer[L]; 
     long LAY=L; 
     int TRIB=L-1; // index of previous layer 
     int iNeuronQTY=Net.iNeuronQTY[LAY]; 
     int iSignalQTY=Net.iDendrtQTY[LAY]; // signal qty depends on size of previous layer 
     //for (int k=0; k<iNeuronQTY; ++k){ //Neuron zero is not skipped, its output should be 1+0i as a check 
     for (long offset=0; (kindex =offset+tIx)< iNeuronQTY ; offset+=lTotalThreads){ // kindex stands for k 
      Zs[Net.iNeuronOfst[LAY]+kindex]=cdcZero; 
      for (int i=0; i<iSignalQTY; ++i){ //walk weights on inputs from previous layer 
          Zs[Net.iNeuronOfst[LAY]+kindex] = 
       CxAddCxUT(Zs[Net.iNeuronOfst[LAY]+kindex] , 
        CxMultiplyCxUT(
         Wt[IDX2C(Net.iWeightOfst[LAY] + i, kindex, iSignalQTY)], 
         Signals[Net.iNeuronOfst[TRIB]+i])) ; 
      } 
      // ACTIVATE // 
      Signals[Net.iNeuronOfst[LAY]+kindex] = CxActivateUT(Zs[Net.iNeuronOfst[LAY]+kindex]); 
     } 
    } 
    /*! last layer values are converted and stored here */ 
    long TOP = Net.iLayerQty-1; 
    long OUTROWLEN=Net.iNeuronQTY[TOP]; 
    //for (int i=0; i<Net.iNeuronQTY[TOP]; ++i){ // continuous conversion begins here 
    for (long offset=0; (index =offset+tIx)< OUTROWLEN ; offset+=lTotalThreads){ // index stands for i 
     YEval[IDX2C(index, s, OUTROWLEN)]= Signals[Net.iNeuronOfst[TOP]+index] ; // store final complex output(s) 
     dYEval[IDX2C(index, s, OUTROWLEN)]=FUnitCxUT(YEval[IDX2C(index, s, OUTROWLEN)]) * Net.iSectorQty; // convert final complex outputs to sectors and store that 
     if(devLearn.iContOutputs==false) // round off decimal if disc activation is set 
      dYEval[IDX2C(index, s, OUTROWLEN)]=int(dYEval[IDX2C(index, s, OUTROWLEN)]); 
    } 
    /*! end of sample evaluation. */ 
} 

__device__ cuDoubleComplex CxActivateUT(const cuDoubleComplex Z) 
{/// applies ContActivation or discrete activation function to cx neuron output and returns Phi(Z) 
    /// This fn should be phased out in favor of a GPU device vector based fn 
    cuDoubleComplex phi; 
    if (devNet.bContActivation) { // apply ContActivation activation function to weighted sum : phi(z)=z/|z| 
     phi = CxDivideRlUT(Z, CxAbsUT(Z)); 
    } 
    else { // apply Discrete activation function to weighted sum : s=int(arctan(z)*k/2pi), phi(z)=(X(s),Y(s)) 
     double theta = atan2(Z.y, Z.x); // theta = arctan y/x 
     int iSector = (int)((theta * devNet.dK_DIV_TWO_PI) + devNet.iSectorQty) % devNet.iSectorQty; 
     phi = devNet.gpuSectorBdry[iSector]; 
     //printf(" %f+%fi %d Activate\n", phi.x, phi.y, threadIdx.x); 
    } 
    return phi; 
} 

I'm not familiar with CUDA, but the code you've posted runs to more than 300 lines. Ever heard of [sscce.org](http://sscce.org/)? – 2012-04-27 05:15:15


You can't realistically expect someone to sit down and sift through 300+ lines of hard-to-follow, recursive code that can't actually be run, looking for what may be a subtle memory race. If you won't make the effort to narrow the problem down to a compact repro case that other people can compile and run, why should you expect anyone else to make the effort to answer your question? – talonmies 2012-04-27 06:29:25

Answer


So, I'm not going to read all the code, but I can give you a strong hint. The warp size is 32 threads, so the 64-thread case runs two warps per block. In the former case you cannot have any race conditions based on instruction pointers; in the latter you will have two groups of threads being scheduled at different times with different IPs. You probably already know much of this (hence the syncthreads), but the above makes it all but certain that you simply have a race condition you have not yet resolved.
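
To make that concrete, here is a minimal sketch of the kind of inter-warp ordering hazard being described. It is a hypothetical kernel, not taken from the code above: with a 32-thread block there is only one warp, so the read below always follows the write, but with 64 threads the second warp's slots may not have been written yet when the first warp reads them.

// Hypothetical sketch of an inter-warp race; assumes blockDim.x is 32 or 64.
__global__ void interWarpRace(double *out)
{
    __shared__ double buf[64];
    int i = threadIdx.x;

    buf[i] = (double)i;                           // step 1: each thread fills its own slot
    // a __syncthreads() belongs here; without it the read below races across warps
    double partner = buf[(i + 32) % blockDim.x];  // step 2: with 64 threads this slot belongs to the other warp
    out[i] = partner;
}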

Putting in the syncthreads is a good start at trying to isolate it. Are you sure that, within your loops, one warp's source data isn't being overwritten by the other warp? If you're not sure, try putting syncthreads into the inner loops, purely for debugging purposes, to see what might be causing the race condition.
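
As an illustration of what that debugging step could look like, here is a rough sketch against a made-up loop (none of these names come from the code above, and it assumes n is a multiple of blockDim.x so every thread runs the same number of iterations). The idea is to separate the phase that fills shared storage from the phase that consumes it and put a barrier between the two; if the 64-thread result then matches the 32-thread result, the race lies between those phases.

// Hypothetical debugging sketch; assumes blockDim.x <= 64 and n is a multiple of blockDim.x.
__global__ void debugBarriers(double *data, int n)
{
    __shared__ double stage[64];
    for (int offset = 0; offset < n; offset += blockDim.x) {
        stage[threadIdx.x] = data[offset + threadIdx.x];     // phase 1: every thread fills its own slot
        __syncthreads();                                     // uniform loop bounds, so every thread reaches this barrier
        // phase 2: consume a slot the other warp filled; without the barrier above,
        // a 64-thread block could read a slot that has not been written yet
        data[offset + threadIdx.x] = stage[blockDim.x - 1 - threadIdx.x];
        __syncthreads();                                     // keep the warps in step before stage is reused
    }
}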


While this is good advice, using '__syncthreads()' inside an inner loop needs care. The execution model requires that every thread in a given warp execute the 'bar.sync' instruction before the warp can proceed. If there is branch divergence within the warp such that one or more threads branch around the 'bar.sync' instruction, the result is deadlock. – talonmies 2012-04-27 06:33:41
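
A small hypothetical example of the pattern being warned about, and a safer arrangement (again, not the poster's code): keep the conditional around the work, not around the barrier, so every thread in the block reaches the bar.sync.

// Hypothetical example; assumes 0 < n <= blockDim.x <= 64.
__global__ void barrierPlacement(const double *in, double *out, int n)
{
    __shared__ double partial[64];
    int i = threadIdx.x;

    // BAD (can deadlock): a barrier inside a branch that not every thread takes.
    // if (i < n) {
    //     partial[i] = in[i] * 2.0;
    //     __syncthreads();          // threads with i >= n never arrive here
    // }

    // SAFE: only the work is conditional; the barrier is executed by every thread.
    if (i < n) {
        partial[i] = in[i] * 2.0;
    }
    __syncthreads();                  // reached by all threads in the block
    if (i < n) {
        out[i] = partial[(i + 1) % n];    // cross-thread read is why the barrier matters
    }
}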


CUDA has a number of source-level debuggers available. I suggest you single-step through the code to help narrow down the problem. As a matter of style, avoid using long. It is not portable. – 2012-04-27 06:48:38
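
A generic illustration of the portability point (not tied to any particular line above, though note that in the posted code lTotal is an int while sizeof(long) bytes are copied into it, which happens to be harmless on Win7 where both are 4 bytes but would not be on 64-bit Linux, where long is 8 bytes):

#include <cstdint>
#include <iostream>

int main()
{
    // long is 4 bytes on 64-bit Windows but 8 bytes on 64-bit Linux, which is
    // why sizeof(long) in a memcpy-style call is easy to get wrong when porting;
    // fixed-width types make the intended size explicit.
    std::cout << "sizeof(long) = "      << sizeof(long)
              << ", sizeof(int32_t) = " << sizeof(int32_t)
              << ", sizeof(int64_t) = " << sizeof(int64_t) << std::endl;
    return 0;
}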
