2014-07-07 164 views
-5

有人能解释为什么当我的共享内存指针数组TMS在除第0个索引以外的某个索引处访问时,内核不工作(发生在最后一行)?如果最后一行使用TMS [0],则所有内容都按预期工作。当我将TMS [0]更改为任何其他索引时,出现CUDA意外错误。假设一个块上有64个线程。CUDA:在共享内存中使用全局线索索引将不起作用

#include <stdio.h> 
#include <stdlib.h> 
#include "cuda.h" 
#include <cuda_runtime.h> 
#include <curand_kernel.h> 

__global__ void myKern(float *masterForces) 
{ 
    int globalIdx = ...// set global thread id 

    volatile __shared__ float uniques[64]; 

    { 
     uniques[globalIDx] = 0; 
    } 

    __syncthreads(); 


    volatile __shared__ float *TMS[64]; 

    { 
     TMS[globalIdx] = (&uniques[globalIdx]); 
    } 

    __syncthreads(); 

    masterForces[globalIdx] = *TMS[1]; 
} 

原始环境,如果你很好奇:(你真的不需要看这个来解决我的问题)

#include <stdio.h> 
#include <stdlib.h> 
#include "cuda.h" 
#include "curand.h" 
#include <cuda_runtime.h> 
#include "math.h" 
#include <curand_kernel.h> 
#include <time.h> 


__global__ void myKern(const float *transMatrix, const int *pointerMatrix, float *masterForces, const double *rands, const int r_max) 
{ 




int globalIdx = ((blockIdx.x + (blockIdx.y * gridDim.x)) * (blockDim.x * blockDim.y)) + (threadIdx.x + (threadIdx.y * blockDim.x)); 

volatile __shared__ float uniques[51]; 

uniques[0] = transMatrix[0]; uniques[1] = transMatrix[1]; uniques[2] = transMatrix[2]; // 1 
uniques[3] = transMatrix[3]; uniques[4] = transMatrix[4]; uniques[5] = transMatrix[12]; // 2 
uniques[6] = transMatrix[14]; uniques[7] = transMatrix[15]; uniques[8] = transMatrix[24]; // 3 
uniques[9] = transMatrix[26]; uniques[10] = transMatrix[27]; uniques[11] = transMatrix[28]; // 4 
uniques[12] = transMatrix[40]; uniques[13] = transMatrix[50]; uniques[14] = transMatrix[60]; // 5 
uniques[15] = transMatrix[62]; uniques[16] = transMatrix[146]; uniques[17] = transMatrix[156]; // 6 
uniques[18] = transMatrix[158]; uniques[19] = transMatrix[168]; uniques[20] = transMatrix[170]; // 7 
uniques[21] = transMatrix[172]; uniques[22] = transMatrix[184]; uniques[23] = transMatrix[290]; // 8 
uniques[24] = transMatrix[300]; uniques[25] = transMatrix[302]; uniques[26] = transMatrix[312]; // 9 
uniques[27] = transMatrix[314]; uniques[28] = transMatrix[316]; uniques[29] = transMatrix[328]; // 10 
uniques[30] = transMatrix[1010]; uniques[31] = transMatrix[1020]; uniques[32] = transMatrix[1022]; // 11 
uniques[33] = transMatrix[1032]; uniques[34] = transMatrix[1034]; uniques[35] = transMatrix[1036]; // 12 
uniques[36] = transMatrix[1048]; uniques[37] = transMatrix[1154]; uniques[38] = transMatrix[1164]; // 13 
uniques[39] = transMatrix[1166]; uniques[40] = transMatrix[1176]; uniques[41] = transMatrix[1178]; // 14 
uniques[42] = transMatrix[1180]; uniques[43] = transMatrix[1192]; uniques[44] = transMatrix[2018]; // 15 
uniques[45] = transMatrix[2028]; uniques[46] = transMatrix[2030]; uniques[47] = transMatrix[2040]; // 16 
uniques[48] = transMatrix[2042]; uniques[49] = transMatrix[2044]; uniques[50] = transMatrix[2056]; // 17 

__syncthreads(); 


volatile __shared__ float *TMS[2592]; 

    for (int t=0; t<2592; t++)  
    { 
    for (int m=0; m< 51; m++){ 
     if (pointerMatrix[t] == m) 
     { 
     TMS[t] = (&uniques[m]); 
     } 
    } 
__syncthreads(); 


int b0 = 0; 
int c0 = 0; 
int d0 = 0; 
int e0 = 0; 
int f0 = 0; 
int g0 = 0; 
int h0 = 0; 
int i0 = 0; 
int j0 = 0; 
int k0 = 0; 
int l0 = 0; 
int m0 = 0; 
int n0 = 0; 
int o0 = 0; 
int p0 = 0; 
int q0 = 0; 
int r0 = 0; 
int s0 = 0; 
int t0 = 0; 
int u0 = 0; 
int v0 = 0; 
int w0 = 0; 
int x0 = 0; 
int y0 = 0; 




int index = 0; 
float r = 0.0; 
float temp = 0; 

int RUsnapshot = 0; 
int leftsnap = 0; 

curandState s; 
curand_init (rands[globalIdx] , 0, 0, &s); 



for (int i =0; i < 160000; i++) //@@@depends on iterations @@@@@ 
{ 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = b0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((0 * 6 + c0) * 6 + b0) * 2) * 6) ; 

      b0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = b0; 
     ///////////////////////////////////////////////////////////  
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = c0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + d0) * 6 + c0) * 2) * 6) ; 

      c0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = c0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = d0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + e0) * 6 + d0) * 2) * 6) ; 

      d0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = d0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = e0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + f0) * 6 + e0) * 2) * 6) ; 

      e0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = e0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = f0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + g0) * 6 + f0) * 2) * 6) ; 

      f0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = f0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = g0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + h0) * 6 + g0) * 2) * 6) ; 

      g0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = g0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = h0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + i0) * 6 + h0) * 2) * 6) ; 

      h0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = h0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = i0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + j0) * 6 + i0) * 2) * 6) ; 

      i0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = i0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = j0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + k0) * 6 + j0) * 2) * 6) ; 

      j0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = j0; 
     ///////////////////////////////////////////////////////////  
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = k0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + l0) * 6 + k0) * 2) * 6) ; 

      k0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = k0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = l0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + m0) * 6 + l0) * 2) * 6) ; 

      l0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = l0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = m0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + n0) * 6 + m0) * 2) * 6) ; 

      m0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = m0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = n0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + o0) * 6 + n0) * 2) * 6) ; 

      n0 += (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = n0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = o0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + p0) * 6 + o0) * 2) * 6) ; 

      o0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = o0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = p0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + q0) * 6 + p0) * 2) * 6) ; 

      p0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = p0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = q0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + r0) * 6 + q0) * 2) * 6) ; 

      q0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = q0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = r0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + s0) * 6 + r0) * 2) * 6) ; 

      r0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = r0; 
     ///////////////////////////////////////////////////////////  
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = s0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + t0) * 6 + s0) * 2) * 6) ; 

      s0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = s0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = t0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + u0) * 6 + t0) * 2) * 6) ; 

      t0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = t0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot =u0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + v0) * 6 + u0) * 2) * 6) ; 

      u0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap =u0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = v0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + w0) * 6 + v0) * 2) * 6) ; 

      v0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = v0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = w0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + x0) * 6 + w0) * 2) * 6) ; 

      w0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = w0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = x0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + y0) * 6 + x0) * 2) * 6) ; 

      x0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 

      leftsnap = x0; 
     /////////////////////////////////////////////////////////// 
     /////////////////////////////////////////////////// 
      r = curand_uniform(&s); 

      RUsnapshot = y0; 

      //index = ((((left[j] * dimen2 + right[j]) * dimen3 + RU[j +1 ]) * dimen4) * dimen5) ; 
      index = ((((leftsnap * 6 + 0) * 6 + y0) * 2) * 6) ; 

      y0+= (r < *TMS[index]) * (*TMS[index + 1]) + 
       (! (r < *TMS[index])) * (r < *TMS[index + 2]) * (*TMS[index + 3]) + 
       (! (r < *TMS[index + 2])) * (r < *TMS[index + 4]) * (*TMS[index + 5]) ; 


     /////////////////////////////////////////////////////////// 






      temp = (b0 ==4) + (b0 ==5) + (c0 ==4) + (c0 ==5) + (d0 ==4) + (d0 ==5) + (e0 ==4) + (e0 ==5) + (f0 ==4) + (f0 ==5) + 
        (g0 ==4) + (g0 ==5) + (h0 ==4) + (h0 ==5) + (i0 ==4) + (i0 ==5) + (j0 ==4) + (j0 ==5) + (k0 ==4) + (k0 ==5) + 
        (l0 ==4) + (l0 ==5) + (m0 ==4) + (m0 ==5) + (n0 ==4) + (n0 ==5) + (o0 ==4) + (o0 ==5) + (p0 ==4) + (p0 ==5) + 
        (q0 ==4) + (q0 ==5) + (r0 ==4) + (r0 ==5) + (s0 ==4) + (s0 ==5) + (t0 ==4) + (t0 ==5) + (u0 ==4) + (u0 ==5) + 
        (v0 ==4) + (v0 ==5) + (w0 ==4) + (w0 ==5) + (x0 ==4) + (x0 ==5) + (y0 ==4) + (y0 ==5); 


     masterForces[globalIdx + (r_max * i)] = *TMS[1]; 
     temp = 0.0; 

} 



} 

} 
+1

发布一个完整的代码,演示错误请。 SO期待这一点。 –

+0

在*共享内存*数组中使用* global *索引作为偏移量通常不是一个好兆头... –

+0

好的,但我警告过你,这很复杂。上面的代码应该能够解决这个问题。 – Jordan

回答

0

您的示例代码不应像Robert Crovella证明的那样给出任何错误。我查看了您的原始代码,发现由于第一个“for”循环导致的未对齐括号。祝你好运...

+0

好眼睛。我现在感觉像一个屁股发布这个可怕的问题。非常感谢。 – Jordan

4

假设你的全局线程指数(globalIdx)事实上是一个全局线程指数,如像:

int globalIdx = threadIdx.x+blockDim.x*blockIdx.x; // e.g. for 1D grid/threadblock 

那么你不能索引当你的全局线程指数超过63一64项的共享存储阵列:

uniques[globalIDx] = 0; 

TMS[globalIdx] = (&uniques[globalIdx]); 

相反,如果你只使用线程索引:

uniques[threadIdx.x] = 0; 

,并在你的代码相似的其他地方,你至少应该索引到共享存储阵列是OK ,假设你每块有64个线程。

这里是一个完全样例:

$ cat t463.cu 
#include <stdio.h> 
#define DSIZE 128 
#define nTPB 64 
__global__ void myKern(float *masterForces) 
{ 
    int globalIdx = threadIdx.x+blockDim.x*blockIdx.x; 

    volatile __shared__ float uniques[nTPB]; 

    { 
     uniques[threadIdx.x] = 0; 
    } 

    __syncthreads(); 


    volatile __shared__ float *TMS[nTPB]; 

    { 
     TMS[threadIdx.x] = &(uniques[threadIdx.x]); 
    } 

    __syncthreads(); 

    masterForces[globalIdx] = *TMS[1]; 
} 

int main(){ 

    float *d_data, *h_data; 
    h_data=(float *)malloc(DSIZE*sizeof(float)); 
    cudaMalloc(&d_data, DSIZE*sizeof(float)); 
    for (int i = 0; i< DSIZE; i++) 
    h_data[i] = 1.0f; 
    cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice); 
    myKern<<<DSIZE/nTPB, nTPB>>>(d_data); 
    cudaMemcpy(h_data, d_data, DSIZE*sizeof(float), cudaMemcpyDeviceToHost); 
    for (int i = 0; i<DSIZE; i++) 
    if (h_data[i] != 0.0f) {printf("mismatch at %d, was: %f should be: %f\n", i, h_data[i], 0.0f); return 1;} 
    printf("Success\n"); 
    return 0; 
} 
$ nvcc -arch=sm_20 -o t463 t463.cu 
$ cuda-memcheck ./t463 
========= CUDA-MEMCHECK 
Success 
========= ERROR SUMMARY: 0 errors 
$ 

在未来,请提供整个例子展示了问题,这样的问题。 SO期待这一点,如果你没有提供这个问题,这是一个有效的密切原因。

+0

对不起,迟到了,正在走我的狗。邪恶的回复,谢谢罗伯特。只有一件事我不明白。为什么globalIdx不工作?如果我有64个线程,globalIdx不应该超过63.你能帮我理解吗? – Jordan

+0

如果将整个内核*限制为64个线程,则'globalIdx'可以工作。为了证明这一点,只需将我的代码中的#define DSIZE更改为'64',然后就可以在任何具有'threadIdx.x'的地方使用'globalIdx'。 (这样做的净效果只会启动一个块)但是如果您的整个内核启动(即网格)涉及多于64个线程,则不能使用全局索引索引到共享内存数组中。一旦你到达第二个街区,全球指数从64开始,然后到127(在这个例子中)。 –