2011-12-20 45 views
2

我使用共享内存运行过程中出现的follwoing代码:CUDA的MEMCHECK未出界的报告共享内存访问

__global__ void computeAddShared(int *in , int *out, int sizeInput){ 
    //not made parameters gidata and godata to emphasize that parameters get copy of address and are different from pointers in host code 
    extern __shared__ float temp[]; 

    int tid = blockIdx.x * blockDim.x + threadIdx.x; 
    int ltid = threadIdx.x; 
    temp[ltid] = 0; 
    while(tid < sizeInput){ 
     temp[ltid] += in[tid]; 
     tid+=gridDim.x * blockDim.x; // to handle array of any size 
    } 
    __syncthreads(); 
    int offset = 1; 
    while(offset < blockDim.x){ 
     if(ltid % (offset * 2) == 0){ 
      temp[ltid] = temp[ltid] + temp[ltid + offset]; 
     } 
     __syncthreads(); 
     offset*=2; 
    } 
    if(ltid == 0){ 
     out[blockIdx.x] = temp[0]; 
    } 

} 

int main(){ 

    int size = 16; // size of present input array. Changes after every loop iteration 
    int cidata[] = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; 
    /*FILE *f; 
    f = fopen("invertedList.txt" , "w"); 
     a[0] = 1 + (rand() % 8); 
     fprintf(f, "%d,",a[0]); 
     for(int i = 1 ; i< N; i++){ 
      a[i] = a[i-1] + (rand() % 8) + 1; 
      fprintf(f, "%d,",a[i]); 
     } 
     fclose(f);*/ 
    int* gidata; 
    int* godata; 
    cudaMalloc((void**)&gidata, size* sizeof(int)); 
    cudaMemcpy(gidata,cidata, size * sizeof(int), cudaMemcpyHostToDevice); 
    int TPB = 4; 
    int blocks = 10; //to get things kicked off 
    cudaEvent_t start, stop; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 
    cudaEventRecord(start, 0); 
    while(blocks != 1){ 
     if(size < TPB){ 
      TPB = size; // size is 2^sth 
     } 
     blocks = (size+ TPB -1)/TPB; 
     cudaMalloc((void**)&godata, blocks * sizeof(int)); 
     computeAddShared<<<blocks, TPB,TPB>>>(gidata, godata,size); 
     cudaFree(gidata); 
     gidata = godata; 
     size = blocks; 
    } 
    //printf("The error by cuda is %s",cudaGetErrorString(cudaGetLastError())); 


    cudaEventRecord(stop, 0); 
    cudaEventSynchronize(stop); 
    float elapsedTime; 
    cudaEventElapsedTime(&elapsedTime , start, stop); 
    printf("time is %f ms", elapsedTime); 
    int *output = (int*)malloc(sizeof(int)); 
    cudaMemcpy(output, gidata, sizeof(int), cudaMemcpyDeviceToHost); 
    //Cant free either earlier as both point to same location 
    cudaError_t chk = cudaFree(godata); 
    if(chk!=0){ 
     printf("First chk also printed error. Maybe error in my logic\n"); 
    } 

    printf("The error by threadsyn is %s", cudaGetErrorString(cudaGetLastError())); 
    printf("The sum of the array is %d\n", output[0]); 
    getchar(); 

    return 0; 
} 

显然,第一个while循环computeAddShared导致出界失误,因为我分配4字节到共享内存。为什么cudamemcheck不能理解这一点。以下是cuda-memcheck的输出:

========= CUDA-MEMCHECK 
time is 12.334816 msThe error by threadsyn is no errorThe sum of the array is 13 
6 

========= ERROR SUMMARY: 0 errors 

回答

3

共享内存分配粒度。硬件无疑具有分配的页面大小(可能与L1缓存线一侧相同)。每个块只有4个线程,在一个页面中“意外”会有足够的共享内存来让你编码工作。如果您使用了合理的线程数量(即,warp大小的整数倍),则会检测到错误,因为没有足够的分配内存。