2013-08-29 75 views
0

我正在尝试将2d数组传递给内核,以便每个线程都可以访问index = threadIdx.x +(blockIdx.x * blockDim.x),但我无法计算出如何做到这一点以及如何将数据复制回来。管理2D CUDA阵列

size_t pitch; 
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks); 
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks * sizeof(int)); 
kernel<<<grid_size, block_size>>>(d_array, pitch); 
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size, num_blocks, cudaMemcpyDeviceToHost); 
for (num_blocks) 
    for(block_size) 
    h_array[block][thread] should be 1 

__global__ void kernel(int *array, int pitch) { 
    int *row = (int*)((char*)array + blockIdx.x * pitch); 
    row[threadIdx.x] = 1; 
    return; 
} 

我在做什么错,在这里?

+0

为什么要将数组转换为(char *)?这将导致一个错误的指针算术 – LarryPel

+0

这就是它在这两个问题中描述的: http://stackoverflow.com/questions/1047369/allocate-2d-array-on-device-memory-in-cuda http: //stackoverflow.com/questions/5029920/how-to-use-2d-arrays-in-cuda – user1743798

+0

@LarryPel:不,它不会。间距以字节为单位,并且需要指向字节大小的类型的指针才能正确执行指针计算。 – talonmies

回答

1

您的cudaMemset2D正在占用您之前通过cudaMallocPitch分配的更大的内存空间另外,您的cudaMemcpy2D正在复制该内存的一小部分。

你应该以下面的方式使用该函数:

cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks); 
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks) // * sizeof(int)); <- This size is bigger than the previously declared 
kernel<<<grid_size, block_size>>>(d_array, pitch); 
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size * sizeof(int) /* you forgot this here */, num_blocks, cudaMemcpyDeviceToHost); 
+0

此外,如果您已经完成了[适当的cuda错误检查](http://stackoverflow.com/questions/14038589/what -is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api)你的'cudaMemset2D'(至少)会引发错误。 –

+0

此外,大概你的'h_array'没有投入。因此,您在cudaMemcpy2D中传递的'pitch'参数不正确。你应该传递'block_size * sizeof(int)'(或者类似的东西)作为'h_array'的音高。 –

1

下面是经过与@hidrargyro提到的错误的基本测试了固定的完整代码:

$ cat t236.cu 
#include <stdio.h> 

#define cudaCheckErrors(msg) \ 
    do { \ 
     cudaError_t __err = cudaGetLastError(); \ 
     if (__err != cudaSuccess) { \ 
      fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ 
       msg, cudaGetErrorString(__err), \ 
       __FILE__, __LINE__); \ 
      fprintf(stderr, "*** FAILED - ABORTING\n"); \ 
      exit(1); \ 
     } \ 
    } while (0) 


__global__ void kernel(int *array, int pitch) { 
    int *row = (int*)((char*)array + blockIdx.x * pitch); 
    row[threadIdx.x] = 1; 
    return; 
} 

int main(){ 

int *d_array, *h_array; 
int block_size = 256; 
int num_blocks = 256; 
int grid_size = num_blocks; 
h_array=(int *)malloc(block_size*num_blocks*sizeof(int)); 
if (h_array==0) {printf("malloc fail\n"); return 1;} 
cudaMalloc((void **)&d_array, block_size*num_blocks*sizeof(int)); 
cudaCheckErrors("cudaMalloc fail"); 

size_t pitch; 
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks); 
cudaCheckErrors("cudaMallocPitch fail"); 
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks); 
cudaCheckErrors("cudaMemset2D fail"); 
kernel<<<grid_size, block_size>>>(d_array, pitch); 
cudaDeviceSynchronize(); 
cudaCheckErrors("kernel fail"); 

cudaMemcpy2D(h_array, block_size*sizeof(int), d_array, pitch, block_size*sizeof(int), num_blocks, cudaMemcpyDeviceToHost); 
cudaCheckErrors("cudaMemcpy 2D fail"); 
for (int i = 0; i<num_blocks; i++) 
    for(int j = 0; j<block_size; j++) 
    if (h_array[i*block_size+j] != 1) {printf("mismatch at i=%d, j=%d, should be 1, was %d\n", i,j,h_array[i*block_size+j]); return 1;} 
printf("success\n"); 
return 0; 
} 

$ nvcc -arch=sm_20 -o t236 t236.cu 
$ ./t236 
success 
$ 

如果您打算接受答案,请接受@hidrargyro给出的答案

+0

这就像CUDA_SAFE_CALL(),我总是怀疑while(0) – hidrargyro