2013-12-08 125 views
2

我想从全局内存复制到共享内存中,我做了以下复制全局共享内存

__global__ void test(unsigned char *image, unsigned char *out, int n, int m) 
{ 
     int x = threadIdx.x + blockIdx.x * blockDim.x; 
     int y = threadIdx.y + blockIdx.y * blockDim.y; 
     int index = x + y * blockDim.x * gridDim.x; 


    __shared__ unsigned char shared [16*16*3]; 

    if (threadIdx.x < 256) 

    { 

    shared[threadIdx.x*3+0] = image[index*3+0]; 
    shared[threadIdx.x*3+1] = image[index*3+1]; 
    shared[threadIdx.x*3+2] = image[index*3+2]; 


    } 

    __syncthreads(); 

    if (threadIdx.x < 256) 

    { 
    out[index*3+0] = shared[threadIdx.x*3+0]; 
    out[index*3+1] = shared[threadIdx.x*3+1]; 
    out[index*3+2] = shared[threadIdx.x*3+2]; 
    } 

} 

我有一个512×512的图像和我打电话那样的内核:

out = (unsigned char*) malloc(n*m*3); 
cudaMalloc((void**)&dev_image, n*m*3); 
cudaMalloc((void**)&dev_out, n*m*3); 
cudaMemcpy(dev_image, image, n*m*3, cudaMemcpyHostToDevice); 
cudaMemcpy(dev_out, out, n*m*3, cudaMemcpyHostToDevice); 

dim3 threads(16,16); 
dim3 blocks(32, 32); 

test<<<blocks, threads>>>(dev_image, dev_out, n, m); 
cudaThreadSynchronize(); 

cudaMemcpy(out, dev_out, n*m*3, cudaMemcpyDeviceToHost); 

任何想法我做错了什么?如何将全局内存的一部分复制到共享内存(一维)?

回答

3

在您的内核中,您检查threadIdx.x < 256这是错误的,因为threadIdx.x不能大于15.您必须改为在16x16线程块内检查您的索引。

我已经改变了你的内核,以这样的:

__global__ void test(unsigned char *image, unsigned char *out, int n, int m) 
{ 
    int x = threadIdx.x + blockIdx.x * blockDim.x; 
    int y = threadIdx.y + blockIdx.y * blockDim.y; 
    int index = x + y * blockDim.x * gridDim.x; 
    int blockIndex = threadIdx.x + threadIdx.y * blockDim.x; 

    __shared__ unsigned char shared [16*16*3]; 

    if (blockIndex < 256 && index < n*m) 
    { 
     shared[blockIndex*3+0] = image[index*3+0]; 
     shared[blockIndex*3+1] = image[index*3+1]; 
     shared[blockIndex*3+2] = image[index*3+2]; 
    } 

    __syncthreads(); 

    if (blockIndex < 256 && index < n*m) 
    { 
     out[index*3+0] = shared[blockIndex*3+0]; 
     out[index*3+1] = shared[blockIndex*3+1]; 
     out[index*3+2] = shared[blockIndex*3+2]; 
    } 
} 

你也永远不应该忘记的内核和CUDA API调用在内核范围检查(我已经添加,太)和适当的CUDA错误检查。