存在于计算能力1.3 GPU的全局内存中的一个无符号字符数组的步进存取问题。为了绕过全局存储器的聚结的要求,螺纹依次访问全局存储器和复制仅使用2存储器事务以下示例阵列到共享存储器:如何避免将数据从全局数据加载到共享内存时发生银行冲突
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
__global__ void kernel (unsigned char *d_text, unsigned char *d_out) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
extern __shared__ unsigned char s_array[];
uint4 *uint4_text = (uint4 *) d_text;
uint4 var;
//memory transaction
var = uint4_text[0];
uchar4 c0 = *reinterpret_cast<uchar4 *>(&var.x);
uchar4 c4 = *reinterpret_cast<uchar4 *>(&var.y);
uchar4 c8 = *reinterpret_cast<uchar4 *>(&var.z);
uchar4 c12 = *reinterpret_cast<uchar4 *>(&var.w);
s_array[threadIdx.x*16 + 0] = c0.x;
s_array[threadIdx.x*16 + 1] = c0.y;
s_array[threadIdx.x*16 + 2] = c0.z;
s_array[threadIdx.x*16 + 3] = c0.w;
s_array[threadIdx.x*16 + 4] = c4.x;
s_array[threadIdx.x*16 + 5] = c4.y;
s_array[threadIdx.x*16 + 6] = c4.z;
s_array[threadIdx.x*16 + 7] = c4.w;
s_array[threadIdx.x*16 + 8] = c8.x;
s_array[threadIdx.x*16 + 9] = c8.y;
s_array[threadIdx.x*16 + 10] = c8.z;
s_array[threadIdx.x*16 + 11] = c8.w;
s_array[threadIdx.x*16 + 12] = c12.x;
s_array[threadIdx.x*16 + 13] = c12.y;
s_array[threadIdx.x*16 + 14] = c12.z;
s_array[threadIdx.x*16 + 15] = c12.w;
d_out[idx] = s_array[threadIdx.x*16];
}
int main (void) {
unsigned char *d_text, *d_out;
unsigned char *h_out = (unsigned char *) malloc (32 * sizeof (unsigned char));
unsigned char *h_text = (unsigned char *) malloc (32 * sizeof (unsigned char));
int i;
for (i = 0; i < 32; i++)
h_text[i] = 65 + i;
cudaMalloc ((void**) &d_text, 32 * sizeof (unsigned char));
cudaMalloc ((void**) &d_out, 32 * sizeof (unsigned char));
cudaMemcpy (d_text, h_text, 32 * sizeof (unsigned char), cudaMemcpyHostToDevice);
kernel<<<1,32,16128>>>(d_text, d_out);
cudaMemcpy (h_out, d_out, 32 * sizeof (unsigned char), cudaMemcpyDeviceToHost);
for (i = 0; i < 32; i++)
printf("%c\n", h_out[i]);
return 0;
}
的问题是,组冲突在将数据复制到共享内存时发生(由nvprof报告,上述示例的冲突为384次),这会导致线程的序列化访问。
共享内存分为16个(或更新设备体系结构中的32个)32位存储区,以便同时服务相同半经线的16个线程。数据交错存储在第i个32位字始终存储在i%16-1共享存储区中。
由于每个线程读取一个内存事务的16个字节,这些字符将以交叉方式存储到共享内存中。这会导致线程0,4,8,12之间的冲突; 1,5,9,13; 2,6,10,14; 3,7,11,15是同一个半经线。消除体冲突一个天真的方法是使用的if/else分支将数据存储在类似以下内容的循环方式共享内存,但导致一些严重的线程分歧:
int tid16 = threadIdx.x % 16;
if (tid16 < 4) {
s_array[threadIdx.x * 16 + 0] = c0.x;
s_array[threadIdx.x * 16 + 1] = c0.y;
s_array[threadIdx.x * 16 + 2] = c0.z;
s_array[threadIdx.x * 16 + 3] = c0.w;
s_array[threadIdx.x * 16 + 4] = c4.x;
s_array[threadIdx.x * 16 + 5] = c4.y;
s_array[threadIdx.x * 16 + 6] = c4.z;
s_array[threadIdx.x * 16 + 7] = c4.w;
s_array[threadIdx.x * 16 + 8] = c8.x;
s_array[threadIdx.x * 16 + 9] = c8.y;
s_array[threadIdx.x * 16 + 10] = c8.z;
s_array[threadIdx.x * 16 + 11] = c8.w;
s_array[threadIdx.x * 16 + 12] = c12.x;
s_array[threadIdx.x * 16 + 13] = c12.y;
s_array[threadIdx.x * 16 + 14] = c12.z;
s_array[threadIdx.x * 16 + 15] = c12.w;
} else if (tid16 < 8) {
s_array[threadIdx.x * 16 + 4] = c4.x;
s_array[threadIdx.x * 16 + 5] = c4.y;
s_array[threadIdx.x * 16 + 6] = c4.z;
s_array[threadIdx.x * 16 + 7] = c4.w;
s_array[threadIdx.x * 16 + 8] = c8.x;
s_array[threadIdx.x * 16 + 9] = c8.y;
s_array[threadIdx.x * 16 + 10] = c8.z;
s_array[threadIdx.x * 16 + 11] = c8.w;
s_array[threadIdx.x * 16 + 12] = c12.x;
s_array[threadIdx.x * 16 + 13] = c12.y;
s_array[threadIdx.x * 16 + 14] = c12.z;
s_array[threadIdx.x * 16 + 15] = c12.w;
s_array[threadIdx.x * 16 + 0] = c0.x;
s_array[threadIdx.x * 16 + 1] = c0.y;
s_array[threadIdx.x * 16 + 2] = c0.z;
s_array[threadIdx.x * 16 + 3] = c0.w;
} else if (tid16 < 12) {
s_array[threadIdx.x * 16 + 8] = c8.x;
s_array[threadIdx.x * 16 + 9] = c8.y;
s_array[threadIdx.x * 16 + 10] = c8.z;
s_array[threadIdx.x * 16 + 11] = c8.w;
s_array[threadIdx.x * 16 + 12] = c12.x;
s_array[threadIdx.x * 16 + 13] = c12.y;
s_array[threadIdx.x * 16 + 14] = c12.z;
s_array[threadIdx.x * 16 + 15] = c12.w;
s_array[threadIdx.x * 16 + 0] = c0.x;
s_array[threadIdx.x * 16 + 1] = c0.y;
s_array[threadIdx.x * 16 + 2] = c0.z;
s_array[threadIdx.x * 16 + 3] = c0.w;
s_array[threadIdx.x * 16 + 4] = c4.x;
s_array[threadIdx.x * 16 + 5] = c4.y;
s_array[threadIdx.x * 16 + 6] = c4.z;
s_array[threadIdx.x * 16 + 7] = c4.w;
} else {
s_array[threadIdx.x * 16 + 12] = c12.x;
s_array[threadIdx.x * 16 + 13] = c12.y;
s_array[threadIdx.x * 16 + 14] = c12.z;
s_array[threadIdx.x * 16 + 15] = c12.w;
s_array[threadIdx.x * 16 + 0] = c0.x;
s_array[threadIdx.x * 16 + 1] = c0.y;
s_array[threadIdx.x * 16 + 2] = c0.z;
s_array[threadIdx.x * 16 + 3] = c0.w;
s_array[threadIdx.x * 16 + 4] = c4.x;
s_array[threadIdx.x * 16 + 5] = c4.y;
s_array[threadIdx.x * 16 + 6] = c4.z;
s_array[threadIdx.x * 16 + 7] = c4.w;
s_array[threadIdx.x * 16 + 8] = c8.x;
s_array[threadIdx.x * 16 + 9] = c8.y;
s_array[threadIdx.x * 16 + 10] = c8.z;
s_array[threadIdx.x * 16 + 11] = c8.w;
}
任何人都可以想出一个更好的解决方案呢?我已经研究过SDK的缩减例子,但我不确定它适用于我的问题。
我实际上将uint4字(128位)直接存储到共享内存中。每字节应对是尝试找到解决银行冲突问题的临时方法 – charis