我使用的是GTX 280,它具有计算能力1.3并支持共享内存上的原子操作。我正在使用cuda SDK 2.2和VS 2005.在我的程序中,我必须广泛使用原子操作,因为根本没有其他办法。CUDA中的共享内存上的原子操作
一个例子是我必须计算一个数组的运行总和并找出总和超过给定截止值的索引。为此,我使用scan算法的变体并使用atomicMin
来存储索引,而值小于阈值。因此,这样最终共享内存的索引值就会小于阈值。
这只是内核的一个组件,在内核调用中有很多类似的代码块。
我有3个问题
- 首先,我一直无法编译代码,因为它说的原子操作没有定义,我已搜查,但没有找到我要补充的文件。
- 其次,我总算在由CUDA SDK提供的代码复制它来编译代码,但后来跟它的原子操作中不支持共享内存,其中,因为它是在以下program
- 运行即使我在命令行编译中使用
-arch sm_12
来解决黑客问题,使用这些原子操作的代码片段也花费了很多时间。
我相信在最坏的情况下,我应该得到某种加速,因为没有太多的原子操作,我使用1块16x16。不幸的是,串行代码运行速度提高了10倍。
下面我张贴内核鳕鱼*,这个内核调用似乎是瓶颈,如果有人可以帮助我优化那么这将是很好的。串行代码只是以串行方式执行这些操作。我使用的是16 X 16的块配置。
代码似乎很长,但实际上它包含一个if代码块,而代码块执行几乎相同的任务,但它们无法合并。
#define limit (int)(log((float)256)/log((float)2))
// This receives a pointer to an image, some variables and 4 more arrays cont(of size 256) vars(some constants), lim and buf(of image size)
// block configuration 1 block of 16x16
__global__ void kernel_Main(unsigned char* in, int height,int width, int bs,int th, double cutoff, uint* cont,int* vars, unsigned int* lim,unsigned int* buf)
{
int j = threadIdx.x;
int i = threadIdx.y;
int k = i*blockDim.x+j;
__shared__ int prefix_sum[256];
__shared__ int sum_s[256];
__shared__ int ary_shared[256];
__shared__ int he_shared[256];
// this is the threshold
int cutval = (2*width*height)*cutoff;
prefix_sum[k] = cont[k];
int l;
// a variant of scan algorithm
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > cutval)
{
atomicMin(&vars[cut],k);
}
}
__syncthreads();
}
// The first thread will store the value in global array
if(k==0)
{
vars[cuts]=prefix_sum[vars[cut]];
}
__syncthreads();
if(vars[n])
{
// bs = 7 in this case
if(i<bs && j<bs)
{
// using atomic add because the index could be same for 2 different threads
atomicAdd(&ary_shared[in[i*(width) + j]],1);
}
__syncthreads();
int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[k];
sum_s[k] = 0;
// Again prefix sum
int l;
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > minth)
{
atomicMin(&vars[hmin],k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hminc]=prefix_sum[255];
// because we will always overshoot by 1
vars[hmin]--;
}
__syncthreads();
int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[255-k];
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > maxth)
{
atomicMin(&vars[hmax], k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hmaxc]=prefix_sum[255];
vars[hmax]--;
vars[hmax]=255-vars[hmax];
}
__syncthreads();
int rng = vars[hmax] - vars[hmin];
if(rng >= vars[cut])
{
if(k <= vars[hmin])
he_shared[k] = 0;
else if(k >= vars[hmax])
he_shared[k] = 255;
else
he_shared[k] = (255 * (k - vars[hmin]))/rng;
}
__syncthreads();
// only 7x7 = 49 threads will do this
if(i>0 && i<=bs && j>0 && j<=bs)
{
int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);
if(rng >= vars[cut])
{
int value = he_shared[in[base]];
buf[base]+=value;
lim[base]++;
}
else
{
buf[base]+=255;
lim[base]++;
}
}
if(k==0)
vars[n]--;
__syncthreads();
}// if(n) block closes here
while(vars[n])
{
if(k==0)
{
if(vars[ox]==0 && vars[d1] ==3)
vars[d1] = 0; // l2r
else if(vars[ox]==0 && vars[d1]==2)
vars[d1] = 3; // l u2d
else if(vars[ox]==width-bs && vars[d1]==0)
vars[d1] = 1; // r u2d
else if(vars[ox]==width-bs && vars[d1]==1)
vars[d1] = 2; // r2l
}
// Because this value will be changed so
// all the threads should set their registers before
// they move forward
int ox_d = vars[ox];
int oy_d = vars[oy];
// Just putting it here so that all the threads should have set their
// values before moving on, as this value will be changed
__syncthreads();
if(vars[d1]==0)
{
if(i == 0 && j < bs)
{
int index = j*width + ox_d + oy_d*width;
int index2 = j*width + ox_d + oy_d*width +bs;
atomicSub(&ary_shared[in[index]],1);
atomicAdd(&ary_shared[in[index2]],1);
}
// The first thread of the first block should set this value
if(k==0)
vars[ox]++;
}
else if(vars[d1]==1||vars[d1]==3)
{
if(i == 0 && j < bs)
{
/*if(j==0)
printf("Entered 1||3\n");*/
int index = j*width + ox_d + oy_d*width;
int index2 = j*width + ox_d + (oy_d+bs)*width;
atomicSub(&ary_shared[in[index]],1);
atomicAdd(&ary_shared[in[index2]],1);
}
// The first thread of the first block should set this value
if(k==0)
vars[oy]++;
}
else if(vars[d1]==2)
{
if(i == 0 && j < bs)
{
int index = j*width + ox_d-1 + oy_d*width;
int index2 = j*width + ox_d-1 + oy_d*width +bs;
atomicAdd(&ary_shared[in[index]],1);
atomicSub(&ary_shared[in[index2]],1);
}
// The first thread of the first block should set this value
if(k==0)
vars[ox]--;
}
__syncthreads();
//ary_shared has been calculated
// Reset the hmin and hminc values
// again the same task as done in the if(n) loop
if(k==0)
{
vars[hmin]=0;
vars[hminc]=0;
vars[hmax]=0;
vars[hmaxc]=0;
}
__syncthreads();
int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[k];
int l;
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > minth)
{
atomicMin(&vars[hmin],k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hminc]=prefix_sum[255];
vars[hmin]--;
}
__syncthreads();
// Calculate maxth
int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
prefix_sum[k] = ary_shared[255-k];
for(l=0;l<=limit;l++)
{
sum_s[k]=prefix_sum[k];
if(k >= (int)pow((float)2,(float)l))
{
prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
// Find out the minimum index for which the cummulative sum crosses threshold
if(prefix_sum[k] > maxth)
{
atomicMin(&vars[hmax], k);
}
}
__syncthreads();
}
// set the maximum value here
if(k==0)
{
vars[hmaxc]=prefix_sum[255];
vars[hmax]--;
vars[hmax]=255-vars[hmax];
}
__syncthreads();
int rng = vars[hmax] - vars[hmin];
if(rng >= vars[cut])
{
if(k <= vars[hmin])
he_shared[k] = 0;
else if(k >= vars[hmax])
he_shared[k] = 255;
else
he_shared[k] = (255 * (k - vars[hmin]))/rng;
}
__syncthreads();
if(i>0 && i<=bs && j>0 && j<=bs)
{
int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);
if(rng >= vars[cut])
{
int value = he_shared[in[base]];
buf[base]+=value;
lim[base]++;
}
else
{
buf[base]+=255;
lim[base]++;
}
}
// This just might cause a little bit of problem
if(k==0)
vars[n]--;
// All threads will wait here before continuing the while loop
__syncthreads();
}// end of while(n)
}