2017-07-23

CUDA dot product of two vectors fails for N >= 369

I have implemented a vector dot product as follows. It is compiled with CUDA 7.5 for compute_20, sm_20, with const int THREADS_PER_BLOCK = 16;

This happens with both floats and doubles.

It works up to n = 368, but beyond that the results are incorrect. I would like to know whether the problem lies in my implementation or in the values I am using (see the second code block for the initialization). For example, perhaps the additions beyond n = 368 introduce floating-point error (which would be strange, since the error shows up at the same n for both float and double).

int divUp(int total, int grain) { return (total+grain-1)/grain; } 

// CAS-based atomicAdd for double (sm_20 has no native double-precision atomicAdd)
__device__ __forceinline__ double atomicAdd(double* address, double val) 
{ 
    unsigned long long int* address_as_ull = (unsigned long long int*)address; 
    unsigned long long int old = *address_as_ull, assumed; 
    do 
    { 
     assumed = old; 
     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed))); 
    } 
    while(assumed!=old); 
    return __longlong_as_double(old); 
} 

// CAS-based atomicAdd for float (note: CUDA already provides a native float atomicAdd)
__device__ __forceinline__ float atomicAdd(float* address, float val) 
{ 
    unsigned int *ptr = (unsigned int *)address; 
    unsigned int old, newint, ret = *ptr; 
    do { 
     old = ret; 
     newint = __float_as_int(__int_as_float(old)+val); 
    } while((ret = atomicCAS(ptr, old, newint)) != old); 

    return __int_as_float(ret); 
} 

template<typename T> 
__global__ void vecdotk(const T* a, const T* b, const int n, T* c) 
{ 
    __shared__ T temp[THREADS_PER_BLOCK]; 
    int x = threadIdx.x+blockIdx.x*blockDim.x; 
    if(x==0) c[0] = 0.0; 
    if(x<n) temp[threadIdx.x] = a[x]*b[x]; 
    else temp[threadIdx.x] = 0.0; 
    __syncthreads(); 

    if(0==threadIdx.x) 
    { 
     T sum = 0.0; 
     for(int j=0; j<THREADS_PER_BLOCK; ++j) 
     { 
      sum += temp[j]; 
     } 
     atomicAdd(c, sum); 
    } 
} 

template<typename T> 
void dot(const T* a, const T* b, const int n, T* c) 
{ 
    dim3 block(THREADS_PER_BLOCK); 
    dim3 grid(divUp(n, block.x), 1); 
    vecdotk<T><<<grid, block>>>(a, b, n, c); 
    cudaSafeCall(cudaGetLastError()); 
} 

I use the following two host vectors to fill the input device arrays (not shown here at the moment, since they are part of a larger library). Basically I want to compute the sum of the series of squares, i.e.

$$\sum_{i=1}^{n} i^2$$

// fill host vectors a and b 
for(int i=0; i<n; ++i) 
{ 
    h_vec_a[i] = i+1;//__mat_rand(); 
    h_vec_b[i] = i+1;//__mat_rand(); 
} 

Answer


This won't work:

if(x==0) c[0] = 0.0; 

There is no guarantee (in CUDA) that thread 0 runs first, or that that line will execute before other threads reach other points in the code. You need to initialize c[0] before launching this kernel. Otherwise, some threads may perform their atomic adds into c, and then, later on, thread 0 may initialize c[0] to zero.
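
For example, a minimal sketch of one way to do that initialization (the variable names follow the full test program below; cudaMemset is safe here because all-zero bytes represent 0.0 for both float and double):

cudaMemset(d_c, 0, sizeof(mytype)); // zero the device accumulator before launch 
dot(d_vec_a, d_vec_b, n, d_c);      // then run the dot product as before 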

In addition, CUDA already provides a float version of atomicAdd, so you have no reason to provide your own. Also, running thread blocks of 16 threads will not give you good performance (I would suggest simply using the CUBLAS dot-product function). With the c[0] fix applied (removing that line of code and initializing c[0] before the kernel), your code runs correctly for me:

$ cat t372.cu 
#include <stdio.h> 

const int n = 2048; 
#ifdef USE_DOUBLE 
typedef double mytype; 
#else 
typedef float mytype; 
#endif 
const int THREADS_PER_BLOCK = 16; 

int divUp(int total, int grain) { return (total+grain-1)/grain; } 
#if 0 
__device__ __forceinline__ double atomicAdd(double* address, double val) 
{ 
    unsigned long long int* address_as_ull = (unsigned long long int*)address; 
    unsigned long long int old = *address_as_ull, assumed; 
    do 
    { 
     assumed = old; 
     old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed))); 
    } 
    while(assumed!=old); 
    return __longlong_as_double(old); 
} 

__device__ __forceinline__ float atomicAdd(float* address, float val) 
{ 
    unsigned int *ptr = (unsigned int *)address; 
    unsigned int old, newint, ret = *ptr; 
    do { 
     old = ret; 
     newint = __float_as_int(__int_as_float(old)+val); 
    } while((ret = atomicCAS(ptr, old, newint)) != old); 

    return __int_as_float(ret); 
} 
#endif 
template<typename T> 
__global__ void vecdotk(const T* a, const T* b, const int n, T* c) 
{ 
    __shared__ T temp[THREADS_PER_BLOCK]; 
    int x = threadIdx.x+blockIdx.x*blockDim.x; 
    //if(x==0) c[0] = 0.0; 
    if(x<n) temp[threadIdx.x] = a[x]*b[x]; 
    else temp[threadIdx.x] = 0.0; 
    __syncthreads(); 

    if(0==threadIdx.x) 
    { 
     T sum = 0.0; 
     for(int j=0; j<THREADS_PER_BLOCK; ++j) 
     { 
      sum += temp[j]; 
     } 
     atomicAdd(c, sum); 
    } 
} 

template<typename T> 
cudaError_t dot(const T* a, const T* b, const int n, T* c) 
{ 
    dim3 block(THREADS_PER_BLOCK); 
    dim3 grid(divUp(n, block.x), 1); 
    vecdotk<T><<<grid, block>>>(a, b, n, c); 
    cudaDeviceSynchronize(); 
    return cudaGetLastError(); 
} 

int main(){ 

    mytype *h_vec_a, *h_vec_b, *d_vec_a, *d_vec_b, *h_c, *d_c; 
    int bs = n*sizeof(mytype); 
    h_vec_a = (mytype *)malloc(bs); 
    h_vec_b = (mytype *)malloc(bs); 
    h_c = (mytype *)malloc(sizeof(mytype)); 
    cudaMalloc(&d_vec_b, bs); 
    cudaMalloc(&d_vec_a, bs); 
    cudaMalloc(&d_c, sizeof(mytype)); 
// fill host vectors a and b 
    for(int i=0; i<n; ++i) 
    { 
    h_vec_a[i] = i+1;//__mat_rand(); 
    h_vec_b[i] = i+1;//__mat_rand(); 
    } 
    h_c[0] = 0; 
    cudaMemcpy(d_vec_a, h_vec_a, bs, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_vec_b, h_vec_b, bs, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_c, h_c, sizeof(mytype), cudaMemcpyHostToDevice); 
    dot(d_vec_a, d_vec_b, n, d_c); 
    cudaMemcpy(h_c, d_c, sizeof(mytype), cudaMemcpyDeviceToHost); 
    mytype test_val = 0; 
    for (int i=0; i < n; i++) 
    test_val += h_vec_a[i] * h_vec_b[i]; 
    printf("GPU result: %f, CPU result: %f\n", h_c[0], test_val); 

} 
$ nvcc -arch=sm_20 -o t372 t372.cu 
nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). 
$ cuda-memcheck ./t372 
========= CUDA-MEMCHECK 
GPU result: 2865411584.000000, CPU result: 2865411072.000000 
========= ERROR SUMMARY: 0 errors 
$ 

The numerical difference in the last 3 digits is due to float limits, not to any error in the code. If, for example, you change the initialization so that each vector is all 1s, you will get an exact result in that case.
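
For reference, the exact result for n = 2048 follows from the closed form for the sum of squares:

$$\sum_{i=1}^{2048} i^2 = \frac{2048 \cdot 2049 \cdot 4097}{6} = 2{,}865{,}409{,}024$$

Near 2.9 x 10^9 adjacent float values are 256 apart (float carries a 24-bit significand), so both printed results above are exact multiples of 256 lying a few steps from the true value.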

Once again, various criticisms could be leveled at your code on performance grounds. If you want a fast dot product, I would suggest using CUBLAS.
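
As a sketch of that suggestion (not part of the original answer; cublasSdot is the single-precision dot product, cublasDdot the double-precision one), using the device arrays from the test program above:

#include <cublas_v2.h>   // link with -lcublas 

cublasHandle_t handle; 
cublasCreate(&handle); 
mytype result = 0; 
#ifdef USE_DOUBLE 
cublasDdot(handle, n, d_vec_a, 1, d_vec_b, 1, &result); // result returned to host 
#else 
cublasSdot(handle, n, d_vec_a, 1, d_vec_b, 1, &result); 
#endif 
cublasDestroy(handle); 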


I am learning the basics of CUDA and programming GPUs for general-purpose computation. But what if I use '__threadfence()' right after the line 'if(x==0) c[0] = 0.0;'? – user62039


Thanks Robert. Also, I think it would be better to initialize the sum to 0.0 in the void dot(...) function. Right? – user62039


threadfence does not enforce an ordering of thread execution. Yes, initializing c just before the kernel call is fine –
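
For completeness, a sketch of the dot() wrapper from the answer with the initialization folded in, as discussed in the comments above (this variant is an illustration, not code from the answer):

template<typename T> 
cudaError_t dot(const T* a, const T* b, const int n, T* c) 
{ 
    dim3 block(THREADS_PER_BLOCK); 
    dim3 grid(divUp(n, block.x), 1); 
    cudaMemset(c, 0, sizeof(T));             // zero the accumulator before launch 
    vecdotk<T><<<grid, block>>>(a, b, n, c); 
    cudaDeviceSynchronize(); 
    return cudaGetLastError(); 
} 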