cublasSaxpy computes y' = a * x + y, where x and y are vectors and a is a scalar. Is there a way to perform "saypx" in cuBLAS?

It turns out I need to compute y' = a * y + x, and I don't see how to bend the cuBLAS library into doing that. (Of course I could compute y' = a * y and then y' = y' + x, but in that case y' gets read too often. And I could write my own CUDA code to do it, but then it's likely not to be as fast as the cuBLAS code. I'm just surprised there is no apparent way to perform "saypx" directly.)
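For concreteness, here is a minimal sketch of the two-pass workaround I mean, built from two standard cuBLAS calls (the helper name saypx_two_pass is my own, and handle, n, x, and y are assumed to already exist; the drawback is exactly that y makes two round trips through memory):

#include "cublas_v2.h"

// Two-pass "saypx": first y = a * y, then y = 1 * x + y.
// Drawback: y is read and written twice.
void saypx_two_pass(cublasHandle_t handle, int n, float a,
                    const float *x, float *y)
{
    const float one = 1.0f;
    cublasSscal(handle, n, &a, y, 1);         // y = a * y
    cublasSaxpy(handle, n, &one, x, 1, y, 1); // y = y + x
}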

[Added] There is a function "saxpby" in the Intel version of cblas (MKL) that would do what I need. But strangely enough, it is not in cuBLAS.
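As far as I can tell, the MKL extension computes y = a * x + b * y, so "saypx" falls out by putting the scalar on y rather than x; a host-side sketch (MKL, not cuBLAS; the wrapper name saypx_mkl is my own):

#include <mkl.h>

// "saypx" on the host via Intel's cblas_saxpby extension:
// cblas_saxpby computes y = a*x + b*y, so pass 1.0f for x and b for y.
void saypx_mkl(int n, float b, const float *x, float *y)
{
    cblas_saxpby(n, 1.0f, x, 1, b, y, 1);
}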

[Added #2] It looks like I could use the cudnnAddTensor function with some aliasing of descriptors (I have a FilterDescriptor pointing at the tensor, which AddTensor won't accept, but I should be able to alias a TensorDescriptor onto the same memory and shape).
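A rough, untested sketch of that aliasing idea, assuming a cuDNN version in which cudnnAddTensor computes C = alpha * A + beta * C (the helper name saypx_cudnn and the 1 x n x 1 x 1 shape are my own choices):

#include <cudnn.h>

// View both device buffers as 1 x n x 1 x 1 tensors and let cudnnAddTensor
// compute y = 1 * x + a * y. Error checking omitted for brevity.
void saypx_cudnn(cudnnHandle_t handle, int n, float a, const float *x, float *y)
{
    cudnnTensorDescriptor_t desc;
    cudnnCreateTensorDescriptor(&desc);
    cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT,
                               1, n, 1, 1);
    const float alpha = 1.0f, beta = a;
    cudnnAddTensor(handle, &alpha, desc, x, &beta, desc, y);
    cudnnDestroyTensorDescriptor(desc);
}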

Answer


I'm not aware of a way to do what you are asking in CUBLAS, or in standard BLAS for that matter. What you have found in MKL is an extension added by Intel, and I don't recall seeing anything similar in other host or accelerator BLAS implementations.

The good news is that your assertion that "I could write my own CUDA code to do it, but then it's likely not to be as fast as the cuBLAS code" is untrue, at least for an operation as trivial as saxpy. Even a naive implementation of saxpy will get very close to CUBLAS, because there really isn't much to it: read two arrays, perform an FMAD, and write the result back. As long as you get the memory coalescing right, it is pretty simple to write performant code. For example:

#include <vector> 
#include <algorithm> 
#include <cassert> 
#include <iostream> 
#include <cmath> 

#include "cublas_v2.h" 

typedef enum 
{ 
    AXPY = 0, 
    AXPBY = 1 
} saxpy_op_t; 

// "saypx": y' = a * y + x, the operation the question asks for.
__device__ __host__ __inline__
float axpby_op(float y, float x, float a) 
{ 
    return a * y + x; 
} 

// Standard saxpy: y' = y + a * x.
__device__ __host__ __inline__
float axpy_op(float y, float x, float a) 
{ 
    return y + a * x; 
} 

// Strided view over a raw pointer, so the kernel can honour BLAS-style
// increment (inc) arguments.
template<typename T>
class pitched_accessor 
{ 
    T * p; 
    size_t pitch; 

    public: 
    __host__ __device__ 
    pitched_accessor(T *p_, size_t pitch_) : p(p_), pitch(pitch_) {}; 

    __host__ __device__ 
    T& operator[](size_t idx) { return p[pitch*idx]; }; 

    __host__ __device__ 
    const T& operator[](size_t idx) const { return p[pitch*idx]; }; 
}; 


// Grid-stride kernel; the switch on the template parameter is resolved
// at compile time, so each instantiation contains exactly one operation.
template<saxpy_op_t op>
__global__ 
void saxpy_kernel(pitched_accessor<float> y, pitched_accessor<float> x, 
        const float a, const unsigned int N1) 
{ 
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; 
    unsigned int stride = gridDim.x * blockDim.x; 

    #pragma unroll 8 
    for(; idx < N1; idx += stride) { 
     switch (op) { 
      case AXPY: 
       y[idx] = axpy_op(y[idx], x[idx], a); 
       break; 
      case AXPBY: 
       y[idx] = axpby_op(y[idx], x[idx], a); 
       break; 
     } 
    } 
} 

// "saypx" host wrapper; the launch configuration is chosen via the
// occupancy API rather than hard-coded.
__host__ void saxby(const unsigned int N, const float a,
        float *x, int xinc, float *y, int yinc) 
{ 
    int gridsize, blocksize; 
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPBY>); 
    saxpy_kernel<AXPBY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc), 
               pitched_accessor<float>(x, xinc), a, N); 
} 

// Standard saxpy host wrapper.
__host__ void saxpy(const unsigned int N, const float a,
        float *x, int xinc, float *y, int yinc) 
{ 
    int gridsize, blocksize; 
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPY>); 
    saxpy_kernel<AXPY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc), 
               pitched_accessor<float>(x, xinc), a, N); 
} 

// Assert that every element of the result equals the expected scalar value.
void check_result(std::vector<float> &yhat, float result, float tolerance=1e-5f)
{ 
    auto it = yhat.begin(); 
    for(; it != yhat.end(); ++it) { 
     float err = std::fabs(*it - result); 
     assert(err < tolerance); 
    } 
} 

int main() 
{ 

    const int N = 1<<22; 

    std::vector<float> x_h(N); 
    std::vector<float> y_h(N); 

    const float a = 2.f, y0 = 1234.f, x0 = 532.f; 
    std::fill(y_h.begin(), y_h.end(), y0); 
    std::fill(x_h.begin(), x_h.end(), x0); 

    float *x_d, *y_d; 
    size_t sz = sizeof(float) * size_t(N); 
    cudaMalloc((void **)&x_d, sz); 
    cudaMalloc((void **)&y_d, sz); 

    cudaMemcpy(x_d, &x_h[0], sz, cudaMemcpyHostToDevice); 

    { // saypx (AXPBY) kernel vs. expected a*y0 + x0
     cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice); 
     saxby(N, a, x_d, 1, y_d, 1); 
     std::vector<float> yhat(N); 
     cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost); 
     check_result(yhat, axpby_op(y0, x0, a)); 
    } 

    { // standard saxpy kernel vs. expected y0 + a*x0
     cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice); 
     saxpy(N, a, x_d, 1, y_d, 1); 
     std::vector<float> yhat(N); 
     cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost); 
     check_result(yhat, axpy_op(y0, x0, a)); 
    } 

    { // cublasSaxpy reference run for comparison
     cublasHandle_t handle; 
     cublasCreate(&handle); 
     cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice); 
     cublasSaxpy(handle, N, &a, x_d, 1, y_d, 1); 
     std::vector<float> yhat(N); 
     cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost); 
     check_result(yhat, axpy_op(y0, x0, a)); 
     cublasDestroy(handle); 
    } 

    return int(cudaDeviceReset()); 
} 

This demonstrates that a very simple axpy kernel can easily be adapted to perform both the standard operation and the version you want, and to come within 10% of the runtime of CUBLAS on the compute capability 5.2 device I tested it on:

$ nvcc -std=c++11 -arch=sm_52 -Xptxas="-v" -o saxby saxby.cu -lcublas 
ptxas info : 0 bytes gmem 
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj' for 'sm_52' 
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj 
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
ptxas info : Used 17 registers, 360 bytes cmem[0] 
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj' for 'sm_52' 
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj 
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
ptxas info : Used 17 registers, 360 bytes cmem[0] 

$ nvprof ./saxby 
==26806== NVPROF is profiling process 26806, command: ./saxby 
==26806== Profiling application: ./saxby 
==26806== Profiling result: 
Time(%)      Time  Calls       Avg       Min       Max  Name
 54.06%  11.190ms      5  2.2381ms     960ns  2.9094ms  [CUDA memcpy HtoD]
 40.89%  8.4641ms      3  2.8214ms  2.8039ms  2.8310ms  [CUDA memcpy DtoH]
  1.73%  357.59us      1  357.59us  357.59us  357.59us  void saxpy_kernel<saxpy_op_t=1>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
  1.72%  355.15us      1  355.15us  355.15us  355.15us  void saxpy_kernel<saxpy_op_t=0>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
  1.60%  332.21us      1  332.21us  332.21us  332.21us  void axpy_kernel_val<float, int=0>(cublasAxpyParamsVal<float>)

Good one. Also, thanks for the excellent short tutorial on how to write CUDA code and profile it under Linux! (The Windows setup is much more complicated.) I'm wondering whether using fmad() would take care of that last 10%.
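For reference, a hypothetical fmaf-based variant of the op helpers above (the _fma names are my own). Note that nvcc compiles with --fmad=true by default, so the compiler is already free to contract a * y + x into a single FMA, and an explicit intrinsic may not change the generated code:

#include <cmath>

// Hypothetical op helpers written with an explicit fused multiply-add.
__device__ __host__ __inline__
float axpby_op_fma(float y, float x, float a)
{
    return fmaf(a, y, x); // a * y + x with a single rounding
}

__device__ __host__ __inline__
float axpy_op_fma(float y, float x, float a)
{
    return fmaf(a, x, y); // y + a * x
}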
