我不知道在CUBLAS中做什么,或者在标准BLAS中做什么。您在MKL中找到的是英特尔添加的扩展,但我不记得在其他主机和加速器BLAS实现中看到类似的东西。
好消息是,您断言“我可以编写自己的CUDA代码来完成它,但那么它可能没有像cuBLAS代码那么快”,这是不真实的,至少对于操作来说是微不足道的SAXPY。即使天真地执行saxpy也会非常接近CUBLAS,因为实际上没有多少人阅读两个数组,执行FMAD并将结果写回。只要你获得正确的内存合并,编写高性能代码非常简单。例如:
#include <vector>
#include <algorithm>
#include <cassert>
#include <iostream>
#include <cmath>
#include "cublas_v2.h"
typedef enum
{
AXPY = 0,
AXPBY = 1
} saxpy_op_t;
__device__ __host__ __inline__
float axpby_op(float y, float x, float a)
{
return a * y + x;
}
__device__ __host__ __inline__
float axpy_op(float y, float x, float a)
{
return y + a * x;
}
template<typename T>
class pitched_accessor
{
T * p;
size_t pitch;
public:
__host__ __device__
pitched_accessor(T *p_, size_t pitch_) : p(p_), pitch(pitch_) {};
__host__ __device__
T& operator[](size_t idx) { return p[pitch*idx]; };
__host__ __device__
const T& operator[](size_t idx) const { return p[pitch*idx]; };
};
template<saxpy_op_t op>
__global__
void saxpy_kernel(pitched_accessor<float> y, pitched_accessor<float> x,
const float a, const unsigned int N1)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int stride = gridDim.x * blockDim.x;
#pragma unroll 8
for(; idx < N1; idx += stride) {
switch (op) {
case AXPY:
y[idx] = axpy_op(y[idx], x[idx], a);
break;
case AXPBY:
y[idx] = axpby_op(y[idx], x[idx], a);
break;
}
}
}
__host__ void saxby(const unsigned int N, const float a,
float *x, int xinc, float *y, int yinc)
{
int gridsize, blocksize;
cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPBY>);
saxpy_kernel<AXPBY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc),
pitched_accessor<float>(x, xinc), a, N);
}
__host__ void saxpy(const unsigned int N, const float a,
float *x, int xinc, float *y, int yinc)
{
int gridsize, blocksize;
cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPY>);
saxpy_kernel<AXPY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc),
pitched_accessor<float>(x, xinc), a, N);
}
void check_result(std::vector<float> &yhat, float result, float tolerance=1e-5f)
{
auto it = yhat.begin();
for(; it != yhat.end(); ++it) {
float err = std::fabs(*it - result);
assert(err < tolerance);
}
}
int main()
{
const int N = 1<<22;
std::vector<float> x_h(N);
std::vector<float> y_h(N);
const float a = 2.f, y0 = 1234.f, x0 = 532.f;
std::fill(y_h.begin(), y_h.end(), y0);
std::fill(x_h.begin(), x_h.end(), x0);
float *x_d, *y_d;
size_t sz = sizeof(float) * size_t(N);
cudaMalloc((void **)&x_d, sz);
cudaMalloc((void **)&y_d, sz);
cudaMemcpy(x_d, &x_h[0], sz, cudaMemcpyHostToDevice);
{
cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
saxby(N, a, x_d, 1, y_d, 1);
std::vector<float> yhat(N);
cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
check_result(yhat, axpby_op(y0, x0, a));
}
{
cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
saxpy(N, a, x_d, 1, y_d, 1);
std::vector<float> yhat(N);
cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
check_result(yhat, axpy_op(y0, x0, a));
}
{
cublasHandle_t handle;
cublasCreate(&handle);
cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
cublasSaxpy(handle, N, &a, x_d, 1, y_d, 1);
std::vector<float> yhat(N);
cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
check_result(yhat, axpy_op(y0, x0, a));
cublasDestroy(handle);
}
return int(cudaDeviceReset());
}
这证明了一个非常简单的axpy内核可以很容易地适应执行两个标准操作和你想要的版本,和CUBLAS的运行时间的10%以内的计算5.2设备我跑测试它:
$ nvcc -std=c++11 -arch=sm_52 -Xptxas="-v" -o saxby saxby.cu -lcublas
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
$ nvprof ./saxby
==26806== NVPROF is profiling process 26806, command: ./saxby
==26806== Profiling application: ./saxby
==26806== Profiling result:
Time(%) Time Calls Avg Min Max Name
54.06% 11.190ms 5 2.2381ms 960ns 2.9094ms [CUDA memcpy HtoD]
40.89% 8.4641ms 3 2.8214ms 2.8039ms 2.8310ms [CUDA memcpy DtoH]
1.73% 357.59us 1 357.59us 357.59us 357.59us void saxpy_kernel<saxpy_op_t=1>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.72% 355.15us 1 355.15us 355.15us 355.15us void saxpy_kernel<saxpy_op_t=0>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.60% 332.21us 1 332.21us 332.21us 332.21us void axpy_kernel_val<float, int=0>(cublasAxpyParamsVal<float>)
好的。另外,感谢有关如何编写CUDA代码并在Linux下进行配置的优秀简短教程! (Windows结构要复杂得多。) 我在想如果使用fmad()会照顾最后的10%。 –