我尝试使用 CUDA 的 cuSOLVER 库在 GPU 上执行 QR 分解,但从 CUDA 库调用 cusolverDnDgeqrf 时得到的状态是“执行失败”。
我将问题简化为下面的示例。
基本步骤如下:
- 我在主机上为一个 [5x3] 矩阵分配内存,并将其元素全部初始化为 1;
- 我在设备上分配内存,并把该矩阵复制到设备上;
- 我初始化求解器句柄,使用
cusolverDnCreate
- 我确定所需工作空间的大小,使用
cusolverDnDgeqrf_bufferSize
- 最后,我尝试进行 QR 分解,使用
cusolverDnDgeqrf
不幸的是,最后这一步调用总是失败,并返回 CUSOLVER_STATUS_EXECUTION_FAILED
(整数值为 6),我想不出哪里出了问题!
下面是出现该问题的代码:
#include <cusolverDn.h>
#include <cuda_runtime_api.h>
#include <iostream>
int main(void)
{
int N = 5, P = 3;
double *hostData;
cudaMallocHost((void **) &hostData, N * sizeof(double));
for (int i = 0; i < N * P; ++i)
hostData[i] = 1.;
double *devData;
cudaMalloc((void**)&devData, N * sizeof(double));
cudaMemcpy((void*)devData, (void*)hostData, N * sizeof(double), cudaMemcpyHostToDevice);
cusolverStatus_t retVal;
cusolverDnHandle_t solverHandle;
retVal = cusolverDnCreate(&solverHandle);
std::cout << "Handler creation : " << retVal << std::endl;
double *devTau, *work;
int szWork;
cudaMalloc((void**)&devTau, P * sizeof(double));
retVal = cusolverDnDgeqrf_bufferSize(solverHandle, N, P, devData, N, &szWork);
std::cout << "Work space sizing : " << retVal << std::endl;
cudaMalloc((void**)&work, szWork * sizeof(double));
int *devInfo;
cudaMalloc((void **)&devInfo, 1);
retVal = cusolverDnDgeqrf(solverHandle, N, P, devData, N, devTau, work, szWork, devInfo); //CUSOLVER_STATUS_EXECUTION_FAILED
std::cout << "QR factorization : " << retVal << std::endl;
int hDevInfo = 0;
cudaMemcpy((void*)devInfo, (void*)&hDevInfo, 1 * sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "Info device : " << hDevInfo << std::endl;
cudaFree(devInfo);
cudaFree(work);
cudaFree(devTau);
cudaFree(devData);
cudaFreeHost(hostData);
cudaDeviceReset();
}
你会看到任何明显的错误在我的代码,请让我知道! 非常感谢。