2017-04-16 31 views
0

在我的代码中,我使用了来自 Thrust 库的复数数组,并且我想使用 cublasZgeam() 来转置该数组(即:在 cuBLAS 中使用来自 Thrust 的复数类型)。

使用cuComplex.h中的复数不是一个可取的选择,因为我在数组上做了很多算术运算,cuComplex没有定义运算符,如* + =。

我要转置的数组是这样定义的:

thrust::complex<float> u[xmax][xmax]; 

我发现这个https://github.com/jtravs/cuda_complex,但使用它是这样:

#include "cuComplex.hpp" 

在使用上面提到的运算符时,仍然无法用 nvcc 编译通过:

error: no operator "+=" matches these operands 
     operand types are: cuComplex += cuComplex 

有没有什么解决方案?GitHub 上的那份代码比较旧,可能本身就存在这个问题,也可能是我的用法有误。

编辑:下面是可以工作的代码。与 talonmies 的代码唯一的区别是:添加了一个简单的内核,以及一个指向相同数据、但类型为 thrust::complex 的指针。

#include <iostream> 
#include <thrust/fill.h> 
#include <thrust/complex.h> 
#include <cublas_v2.h> 

using namespace std; 

// Kernel that exercises thrust::complex operator overloads in device code:
// adds (3.3 + 3.3i) in place to the first element of the array.
__global__ void test(thrust::complex<double>* data)
{
    const thrust::complex<double> increment(3.3, 3.3);
    data[0] += increment;
}

int main()
{
    // Transpose a 100x100 matrix of thrust::complex<double> with cublasZgeam
    // (C = alpha * op(A) + beta * op(B)), then run a kernel on the transposed
    // copy to show thrust's operator overloads working on the same device data.
    const int xmax = 100;  // const: avoids a non-standard variable-length array
    thrust::complex<double> u[xmax][xmax];
    // Byte size of the matrix: size_t (not double) is the correct type for sizes.
    const size_t arrSize = sizeof(thrust::complex<double>) * xmax * xmax;

    thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0, 1.0));
    u[49][51] += thrust::complex<double>(665.0, 665.0);
    u[51][49] *= 2.0;

    cout << "Before:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;
    cout << u[0][0] << endl;

    thrust::complex<double> alpha(1.0, 0.0);
    thrust::complex<double> beta(0.0, 0.0);
    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        cerr << "cublasCreate failed" << endl;
        return 1;
    }

    // thrust::complex<double> is binary compatible with cuDoubleComplex, so a
    // reinterpret_cast is all that is needed to hand the data to cuBLAS.
    cuDoubleComplex* d_u;
    cuDoubleComplex* d_v;
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
    if (cudaMalloc(&d_u, arrSize) != cudaSuccess ||
        cudaMalloc(&d_v, arrSize) != cudaSuccess) {
        cerr << "cudaMalloc failed" << endl;
        return 1;
    }
    if (cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice) != cudaSuccess) {
        cerr << "cudaMemcpy H2D failed" << endl;
        return 1;
    }

    // The same allocation viewed as thrust::complex, for use inside the kernel.
    thrust::complex<double>* d_vTest = reinterpret_cast<thrust::complex<double>*>(d_v);

    // v = 1.0 * transpose(u) + 0.0 * u  (beta == 0, so v is just the transpose)
    if (cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                    _alpha, d_u, xmax,
                    _beta, d_u, xmax,
                    d_v, xmax) != CUBLAS_STATUS_SUCCESS) {
        cerr << "cublasZgeam failed" << endl;
        return 1;
    }

    test<<<1, 1>>>(d_vTest);
    // Kernel launches are asynchronous: pick up launch errors explicitly.
    if (cudaGetLastError() != cudaSuccess) {
        cerr << "kernel launch failed" << endl;
        return 1;
    }

    // Blocking copy back to the host (also synchronizes with the kernel).
    if (cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost) != cudaSuccess) {
        cerr << "cudaMemcpy D2H failed" << endl;
        return 1;
    }
    cout << "After:" << endl;
    cout << u[0][0] << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;

    // Release device memory and the cuBLAS handle (leaked in the original).
    cudaFree(d_u);
    cudaFree(d_v);
    cublasDestroy(handle);

    return 0;
}
+0

你不能使用 C++ 标准库的复数类型和函数吗? – talonmies

+0

这是我试过的,它似乎没有工作https://pastebin.com/hCjPvdBm –

+0

@talonmies我已阅读此文档:http://docs.nvidia.com/cuda/cublas/#cublas-lt- t-gt-geam。我必须承认我有机会误解它,但我也检查了一些工作示例 –

回答

2

尽管你提出异议,但恰恰相反,C++ 标准库的 complex(或 thrust::complex)与 cuBLAS 是完全可以配合工作的。cuComplex 和 cuDoubleComplex 被设计为与标准宿主端复数类型二进制兼容,因此在把复数数据传递给在设备上使用它的 cuBLAS 函数时,不需要做任何转换。

对你在评论中发布的代码做一个简单的修改,它就能完全按照你的设想工作:

#include <algorithm> 
#include <iostream> 
#include <complex> 
#include <cublas_v2.h> 

using namespace std; 

int main()
{
    // Demonstrates that std::complex<double> is binary compatible with
    // cuDoubleComplex: fill a 100x100 matrix on the host, transpose it on the
    // device with cublasZgeam, and copy it back.
    const int xmax = 100;  // const: avoids a non-standard variable-length array
    complex<double> u[xmax][xmax];
    // Byte size of the matrix: size_t (not double) is the correct type for sizes.
    const size_t arrSize = sizeof(complex<double>) * xmax * xmax;

    fill(&u[0][0], &u[0][0] + (xmax * xmax), complex<double>(1.0, 1.0));
    u[49][51] += complex<double>(665.0, 665.0);
    u[51][49] *= 2.0;

    cout << "Before:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;

    complex<double> alpha(1.0, 0.0);
    complex<double> beta(0.0, 0.0);
    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        cerr << "cublasCreate failed" << endl;
        return 1;
    }

    cuDoubleComplex* d_u;
    cuDoubleComplex* d_v;
    // The explicit casts to cuDoubleComplex are the only change needed for
    // std::complex<double> data to be handed to cuBLAS.
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
    if (cudaMalloc(&d_u, arrSize) != cudaSuccess ||
        cudaMalloc(&d_v, arrSize) != cudaSuccess) {
        cerr << "cudaMalloc failed" << endl;
        return 1;
    }
    if (cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice) != cudaSuccess) {
        cerr << "cudaMemcpy H2D failed" << endl;
        return 1;
    }

    // v = 1.0 * transpose(u) + 0.0 * u
    if (cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                    _alpha, d_u, xmax,
                    _beta, d_u, xmax,
                    d_v, xmax) != CUBLAS_STATUS_SUCCESS) {
        cerr << "cublasZgeam failed" << endl;
        return 1;
    }

    // Blocking copy back to the host.
    if (cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost) != cudaSuccess) {
        cerr << "cudaMemcpy D2H failed" << endl;
        return 1;
    }

    cout << "After:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;

    // Release device memory and the cuBLAS handle (leaked in the original).
    cudaFree(d_u);
    cudaFree(d_v);
    cublasDestroy(handle);

    return 0;
}

构建并运行,如下所示:

~/SO$ nvcc -std=c++11 -arch=sm_52 -o complex_transpose complex_transpose.cu -lcublas 
~/SO$ ./complex_transpose 
Before: 
(666,666) 
(2,2) 
After: 
(2,2) 
(666,666) 

唯一需要的修改,是把 std::complex&lt;double&gt; 类型显式转换为 cuDoubleComplex。这样做之后,一切都按预期工作。

使用 Thrust 时,代码看起来几乎完全一样:

#include <iostream> 
#include <thrust/fill.h> 
#include <thrust/complex.h> 
#include <cublas_v2.h> 

using namespace std; 

int main()
{
    // Same transpose demo as the std::complex version, but using
    // thrust::complex<double>, which is likewise binary compatible with
    // cuDoubleComplex.
    const int xmax = 100;  // const: avoids a non-standard variable-length array
    thrust::complex<double> u[xmax][xmax];
    // Byte size of the matrix: size_t (not double) is the correct type for sizes.
    const size_t arrSize = sizeof(thrust::complex<double>) * xmax * xmax;

    thrust::fill(&u[0][0], &u[0][0] + (xmax * xmax), thrust::complex<double>(1.0, 1.0));
    u[49][51] += thrust::complex<double>(665.0, 665.0);
    u[51][49] *= 2.0;

    cout << "Before:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;

    thrust::complex<double> alpha(1.0, 0.0);
    thrust::complex<double> beta(0.0, 0.0);
    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        cerr << "cublasCreate failed" << endl;
        return 1;
    }

    cuDoubleComplex* d_u;
    cuDoubleComplex* d_v;
    // Explicit casts from thrust::complex<double> to cuDoubleComplex are the
    // only change needed to interoperate with cuBLAS.
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);
    if (cudaMalloc(&d_u, arrSize) != cudaSuccess ||
        cudaMalloc(&d_v, arrSize) != cudaSuccess) {
        cerr << "cudaMalloc failed" << endl;
        return 1;
    }
    if (cudaMemcpy(d_u, &u[0][0], arrSize, cudaMemcpyHostToDevice) != cudaSuccess) {
        cerr << "cudaMemcpy H2D failed" << endl;
        return 1;
    }

    // v = 1.0 * transpose(u) + 0.0 * u
    if (cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                    _alpha, d_u, xmax,
                    _beta, d_u, xmax,
                    d_v, xmax) != CUBLAS_STATUS_SUCCESS) {
        cerr << "cublasZgeam failed" << endl;
        return 1;
    }

    // Blocking copy back to the host.
    if (cudaMemcpy(u, d_v, arrSize, cudaMemcpyDeviceToHost) != cudaSuccess) {
        cerr << "cudaMemcpy D2H failed" << endl;
        return 1;
    }

    cout << "After:" << endl;
    cout << u[49][51] << endl;
    cout << u[51][49] << endl;

    // Release device memory and the cuBLAS handle (leaked in the original).
    cudaFree(d_u);
    cudaFree(d_v);
    cublasDestroy(handle);

    return 0;
}

下面的版本也许更接近你的使用场景:数据保存在 Thrust 的设备容器中,并在调用 cuBLAS 之前先用一个内核执行一些初始化:

#include <iostream> 
#include <thrust/device_vector.h> 
#include <thrust/complex.h> 
#include <thrust/execution_policy.h> 
#include <thrust/copy.h> 
#include <cublas_v2.h> 

// Single-thread initialisation kernel: perturbs two elements of the flat
// xmax * xmax matrix in u before the cuBLAS transpose. With (row, col)
// mapped to row*xmax + col, these are elements (49, 51) and (51, 49) —
// matching the u[49][51] / u[51][49] updates in the host-array examples.
__global__ void setup_kernel(thrust::complex<double>* u, int xmax) 
{ 
    // element (49, 51): 51 + 49*xmax == 49*xmax + 51
    u[51 + 49*xmax] += thrust::complex<double>(665.0,665.0); 
    // element (51, 49)
    u[49 + 51*xmax] *= 2.0; 
} 

int main()
{
    // Same transpose demo, but the data lives in thrust::device_vector and is
    // initialised by a kernel (setup_kernel) before the cuBLAS call.
    const int xmax = 100;  // const: avoids a non-standard variable-length array

    thrust::complex<double> alpha(1.0, 0.0);
    thrust::complex<double> beta(0.0, 0.0);
    cublasHandle_t handle;
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "cublasCreate failed" << std::endl;
        return 1;
    }

    // Device containers manage their own allocation lifetime (no cudaFree needed).
    thrust::device_vector<thrust::complex<double>> d_u(xmax * xmax, thrust::complex<double>(1.0,1.0));
    thrust::device_vector<thrust::complex<double>> d_v(xmax * xmax, thrust::complex<double>(0.,0.));
    setup_kernel<<<1,1>>>(thrust::raw_pointer_cast(d_u.data()), xmax);
    // Kernel launches are asynchronous: pick up launch errors explicitly.
    if (cudaGetLastError() != cudaSuccess) {
        std::cerr << "setup_kernel launch failed" << std::endl;
        return 1;
    }

    // thrust::complex<double> is binary compatible with cuDoubleComplex, so
    // reinterpret_cast of the raw device pointers is all cuBLAS needs.
    cuDoubleComplex* _d_u = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_u.data()));
    cuDoubleComplex* _d_v = reinterpret_cast<cuDoubleComplex*>(thrust::raw_pointer_cast(d_v.data()));
    cuDoubleComplex* _alpha = reinterpret_cast<cuDoubleComplex*>(&alpha);
    cuDoubleComplex* _beta = reinterpret_cast<cuDoubleComplex*>(&beta);

    // d_v = 1.0 * transpose(d_u) + 0.0 * d_u
    if (cublasZgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N, xmax, xmax,
                    _alpha, _d_u, xmax,
                    _beta, _d_u, xmax,
                    _d_v, xmax) != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "cublasZgeam failed" << std::endl;
        return 1;
    }

    // Host-side staging buffer for printing.
    thrust::complex<double> u[xmax][xmax];

    thrust::copy(d_u.begin(), d_u.end(), &u[0][0]);
    std::cout << "Before:" << std::endl;
    std::cout << u[49][51] << std::endl;
    std::cout << u[51][49] << std::endl;

    thrust::copy(d_v.begin(), d_v.end(), &u[0][0]);
    std::cout << "After:" << std::endl;
    std::cout << u[49][51] << std::endl;
    std::cout << u[51][49] << std::endl;

    // Destroy the cuBLAS handle (missing in the original).
    cublasDestroy(handle);

    return 0;
}
+0

非常感谢。我没有澄清它,但我需要在转置后的内核中使用这些运算符,并且为了清晰起见而省略。我希望理解你的概念能够帮助我实现这一目标。 –

+1

@MaxK:如果你想在设备上进行操作,使用'thrust :: complex'。它在功能上与'std :: complex'相同,但具有'__device__'绑定。我的答案中发布的代码基本上没有区别,无论您使用哪种类型。 – talonmies

相关问题