2013-06-27 26 views
-1

我有以下的matlab代码;如何在Thrust或Cublas的两个矩阵W和X之间找到所有行距矩阵的行?

tempx = full(sum(X.^2, 2)); 
tempc = full(sum(C.^2, 2).'); 
D = -2*(X * C.'); 
D = bsxfun(@plus, D, tempx); 
D = bsxfun(@plus, D, tempc); 

其中X是nxm,W是kxm矩阵。一个是数据,另一个是权重矩阵。我用给定的代码找到距离矩阵D.我正在观察这种操作的高效Cublas或Thrust实施。我以cublas继承D = -2*(X * C.');这行,但剩下的部分仍然是一个新手的问​​题?任何人都可以帮助一个片段或提出建议?

这是我到目前为止: 编辑:我添加了一些更多的代码,我需要像summation实现bsxfun。将所有列的矢量V与所有行的和V2相加,作为最后一步。

#include <stdio.h> 
#include <stdlib.h> 
#include <math.h> 
#include <cuda_runtime.h> 
#include "cublas_v2.h" 
#include <algorithm> 
#include <cuda.h> 

#include <thrust/host_vector.h> 
#include <thrust/device_vector.h> 
#include <thrust/transform.h> 
#include <thrust/functional.h> 
#include <thrust/system_error.h> 
#include <thrust/sequence.h> 
#include <thrust/copy.h> 

#define N 4 
#define K 4 
#define M 3 

template <typename T> 
struct square_op{ 
    __host__ __device__ T operator()(const T& x)const{ 
     return x*x; 
    } 
}; 


int main (void){ 
    cudaError_t cudaStat; 
    cublasStatus_t stat; 
    cublasHandle_t handle; 

    stat = cublasCreate(&handle); 
    if (stat != CUBLAS_STATUS_SUCCESS){ 
     printf("CUBLAS initialization failure!!\n"); 
     return EXIT_FAILURE; 
    } 
    // Fill with random data 
    thrust::host_vector<float> C_h(N*K); 

    thrust::host_vector<float> A_h(N*M);  //data matrix 
    thrust::host_vector<float> B_h(K*M);  //weight matrix 
    thrust::sequence(A_h.begin(),A_h.end()); 
    thrust::sequence(B_h.begin(),B_h.end()); 
    // std::generate(A_h.begin(), A_h.end(), rand); 
    // std::generate(B_h.begin(), B_h.end(), rand); 

    thrust::device_vector<float> A_d = A_h; 
    thrust::device_vector<float> B_d = B_h; 
    thrust::device_vector<float> C_d(N*K); 
    thrust::device_vector<float> dummy_x(M,1); 
    thrust::device_vector<float> A_sum_vec_d(N,0); 
    thrust::device_vector<float> B_sum_vec_d(K,0); 

    // TEST variables 
    thrust::host_vector<float> A_sum_vec_h(N,0); 
    thrust::host_vector<float> B_sum_vec_h(K,0); 

    for (int i = 0; i < N; ++i) { 
     for (int j = 0; j < M; ++j) { 
      printf("%f ",A_h[i*M+j]); 
     } 
     printf("\n"); 
    } 

    printf("\n"); 

    for (int i = 0; i < K; ++i) { 
     for (int j = 0; j < M; ++j) { 
      printf("%f ",B_h[i*M+j]); 
     } 
     printf("\n"); 
    } 

    printf("\n"); 

    std::cout<< "Starting GPU run" <<std::endl; //add this line 
    cudaEvent_t start, stop; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 
    cudaEventRecord(start, 0); 

    //************************************ 
    // Calculate Square Elements 
    //************************************ 
    square_op<float> unary_op = square_op<float>(); 
    thrust::transform(A_d.begin(),A_d.end(),A_d.begin(),unary_op); 
    thrust::transform(B_d.begin(),B_d.end(),B_d.begin(),unary_op); 

    // TEST 
    thrust::copy(A_d.begin(),A_d.end(), A_h.begin()); 
    printf("Matrix A after square!!\n"); 
    for (int i = 0; i < N; ++i) { 
     for (int j = 0; j < M; ++j) { 
      printf("%f ",A_h[i*M+j]); 
     } 
     printf("\n"); 
    } 

    printf("\n"); 

    thrust::copy(B_d.begin(),B_d.end(), B_h.begin()); 
    printf("Matrix B after square!!\n"); 
    for (int i = 0; i < K; ++i) { 
     for (int j = 0; j < M; ++j) { 
      printf("%f ",B_h[i*M+j]); 
     } 
     printf("\n"); 
    } 

    printf("\n"); 

    //************************************ 
    // Sum of the Rows 
    //************************************ 
    float alpha = 1.0f; 
    float beta = 0.0f; 

    stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,N,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&A_sum_vec_d[0]),1); 
    if (stat != CUBLAS_STATUS_SUCCESS){ 
     printf("1 CUBLAS initialization failure!!\n"); 
     return EXIT_FAILURE; 
    } 

    stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,K,&alpha,thrust::raw_pointer_cast(&B_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&B_sum_vec_d[0]),1); 
    if (stat != CUBLAS_STATUS_SUCCESS){ 
     printf("2 CUBLAS initialization failure!!\n"); 
     return EXIT_FAILURE; 
    } 

    // TEST 
    thrust::copy(A_sum_vec_d.begin(), A_sum_vec_d.end(), A_sum_vec_h.begin()); 

    printf("A_vec after row sum!!\n"); 
    for (int j = 0; j < N; ++j) { 
     printf("%f ",A_sum_vec_h[j]); 
    } 
    printf("\n \n"); 

    thrust::copy(B_sum_vec_d.begin(), B_sum_vec_d.end(), B_sum_vec_h.begin()); 

    printf("B_vec after row sum!!\n"); 
    for (int j = 0; j < K; ++j) { 
     printf("%f ",B_sum_vec_h[j]); 
    } 
    printf("\n \n"); 



    //************************************ 
    // Matrix Multiplication 
    //************************************ 
    alpha = 2.0f; 
    beta = 0.0f; 
    //alpha*(A*B')+beta in row_major_order 
    stat = cublasSgemm_v2(handle,CUBLAS_OP_T,CUBLAS_OP_N,N,K,M,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&B_d[0]), M, &beta,thrust::raw_pointer_cast(&C_d[0]), N); 
    if (stat != CUBLAS_STATUS_SUCCESS){ 
     printf("CUBLAS initialization failure!!\n"); 
     return EXIT_FAILURE; 
    } 

    cudaEventRecord(stop, 0); 
    cudaEventSynchronize(stop); 
    float elapsedTime; 
    float totalTime; 
    cudaEventElapsedTime(&elapsedTime, start, stop); 
    cudaEventDestroy(start); 
    cudaEventDestroy(stop); 
    totalTime = elapsedTime/1000; 
    std::cout<<"Elapsed_time:"<< totalTime<<std::endl; 

    //copy back data 
    thrust::copy(C_d.begin(),C_d.end(),C_h.begin()); 

    for (int i = 0; i < N; ++i) { 
     for (int j = 0; j < K; ++j) { 
      printf("%f ",C_h[i*K+j]); 
     } 
     printf("\n"); 
    } 

    //************************************ 
    // Final summation 
    //************************************ 
    //.... NEED CODE 

    if (stat != CUBLAS_STATUS_SUCCESS){ 
     printf("CUBLAS initialization failure!!\n"); 
     return EXIT_FAILURE; 
    } 

    printf("Execution ends!!\n"); 
    return EXIT_SUCCESS; 
} 

回答

2

这是我的最终代码,正如我的描述。它可能不是最有效率的,但至少起作用。

#include <stdio.h> 
#include <stdlib.h> 
#include <math.h> 
#include <cuda_runtime.h> 
#include "cublas_v2.h" 
#include <algorithm> 
#include <cuda.h> 

#include <thrust/host_vector.h> 
#include <thrust/device_vector.h> 
#include <thrust/transform.h> 
#include <thrust/functional.h> 
#include <thrust/system_error.h> 
#include <thrust/sequence.h> 
#include <thrust/device_free.h> 
#include <thrust/reduce.h> 
#include <thrust/copy.h> 

#define N 4 
#define K 4 
#define M 3 


//---------------------------- 
// Find min argument of array 
// --------------------------- 

// C-style indexing 
int ci(int row, int column, int nColumns) { 
    return row*nColumns+column; 
} 

// Convert a linear index to a row index 
template <typename T> 
struct linear_index_to_row_index : public thrust::unary_function<T,T> 
{ 
    T C; // number of columns 

    __host__ __device__ 
    linear_index_to_row_index(T C) : C(C) {} 

    __host__ __device__ 
    T operator()(T i) 
    { 
     return i/C; 
    } 
}; 

typedef thrust::tuple<int,float> argMinType; 

struct argMin : public thrust::binary_function<argMinType,argMinType,argMinType> 
{ 
    __host__ __device__ 
    argMinType operator()(const argMinType& a, const argMinType& b) const 
    { 
     if (thrust::get<1>(a) < thrust::get<1>(b)){ 
      return a; 
     } else { 
      return b; 
     } 
    } 
}; 

thrust::device_vector<argMinType> minsOfRowSpace(thrust::device_vector<float> A, int nRows, int nColumns) { 
    // allocate storage for row argmins and indices 
    thrust::device_vector<argMinType> row_argmins(nRows); 
    thrust::device_vector<int> row_indices(nRows); 

    // compute row argmins by finding argmin values with equal row indices 
    thrust::reduce_by_key 
    (thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(nColumns)), 
      thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(nColumns)) + (nRows*nColumns), 
      thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(0),A.begin())), 
      row_indices.begin(), 
      row_argmins.begin(), 
      thrust::equal_to<int>(), 
      argMin()); 
    return row_argmins; 
} 


template <typename T> 
struct square_op{ 
    __host__ __device__ T operator()(const T& x)const{ 
     return x*x; 
    } 
}; 



int main (void){ 
    cublasStatus_t stat; 
    cublasHandle_t handle; 

    stat = cublasCreate(&handle); 
    if (stat != CUBLAS_STATUS_SUCCESS){ 
     printf("CUBLAS initialization failure!!\n"); 
     return EXIT_FAILURE; 
    } 


    // Fill with random data 
    thrust::host_vector<float> C_h(N*K); 
    thrust::host_vector<argMinType> minOfC_h(N); 
    thrust::host_vector<float> A_h(N*M);  //data matrix 
    thrust::host_vector<float> B_h(K*M);  //weight matrix 
    thrust::sequence(A_h.begin(),A_h.end()); 
    thrust::sequence(B_h.begin(),B_h.end()); 
    // std::generate(A_h.begin(), A_h.end(), rand); 
    // std::generate(B_h.begin(), B_h.end(), rand); 

    thrust::device_vector<float> A_d = A_h; 
    thrust::device_vector<float> B_d = B_h; 
    thrust::device_vector<float> C_d(N*K); 
    thrust::device_vector<float> dummy_x(M,1); 
    thrust::device_vector<float> A_sum_vec_d(N,0); 
    thrust::device_vector<float> B_sum_vec_d(K,0); 
    thrust::device_vector<argMinType> minsOfC_d(N); 

    // TEST variables 
    thrust::host_vector<float> A_sum_vec_h(N,0); 
    thrust::host_vector<float> B_sum_vec_h(K,0); 



    // TEST 
    // for (int i = 0; i < N; ++i) { 
    //  for (int j = 0; j < M; ++j) { 
    //   printf("%f ",A_h[i*M+j]); 
    //  } 
    //  printf("\n"); 
    // } 
    // 
    // printf("\n"); 
    // 
    // for (int i = 0; i < K; ++i) { 
    //  for (int j = 0; j < M; ++j) { 
    //   printf("%f ",B_h[i*M+j]); 
    //  } 
    //  printf("\n"); 
    // } 
    // 
    // printf("\n"); 

    std::cout<< "Starting GPU run" <<std::endl; //add this line 
    cudaEvent_t start, stop; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 
    cudaEventRecord(start, 0); 


    //************************************ 
    // Matrix Multiplication 
    //************************************ 
    float alpha = -2.0f; 
    float beta = 0.0f; 
    //alpha*(A*B')+beta in row_major_order 
    stat = cublasSgemm_v2(handle,CUBLAS_OP_T,CUBLAS_OP_N,N,K,M,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&B_d[0]), M, &beta,thrust::raw_pointer_cast(&C_d[0]), N); 
    if (stat != CUBLAS_STATUS_SUCCESS){ 
     printf("CUBLAS initialization failure!!\n"); 
     return EXIT_FAILURE; 
    } 

    //TEST 
    // C_h = C_d; 
    // printf("After Matrix Multiplicaton\n"); 
    // for (int i = 0; i < N; ++i) { 
    //  for (int j = 0; j < K; ++j) { 
    //   printf("%f ",C_h[i*K+j]); 
    //  } 
    //  printf("\n"); 
    // } 
    // printf("\n"); 

    //************************************ 
    // Calculate Square Elements 
    //************************************ 
    try{ 
     square_op<float> unary_op = square_op<float>(); 
     thrust::transform(A_d.begin(),A_d.end(),A_d.begin(),unary_op); 
     thrust::transform(B_d.begin(),B_d.end(),B_d.begin(),unary_op); 
    }catch(thrust::system_error &e) 
    { 
     // output an error message and exit 
     std::cerr << "Error Transform square elements: " << e.what() << std::endl; 
     exit(-1); 
    } 

    // TEST 
    // thrust::copy(A_d.begin(),A_d.end(), A_h.begin()); 
    // printf("Matrix A after square!!\n"); 
    // for (int i = 0; i < N; ++i) { 
    //  for (int j = 0; j < M; ++j) { 
    //   printf("%f ",A_h[i*M+j]); 
    //  } 
    //  printf("\n"); 
    // } 
    // 
    // printf("\n"); 
    // 
    // thrust::copy(B_d.begin(),B_d.end(), B_h.begin()); 
    // printf("Matrix B after square!!\n"); 
    // for (int i = 0; i < K; ++i) { 
    //  for (int j = 0; j < M; ++j) { 
    //   printf("%f ",B_h[i*M+j]); 
    //  } 
    //  printf("\n"); 
    // } 
    // 
    // printf("\n"); 

    //************************************ 
    // Sum of the Rows 
    //************************************ 
    alpha = 1.0f; 
    beta = 0.0f; 

    stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,N,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&A_sum_vec_d[0]),1); 
    if (stat != CUBLAS_STATUS_SUCCESS){ 
     printf("1 CUBLAS initialization failure!!\n"); 
     return EXIT_FAILURE; 
    } 

    stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,K,&alpha,thrust::raw_pointer_cast(&B_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&B_sum_vec_d[0]),1); 
    if (stat != CUBLAS_STATUS_SUCCESS){ 
     printf("2 CUBLAS initialization failure!!\n"); 
     return EXIT_FAILURE; 
    } 

    //thrust::device_free(thrust::raw_pointer_cast(&dummy_x[0])); 

    // TEST 
    // thrust::copy(A_sum_vec_d.begin(), A_sum_vec_d.end(), A_sum_vec_h.begin()); 
    // 
    // printf("A_vec after row sum!!\n"); 
    // for (int j = 0; j < N; ++j) { 
    //  printf("%f ",A_sum_vec_h[j]); 
    // } 
    // printf("\n \n"); 
    // 
    // thrust::copy(B_sum_vec_d.begin(), B_sum_vec_d.end(), B_sum_vec_h.begin()); 
    // 
    // printf("B_vec after row sum!!\n"); 
    // for (int j = 0; j < K; ++j) { 
    //  printf("%f ",B_sum_vec_h[j]); 
    // } 
    // printf("\n \n"); 

    //************************************ 
    // Final summation 
    //************************************ 
    thrust::device_vector<float> dummy_y(N,1); 
    alpha = 1.0f; 

    // Column-wise sum 
    stat = cublasSger_v2(handle,K,N,&alpha,thrust::raw_pointer_cast(&dummy_y[0]),1,thrust::raw_pointer_cast(&A_sum_vec_d[0]),1,thrust::raw_pointer_cast(&C_d[0]),K); 

    if (stat != CUBLAS_STATUS_SUCCESS){ 
     printf("CUBLAS final summation failure!!\n"); 
     return EXIT_FAILURE; 
    } 

    // Row-wise sum 
    stat = cublasSger_v2(handle,K,N,&alpha,thrust::raw_pointer_cast(&B_sum_vec_d[0]),1,thrust::raw_pointer_cast(&dummy_y[0]),1,thrust::raw_pointer_cast(&C_d[0]),K); 

    if (stat != CUBLAS_STATUS_SUCCESS){ 
     printf("CUBLAS final summation failure!!\n"); 
     return EXIT_FAILURE; 
    } 

    //thrust::device_free(&dummy_y[0]); 

    // TEST 
    // printf("C matrix after first final sum\n"); 
    // C_h = C_d; 
    // for (int i = 0; i < N; ++i) { 
    //  for (int j = 0; j < K; ++j) { 
    //   printf("%f ",C_h[i*K+j]); 
    //  } 
    //  printf("\n"); 
    // } 
    // printf("\n"); 

    //***************************** 
    // Find min elements 
    // **************************** 
    minsOfC_d=minsOfRowSpace(C_d, N, K); 
    //thrust::copy(minsOfC_d.begin(),minsOfC_d.end(),minOfC_h.begin()); 
    minOfC_h = minsOfC_d; 

    // TEST 
    // for (size_t i = 0; i < N; i++){ 
    //  for (size_t j = 0; j < 1; j++){ 
    //   argMinType tmp=minOfC_h[ci(i,j,1)]; 
    //   std::cout << thrust::get<0>(tmp) << " "; 
    //  } 
    //  std::cout << "\n"; 
    // } 
    // std::cout << "\n"; 

    cudaEventRecord(stop, 0); 
    cudaEventSynchronize(stop); 
    float elapsedTime; 
    float totalTime; 
    cudaEventElapsedTime(&elapsedTime, start, stop); 
    cudaEventDestroy(start); 
    cudaEventDestroy(stop); 
    totalTime = elapsedTime/1000; 
    std::cout<<"Elapsed_time:"<< totalTime<<std::endl; 


    printf("Execution ends!!\n"); 
    return EXIT_SUCCESS; 
} 
相关问题