
Dense to sparse and sparse to dense conversions using cuSPARSE

The following program uses cuSPARSE to test dense-to-sparse conversion. It produces garbage in the first few lines of output. However, if I move the line marked (2) so that it comes right after the line marked (1), the program works correctly. Can anyone tell me what the cause might be?

Edit: To make the demonstration clearer, I rewrote the program with Thrust; the same problem persisted.

Edit: As Robert suggested, I changed back to the version without Thrust and added API-level error checking.

#include <iostream> 
#include <cusparse_v2.h> 

using std::cerr; 
using std::cout; 
using std::endl; 

#define WRAP(x) do {x} while (0) 
#define CHKcusparse(x) WRAP(          \ 
    cusparseStatus_t err = (x);          \ 
    if (err != CUSPARSE_STATUS_SUCCESS) {        \ 
    cerr << "Cusparse Error #" << int(err) << "\"TODO\" at Line " \ 
     << __LINE__ << " of " << __FILE__ << ": " << #x << endl; \ 
    exit(1);              \ 
    }                 \ 
) 
#define CHKcuda(x) WRAP(           \ 
    cudaError_t err = (x);            \ 
    if (err != cudaSuccess) {           \ 
    cerr << "Cuda Error #" << int(err) << ", \""      \ 
     << cudaGetErrorString(err) << "\" at Line " << __LINE__  \ 
     << " of " << __FILE__ << ": " << #x << endl;    \ 
    exit(1);               \ 
    }                 \ 
) 
#define ALLOC(X, T, N) do {       \ 
    h##X = (T*) malloc(sizeof(T) * (N));     \ 
    CHKcuda(cudaMalloc((void**)&d##X, sizeof(T) * (N))); \ 
} while(0) 

int main() { 
    srand(100); 

    cusparseHandle_t g_cusparse_handle; 
    CHKcusparse(cusparseCreate(&g_cusparse_handle)); 

    const int n = 100, in_degree = 10; 
    int nnz = n * in_degree, nn = n * n; 

    int *dnnz, *dridx, *dcols; 
    int *hnnz, *hridx, *hcols; 
    float *dvals, *dmat; 
    float *hvals, *hmat; 

    // (1) The number of non-zeros in each column. 
    ALLOC(nnz, int, n); 

    // The dense matrix. 
    ALLOC(mat, float, nn); 

    // The values in sparse matrix. 
    ALLOC(vals, float, nnz); 

    // (2) The row indices of the sparse matrix. 
    ALLOC(ridx, int, nnz); 

    // The column offsets of the sparse matrix. 
    ALLOC(cols, int, n+1); 

    // Fill and copy dense matrix and number of non-zeros. 
    for (int i = 0; i < nn; i++) {hmat[i] = rand();} 
    for (int i = 0; i < n; i++) {hnnz[i] = in_degree;} 
    CHKcuda(cudaMemcpyAsync(dnnz, hnnz, sizeof(int) * n, cudaMemcpyHostToDevice)); 
    CHKcuda(cudaMemcpyAsync(dmat, hmat, sizeof(float) * nn, cudaMemcpyHostToDevice)); 
    CHKcuda(cudaDeviceSynchronize()); 

    // Perform dense to CSC conversion. 
    cusparseMatDescr_t cspMatDesc; 
    CHKcusparse(cusparseCreateMatDescr(&cspMatDesc)); 
    CHKcusparse(cusparseSdense2csc( 
        g_cusparse_handle, n, n, cspMatDesc, dmat, n, 
        dnnz, dvals, dridx, dcols)); 

    // Copy row indices back. 
    CHKcuda(cudaMemcpyAsync(hridx, dridx, sizeof(int) * nnz, cudaMemcpyDeviceToHost)); 
    CHKcuda(cudaDeviceSynchronize()); 
    CHKcusparse(cusparseDestroyMatDescr(cspMatDesc)); 

    // Display row indices. 
    for (int i = 0; i < n; i++) { 
        for (int j = 0; j < in_degree; j++) { 
            std::cout << hridx[i * in_degree + j] << ", "; 
        } 
        std::cout << std::endl; 
    } 

    CHKcuda(cudaFree(dnnz)); 
    CHKcuda(cudaFree(dvals)); 
    CHKcuda(cudaFree(dridx)); 
    CHKcuda(cudaFree(dcols)); 
    CHKcuda(cudaFree(dmat)); 
    free(hnnz); 
    free(hmat); 
    free(hvals); 
    free(hridx); 
    free(hcols); 
    return 0; 
} 

No error checking? Before asking others for help, you should make use of basic API-level error checking for both the CUDA and cusparse APIs. You have cusparse functions returning errors regardless of where line (1) or line (2) is placed. You declare that the nnz per column is 10, but you are actually initializing your dense matrix with more than 10 nonzero elements per column, and that is what causes the dense-to-sparse conversion to blow up. Cusparse provides functions to precompute the nnz per column. But in your case you can simply make the error disappear by setting in_degree to 100 instead of 10. – Robert Crovella


Thanks for the reminder. I extracted this from a large code base to ask the question. I have tested these calls and they all return success. As for the dense matrix, my intent was to generate an n by n dense matrix and then convert it into a sparse matrix with 10 nonzero elements per column. Given that setup, is the way I call the conversion function correct? Or is there something I have misunderstood? – shaoyl85


@RobertCrovella Sorry, I see your point now. You mean that if my dense matrix has more than 10 nonzero elements in each column, I cannot call the conversion claiming 10 nonzeros per column? Doesn't the conversion automatically select the 10 largest nonzero elements? – shaoyl85

Answers


The basic problem is that you are passing internally inconsistent data to the dense-to-sparse routine. You are passing a dense matrix that has 100 nonzero elements per column, but you are telling cusparse that it has only 10 nonzero elements per column.

If you run your code with cuda-memcheck, you will see evidence of these errors.

For this code, you can fix the problem by changing your in_degree variable to 100.

For the general case, cusparse provides a convenient routine, cusparse&lt;t&gt;nnz(), to correctly populate the number of nonzero elements per column.
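
For illustration, a minimal sketch of what that could look like in the question's code, reusing its variable names (g_cusparse_handle, cspMatDesc, dmat, dnnz, n); the matrix descriptor would need to be created before this point:

int nnzTotal = 0; 
// Count the nonzeros per column (CUSPARSE_DIRECTION_COLUMN matches the 
// column-oriented dense2csc call) instead of hard-coding in_degree. 
CHKcusparse(cusparseSnnz( 
    g_cusparse_handle, CUSPARSE_DIRECTION_COLUMN, n, n, 
    cspMatDesc, dmat, n,   // dense matrix and its leading dimension 
    dnnz,                  // out: number of nonzeros in each column 
    &nnzTotal));           // out: total number of nonzeros 
// dvals and dridx should then be allocated with nnzTotal elements 
// before calling cusparseSdense2csc. 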


So I think my real problem is how to efficiently select, from each column, the k elements with the largest absolute values and set the remaining values to zero. Do you have any ideas? I think it boils down to a problem of many k-selections in parallel. – shaoyl85


The only thing that comes to mind is to do a [sort by key](http://thrust.github.io/doc/group__sorting.html#ga2bb765aeef19f6a04ca8b8ba11efff24) on each column (key = element absolute value, value = element index), then fill the top k original elements back into a zeroed column by value (element index). I suggest you ask it as a new SO question to solicit better ideas. – Robert Crovella
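
Following up on that suggestion, here is a minimal sketch of the per-column idea for a single device-resident column. The helper keepTopK and the functor AbsVal are hypothetical names (not library API), it assumes k <= n, and a full solution would loop over the columns or use a segmented sort:

#include <thrust/device_vector.h> 
#include <thrust/sequence.h> 
#include <thrust/sort.h> 
#include <thrust/fill.h> 
#include <thrust/gather.h> 
#include <thrust/scatter.h> 
#include <thrust/transform.h> 
#include <thrust/functional.h> 
#include <cmath> 

// Sort key: the absolute value of an element. 
struct AbsVal { 
    __host__ __device__ float operator()(float x) const { return fabsf(x); } 
}; 

// Keep the k largest-magnitude entries of one device-resident column 
// and zero out the rest (assumes k <= col.size()). 
void keepTopK(thrust::device_vector<float> &col, int k) { 
    const int n = col.size(); 
    thrust::device_vector<float> keys(n);  // |value| of each element 
    thrust::device_vector<int> idx(n);     // original element indices 
    thrust::transform(col.begin(), col.end(), keys.begin(), AbsVal()); 
    thrust::sequence(idx.begin(), idx.end()); 
    // Sort indices by descending absolute value 
    // (key = |value|, value = element index). 
    thrust::sort_by_key(keys.begin(), keys.end(), idx.begin(), 
                        thrust::greater<float>()); 
    // Save the k winners, zero the column, then scatter them back in place. 
    thrust::device_vector<float> top(k); 
    thrust::gather(idx.begin(), idx.begin() + k, col.begin(), top.begin()); 
    thrust::fill(col.begin(), col.end(), 0.0f); 
    thrust::scatter(top.begin(), top.end(), idx.begin(), col.begin()); 
} 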


As highlighted by Robert Crovella, the passage from dense to sparse can be effectively performed using cuSPARSE's cusparse&lt;t&gt;nnz() and cusparse&lt;t&gt;dense2csr() routines; the reverse can be done with the cusparse&lt;t&gt;csr2dense() routine. Below is a fully worked example showing how to go from dense to sparse and vice versa using cuSPARSE with the CSR format.

cuSparseUtilities.cuh

#ifndef CUSPARSEUTILITIES_CUH 
#define CUSPARSEUTILITIES_CUH 

#include "cusparse_v2.h" 

void setUpDescriptor(cusparseMatDescr_t &, cusparseMatrixType_t, cusparseIndexBase_t); 
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A, 
    int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA, 
    const cusparseHandle_t handle, const int Nrows, const int Ncols); 

#endif 

cuSparseUtilities.cu

#include "cuSparseUtilities.cuh" 
#include "Utilities.cuh" 

/*****************************/ 
/* SETUP DESCRIPTOR FUNCTION */ 
/*****************************/ 
void setUpDescriptor(cusparseMatDescr_t &descrA, cusparseMatrixType_t matrixType, cusparseIndexBase_t indexBase) { 
    cusparseSafeCall(cusparseCreateMatDescr(&descrA)); 
    cusparseSafeCall(cusparseSetMatType(descrA, matrixType)); 
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, indexBase)); 
} 

/********************************************************/ 
/* DENSE TO SPARSE CONVERSION FOR REAL DOUBLE PRECISION */ 
/********************************************************/ 
void dense2SparseD(const double * __restrict__ d_A_dense, int **d_nnzPerVector, double **d_A, 
        int **d_A_RowIndices, int **d_A_ColIndices, int &nnz, cusparseMatDescr_t descrA, 
        const cusparseHandle_t handle, const int Nrows, const int Ncols) { 

    const int lda = Nrows;      // --- Leading dimension of dense matrix 

    gpuErrchk(cudaMalloc(&d_nnzPerVector[0], Nrows * sizeof(int))); 

    // --- Compute the number of nonzero elements per row and the total number of nonzero elements in the dense d_A_dense 
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], &nnz)); 

    // --- Device side sparse matrix 
    gpuErrchk(cudaMalloc(&d_A[0], nnz * sizeof(double))); 
    gpuErrchk(cudaMalloc(&d_A_RowIndices[0], (Nrows + 1) * sizeof(int))); 
    gpuErrchk(cudaMalloc(&d_A_ColIndices[0], nnz * sizeof(int))); 

    cusparseSafeCall(cusparseDdense2csr(handle, Nrows, Ncols, descrA, d_A_dense, lda, d_nnzPerVector[0], d_A[0], d_A_RowIndices[0], d_A_ColIndices[0])); 

} 

kernel.cu

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include <stdio.h> 

#include <cusparse_v2.h> 

#include "cuSparseUtilities.cuh" 
#include "Utilities.cuh" 

/********/ 
/* MAIN */ 
/********/ 
int main() { 

    cusparseHandle_t handle; 

    // --- Initialize cuSPARSE 
    cusparseSafeCall(cusparseCreate(&handle)); 

    cusparseMatDescr_t descrA = 0; 

    /**************************/ 
    /* SETTING UP THE PROBLEM */ 
    /**************************/ 
    const int Nrows = 5;      // --- Number of rows 
    const int Ncols = 4;      // --- Number of columns 
    const int N = Nrows; 

    // --- Host side dense matrix 
    double *h_A_dense = (double*)malloc(Nrows * Ncols * sizeof(*h_A_dense)); 

    // --- Column-major storage 
    h_A_dense[ 0] = 0.4612f; h_A_dense[ 5] = -0.0006f; h_A_dense[10] = 1.3f;  h_A_dense[15] = 0.0f; 
    h_A_dense[ 1] = 0.0f;  h_A_dense[ 6] = 1.443f;  h_A_dense[11] = 0.0f;  h_A_dense[16] = 0.0f; 
    h_A_dense[ 2] = -0.0006f; h_A_dense[ 7] = 0.4640f; h_A_dense[12] = 0.0723f; h_A_dense[17] = 0.0f; 
    h_A_dense[ 3] = 0.3566f; h_A_dense[ 8] = 0.0723f; h_A_dense[13] = 0.7543f; h_A_dense[18] = 0.0f; 
    h_A_dense[ 4] = 0.f;  h_A_dense[ 9] = 0.0f;  h_A_dense[14] = 0.0f;  h_A_dense[19] = 0.1f; 

    // --- Create device array and copy host array to it 
    double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, Nrows * Ncols * sizeof(double))); 
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, Nrows * Ncols * sizeof(*d_A_dense), cudaMemcpyHostToDevice)); 

    /*******************************/ 
    /* FROM DENSE TO SPARSE MATRIX */ 
    /*******************************/ 
    // --- Descriptor for sparse matrix A 
    setUpDescriptor(descrA, CUSPARSE_MATRIX_TYPE_GENERAL, CUSPARSE_INDEX_BASE_ONE); 

    int nnz = 0;        // --- Number of nonzero elements in dense matrix 
    int *d_nnzPerVector;      // --- Device side number of nonzero elements per row 

    double *d_A;        // --- Sparse matrix values - array of size nnz 
    int *d_A_RowIndices;      // --- "Row indices" 
    int *d_A_ColIndices;      // --- "Column indices" 

    dense2SparseD(d_A_dense, &d_nnzPerVector, &d_A, &d_A_RowIndices, &d_A_ColIndices, nnz, descrA, handle, Nrows, Ncols); 

    /*******************************************************/ 
    /* CHECKING THE RESULTS FOR DENSE TO SPARSE CONVERSION */ 
    /*******************************************************/ 
    // --- Host side number of nonzero elements per row 
    int *h_nnzPerVector = (int *)malloc(Nrows * sizeof(int)); 
    gpuErrchk(cudaMemcpy(h_nnzPerVector, d_nnzPerVector, Nrows * sizeof(int), cudaMemcpyDeviceToHost)); 

    printf("Number of nonzero elements in dense matrix = %i\n\n", nnz); 
    for (int i = 0; i < Nrows; ++i) printf("Number of nonzero elements in row %i = %i \n", i, h_nnzPerVector[i]); 
    printf("\n"); 

    // --- Host side sparse matrix 
    double *h_A = (double *)malloc(nnz * sizeof(double)); 
    int *h_A_RowIndices = (int *)malloc((Nrows + 1) * sizeof(int)); 
    int *h_A_ColIndices = (int *)malloc(nnz * sizeof(int)); 
    gpuErrchk(cudaMemcpy(h_A, d_A, nnz * sizeof(double), cudaMemcpyDeviceToHost)); 
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (Nrows + 1) * sizeof(int), cudaMemcpyDeviceToHost)); 
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnz * sizeof(int), cudaMemcpyDeviceToHost)); 

    printf("\nOriginal matrix in CSR format\n\n"); 
    for (int i = 0; i < nnz; ++i) printf("A[%i] = %f\n", i, h_A[i]); printf("\n"); 

    printf("\n"); 
    for (int i = 0; i < (Nrows + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]); printf("\n"); 

    for (int i = 0; i < nnz; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]); 

    /*******************************/ 
    /* FROM SPARSE TO DENSE MATRIX */ 
    /*******************************/ 
    double *d_A_denseReconstructed; gpuErrchk(cudaMalloc(&d_A_denseReconstructed, Nrows * Ncols * sizeof(double))); 
    cusparseSafeCall(cusparseDcsr2dense(handle, Nrows, Ncols, descrA, d_A, d_A_RowIndices, d_A_ColIndices, 
             d_A_denseReconstructed, Nrows)); 

    /*******************************************************/ 
    /* CHECKING THE RESULTS FOR SPARSE TO DENSE CONVERSION */ 
    /*******************************************************/ 
    double *h_A_denseReconstructed = (double *)malloc(Nrows * Ncols * sizeof(double)); 
    gpuErrchk(cudaMemcpy(h_A_denseReconstructed, d_A_denseReconstructed, Nrows * Ncols * sizeof(double), cudaMemcpyDeviceToHost)); 

    printf("\nReconstructed dense matrix \n"); 
    for (int m = 0; m < Nrows; m++) { 
     for (int n = 0; n < Ncols; n++) 
      printf("%f\t", h_A_denseReconstructed[n * Nrows + m]); 
     printf("\n"); 
    } 

    return 0; 
}
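
Note that the listings above include Utilities.cuh for gpuErrchk and cusparseSafeCall, which is not shown. A minimal stand-in that makes the example self-contained could look like this (an assumed sketch, not the original file):

// Utilities.cuh (minimal stand-in) 
#ifndef UTILITIES_CUH 
#define UTILITIES_CUH 

#include <cstdio> 
#include <cstdlib> 
#include <cuda_runtime.h> 
#include <cusparse_v2.h> 

// Abort with file/line information on a CUDA runtime error. 
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } 
inline void gpuAssert(cudaError_t code, const char *file, int line) { 
    if (code != cudaSuccess) { 
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
        exit(code); 
    } 
} 

// Abort with file/line information on a cuSPARSE error. 
#define cusparseSafeCall(err) cusparseAssert((err), __FILE__, __LINE__) 
inline void cusparseAssert(cusparseStatus_t err, const char *file, int line) { 
    if (err != CUSPARSE_STATUS_SUCCESS) { 
        fprintf(stderr, "CUSPARSE error %d in file '%s', line %d\n", int(err), file, line); 
        exit(1); 
    } 
} 

#endif 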