
MATLAB produces different results than cuBLAS + a custom kernel. I have the following MATLAB code:

[N, d] = size(X); % data size and dimensions 

R = rand(d,dt); % Form a random matrix with elements in [0,1] 

% Random projection 
Y = X * R; 

w=720; % hashing step 

b = w * rand(dt,1); 

% Compute the hash codes of the data 
binId = floor(bsxfun(@plus, Y, b')/w); 

and I am trying to parallelize it using cuBLAS and a kernel as follows:

__global__ void compute(const int N, const int dt, const int w, const float *old, int *newt){
    // One thread per element of the N x dt matrix, stored in column-major order
    int col = blockDim.y * blockIdx.y + threadIdx.y;
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    int id = row + N * col;
    if (row < N && col < dt){
        newt[id] = (int)floor(old[id] / w);
    }
}

void gpu_blas_mmul(cublasHandle_t handle, const float *A, const float *B, float *C, const int m, const int k, const int n, const float bet) { 
    int lda=m,ldb=k,ldc=m; 
    const float alf = 1.0; 
    const float *alpha = &alf; 
    const float *beta = &bet; 

    // Do the actual multiplication and addition 
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); 
} 

float *d_R, *d_RX, *d_B_row; 
int *d_H; 

thrust::device_vector<float> d_X(h_X, h_X + N * d); 

cudaMalloc(&d_R,d * dt * sizeof(float)); 
cudaMemcpy(d_R,h_R,d * dt * sizeof(float),cudaMemcpyHostToDevice); 

cudaMalloc(&d_B_row,dt * sizeof(float)); 
cudaMemcpy(d_B_row,h_B_row,dt * sizeof(float),cudaMemcpyHostToDevice); 

cudaMalloc(&d_RX,N * dt * sizeof(float)); 
cudaMalloc(&d_H,N * dt * sizeof(int)); 

//-------------------------CuBLAS----------------------- 

cublasHandle_t handle; 
cublasCreate(&handle); 

thrust::device_vector<float> d_B_col(N, w); 

gpu_blas_mmul(handle, thrust::raw_pointer_cast(&d_B_col[0]), d_B_row, d_RX, N, 1, dt,0.0); 

gpu_blas_mmul(handle, thrust::raw_pointer_cast(&d_X[0]), d_R, d_RX, N, d, dt, 1.0); 

cublasDestroy(handle); 

//-----------------------Kernel---------------------------- 
dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE,1); 
int linGrid1 = (int)ceil(N/(float)BLOCK_SIZE); 
int linGrid2 = (int)ceil(dt/(float)BLOCK_SIZE); 
dim3 gridSize(linGrid1,linGrid2,1); 
compute<<<gridSize, blockSize>>>(N, dt, w, d_RX, d_H); 

In h_X, h_R and h_B_row I have stored (in column-major order) the X, R and b generated by MATLAB. The dataset I am using is ANN_SIFT1M from http://corpus-texmex.irisa.fr/.

For about 10000 values the results produced are exactly the same, but when I try, for example, 50000 values, some differences appear, and they become more numerous as the number of values grows.
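For reference, the kind of check that makes the mismatches visible looks roughly like the sketch below; h_H_matlab is a hypothetical host array holding the binId values exported from MATLAB in column-major order, not a name from the code above.

// Hedged sketch: copy the GPU hash codes back and count how many entries
// disagree with the values computed by MATLAB (h_H_matlab is hypothetical).
std::vector<int> h_H(N * dt);
cudaMemcpy(h_H.data(), d_H, N * dt * sizeof(int), cudaMemcpyDeviceToHost);

int mismatches = 0;
for (int i = 0; i < N * dt; ++i)
    if (h_H[i] != h_H_matlab[i])
        ++mismatches;
printf("%d of %d hash codes differ\n", mismatches, N * dt);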

Any ideas about what I am doing wrong?

Answer


Your MATLAB code uses double precision, so its results are more accurate. In contrast, the CUDA code you provided uses single precision (type float) and therefore produces less accurate results. In general, discrepancies between single and double precision get worse as the size of the input data grows.
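To see why the discrepancy grows, note that each entry of Y is a dot product accumulated term by term; in single precision the rounding error of that accumulation drifts away from the double-precision value, and once the drift lands near a floor() boundary the integer hash code flips. A minimal standalone sketch (illustrative values only, not the actual SIFT data) that shows the drift:

#include <cstdio>
#include <cstdlib>

int main() {
    // Accumulate the same pseudo-random sum in float and in double and
    // watch the two results drift apart as the number of terms grows.
    float  sum_f = 0.0f;
    double sum_d = 0.0;
    for (int i = 1; i <= 1000000; ++i) {
        float x = (float)rand() / (float)RAND_MAX;  // value in [0,1], like MATLAB's rand
        sum_f += x * 0.73f;                         // single-precision accumulation
        sum_d += (double)x * 0.73;                  // double-precision accumulation
        if (i % 200000 == 0)
            printf("n=%7d  float=%.6f  double=%.6f  diff=%.3e\n",
                   i, sum_f, sum_d, (double)sum_f - sum_d);
    }
    return 0;
}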

The solution would be to use type double instead of float.
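As a rough sketch of what that change looks like (assuming h_X, h_R and h_B_row are exported from MATLAB as double rather than cast to single), the gemm calls switch to cublasDgemm and the kernel operates on double; the surrounding allocation and copy sizes change to sizeof(double) accordingly:

#include <cublas_v2.h>

// Double-precision variant of the kernel: same column-major indexing,
// but the input matrix is double, so floor() is evaluated in double.
__global__ void compute_d(const int N, const int dt, const int w,
                          const double *old, int *newt) {
    int col = blockDim.y * blockIdx.y + threadIdx.y;
    int row = blockDim.x * blockIdx.x + threadIdx.x;
    int id  = row + N * col;
    if (row < N && col < dt)
        newt[id] = (int)floor(old[id] / w);
}

// Double-precision variant of the multiply-and-add helper:
// C = 1.0 * A*B + bet * C, all operands in double precision.
void gpu_blas_mmul_d(cublasHandle_t handle, const double *A, const double *B,
                     double *C, const int m, const int k, const int n,
                     const double bet) {
    int lda = m, ldb = k, ldc = m;
    const double alf = 1.0;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                &alf, A, lda, B, ldb, &bet, C, ldc);
}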
