标题:CUDA中的稀疏矩阵向量乘法。我正在尝试在GPU上(使用CUDA)实现矩阵向量乘法。在我的C++代码(CPU端)中,我先把矩阵作为稠密矩阵加载,然后使用CUDA执行矩阵向量乘法。我还使用了共享内存来提高性能。
- 已知我的矩阵是稀疏矩阵,如何以高效的方式加载它?
下面是我用来加载矩阵的C++函数:
/**
 * Reads a matrix (or vector) from a text file into a newly allocated dense array.
 *
 * File layout as parsed here: a header "h w num_entries" followed by up to
 * num_entries triples "y x val". The indices are used as zero-based
 * (y = row, x = column). Cells that are not listed in the file are 0.
 *
 * @param filename  Path of the file to read.
 * @param matrix    Output: set to a new[]-allocated array of w*h floats that the
 *                  caller must release with delete[]. Left untouched when the
 *                  function returns false.
 * @param dim       Optional output: receives max(w, h) when not NULL.
 * @param majority  ROW_MAJOR stores an entry at [w*y + x], COLUMN_MAJOR at
 *                  [h*x + y]; any other value stores nothing.
 * @return true (1) when the file was opened and its header parsed (even if the
 *         entry list turned out to be defective), false (0) otherwise.
 */
int readMatrix(char* filename, float* &matrix, unsigned int *dim = NULL, int majority = ROW_MAJOR)
{
    std::ifstream file(filename);
    if (!file) {
        std::cout << "Unable to open file\n";
        return false;
    }

    // Reject a missing/garbled header instead of allocating from uninitialized sizes.
    unsigned int w = 0, h = 0, num_entries = 0;
    if (!(file >> h >> w >> num_entries)) {
        std::cout << "Unable to read matrix header\n";
        return false;
    }
    std::cout << w << " " << h << " " << num_entries << "\n";
    assert(w == h || w == 1 || h == 1);
    if (dim != NULL) *dim = std::max(w, h);

    // Compute the element count in size_t so w*h cannot wrap in 32-bit arithmetic,
    // and value-initialize so that cells absent from the entry list are 0.
    const std::size_t total = static_cast<std::size_t>(w) * h;
    matrix = new float[total]();

    unsigned int i = 0;
    for (; i < num_entries; i++) {
        unsigned int x = 0, y = 0;
        float val = 0.0f;
        // Test the extraction itself: checking eof() before reading would store
        // and count one stale entry after the input runs out.
        if (!(file >> y >> x >> val)) break;
        // An index outside the declared shape would write out of bounds;
        // stop here so the file is reported as defective below.
        if (y >= h || x >= w) break;
        if (majority == ROW_MAJOR) {
            matrix[static_cast<std::size_t>(w) * y + x] = val;
        } else if (majority == COLUMN_MAJOR) {
            matrix[static_cast<std::size_t>(h) * x + y] = val;
        }
    }
    file.close();

    if (i == num_entries)
        std::cout << "\nFile read successfully\n";
    else
        std::cout << "\nFile read successfully but seems defective:\n num entries read = " << i << ", entries epected = " << num_entries << "\n";

    // Dump the stored values: one line per group of w consecutive elements for
    // a square matrix, a single line for a vector.
    if (w == h) {
        for (unsigned int row = 0; row < h; row++) {
            printf("\n");
            for (unsigned int col = 0; col < w; col++) {
                printf("%.2f ", matrix[static_cast<std::size_t>(w) * row + col]);
            }
        }
    } else {
        printf("\n");
        // Iterate over all w*h elements so a 1 x N row vector is printed in
        // full, not only an N x 1 column vector.
        for (std::size_t j = 0; j < total; j++) {
            printf("%.2f ", matrix[j]);
        }
    }
    return true;
}
下面是我用来执行该矩阵-向量乘法的CUDA内核函数:
/**
 * Dense matrix-vector product x = A * b; each thread with idx < dim computes
 * one output element x[idx].
 *
 * @param A    Matrix, read as A[idx * dim + i] (dim consecutive floats per output element).
 * @param b    Input vector of dim floats.
 * @param x    Output vector of dim floats.
 * @param dim  Number of rows/columns of A and length of b and x.
 *
 * The kernel stages b in the dynamically sized shared array `vec`, so the
 * launch must provide at least dim * sizeof(float) bytes of dynamic shared
 * memory.
 */
__global__ void
_cl_matrix_vector_(float *A, float *b, float *x, int dim)
{
    extern __shared__ float vec[];
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Cooperatively copy b into shared memory: each thread handles elements
    // threadIdx.x, threadIdx.x + blockDim.x, ... The i < dim bound keeps both
    // the read of b and the write to vec inside their first dim elements
    // (an unbounded copy of whole blockDim.x-sized rounds would overrun both).
    for (int i = threadIdx.x; i < dim; i += blockDim.x) {
        vec[i] = b[i];
    }

    // Every thread of the block must reach the barrier, so it stays outside
    // the idx < dim guard below.
    __syncthreads();

    if (idx < dim) {
        // Dot product of this thread's row of A with the staged vector.
        float temp = 0.0f;
        for (int i = 0; i < dim; i++) {
            temp += A[idx * dim + i] * vec[i];
        }
        x[idx] = temp;
    }
}
- 考虑到我的矩阵是稀疏矩阵,我需要对CUDA代码做哪些必要的修改?
- 我在论坛上了解到还可以使用填充(padding)来优化性能,但这需要我改变读取矩阵/排列矩阵的方式。结合我目前读取矩阵和执行计算的方式,该如何实现这种填充?有什么建议吗?
正确答案完全取决于稀疏矩阵的存储格式。请参阅http://www.nvidia.com/object/nvidia_research_pub_001.html,其中讨论了GPU上不同稀疏格式的优点。 – talonmies 2011-05-12 09:38:59