Reduce matrix columns with CUDA
3

I have a matrix and I want to compute its column-wise means (which boils down to column sums) with CUDA, in the fastest way possible: that is, return a row vector containing the mean of each column of the matrix. My sum-reduction implementation for computing the sum of a single column vector looks like this:

template<typename T>
__global__ void kernelSum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
    extern __shared__ T sdata[];

    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;

    // load input into __shared__ memory
    T x = 0.0;
    if (tid < n) {
        x = input[tid];
    }
    sdata[threadIdx.x] = x;
    __syncthreads();

    // contiguous range pattern
    for(int offset = blockDim.x/2; offset > 0; offset >>= 1) {
        if(threadIdx.x < offset) {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }
        // wait until all threads in the block have
        // updated their partial sums
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0) {
        per_block_results[blockIdx.x] = sdata[0];
    }
}

and it is invoked as:

int n = ... // vector size 
const int BLOCK_SIZE = 1024; 
int number_of_blocks = (n + BLOCK_SIZE - 1)/BLOCK_SIZE; 
double* per_block_results = NULL; 
cudaMalloc((void**) &per_block_results, sizeof(double)*(number_of_blocks + 1)); 
// launch one kernel to compute, per-block, a partial sum 
kernelSum<double> <<<number_of_blocks, BLOCK_SIZE, BLOCK_SIZE*sizeof(double)>>>(a, per_block_results, n); 
// launch a single block to compute the sum of the partial sums 
kernelSum<double> <<<1, number_of_blocks, number_of_blocks*sizeof(double)>>>(per_block_results, per_block_results + number_of_blocks, number_of_blocks); 

I can generalize this kernel to matrices with any number of columns, but I am limited by shared memory. My GPU has compute capability 3.5, so it has 48KB of shared memory and a maximum block size of 1024, i.e. threads per block. Since I am interested in double precision, I can fit at most 48*1024/8 = 6144 doubles in shared memory. Since the reduction is done per block, I can have at most 6144 (doubles in shared memory)/1024 (block size) = 6 columns whose sum reductions I can compute simultaneously. Reducing the block size would allow more columns to be computed simultaneously, e.g. 6144 (doubles in shared memory)/512 (block size) = 12.
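
For reference, here is a minimal host-side sketch of that arithmetic (nothing measured, just the compute-capability-3.5 limits quoted above):

#include <cstdio>

int main()
{
    const size_t smem_bytes = 48 * 1024;                        // shared memory per block (cc 3.5)
    const size_t doubles_in_smem = smem_bytes / sizeof(double); // 48*1024/8 = 6144

    // one shared-memory slot per thread, so the number of columns that can be
    // reduced simultaneously is the slot count divided by the block size
    for (size_t block = 1024; block >= 256; block /= 2)
        std::printf("block size %4zu -> %zu columns at once\n",
                    block, doubles_in_smem / block);
    return 0;
}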

Would this more elaborate strategy beat a simple CPU loop over every column of the matrix that invokes the sum reduction? Is there a better way to do this?

+1

A simple alternative is to cast your problem as a matrix-vector multiplication between your matrix and a vector of all '1's, using 'cublas gemv()'. – JackOLantern

+1

Indeed, one solution is, given a matrix A, to compute A^T * 1, which gives the column sums. Please elaborate this into an answer and I will accept it. On the downside, spending so many FLOPs on this feels like a waste: I would be abusing the GEMV kernel, since no multiplication is actually needed. –

+1

@talonmies has answered your specific question. It is true that you would be somewhat abusing 'cublas gemv()' by loading a dummy vector of '1's and multiplying the elements of matrix 'A' by them. But cuBLAS routines are highly optimized, so it is worth assessing whether your own implementation is really faster than the 'naive' cuBLAS approach we are talking about. Perhaps you might be interested in benchmarking it against talonmies' solution and posting the results here... – JackOLantern
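
A minimal cudaEvent-based timing harness for such a comparison might look like the sketch below; time_launch() is a hypothetical helper, and the two launches being compared (the hand-written kernel and the cublasSgemv() call) are assumed to be wrapped in the callable passed to it:

// Hypothetical helper: times an arbitrary launch with CUDA events.
template <typename Launch>
float time_launch(Launch launch)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    launch();                    // e.g. a kernelSum launch or a cublasSgemv call
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);  // block until the launch has finished
    float ms = 0.f;
    cudaEventElapsedTime(&ms, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return ms;
}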

Answers

3

What is stopping you from doing something like this:

template<typename T>
__global__ void kernelSum(const T* __restrict__ input,
                          T* __restrict__ per_block_results,
                          const size_t lda, const size_t n)
{
    extern __shared__ T sdata[];

    // Accumulate per thread partial sum
    T x = 0.0;
    const T * p = &input[blockIdx.x * lda];
    for(int i = threadIdx.x; i < n; i += blockDim.x) {
        x += p[i];
    }

    // load partial sum into __shared__ memory
    sdata[threadIdx.x] = x;
    __syncthreads();

    // contiguous range pattern
    for(int offset = blockDim.x/2; offset > 0; offset >>= 1) {
        if(threadIdx.x < offset) {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }
        // wait until all threads in the block have
        // updated their partial sums
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0) {
        per_block_results[blockIdx.x] = sdata[0];
    }
}

[Standard disclaimer: written in a browser, never compiled or tested, use at your own risk]

That is, for the shared-memory reduction you only need one entry in sdata per thread in the block. Each thread sums as many values as necessary to cover the whole column of the input. Then there is no shared-memory restriction, and you can sum columns of any size with the same block size.


EDIT: Apparently the idea of using per-thread partial sums is new to you, so here is a complete example to study:

#include <iostream> 

template<typename T>
__global__
void kernelSum(const T* __restrict__ input,
               const size_t lda, // pitch of input in words of sizeof(T)
               T* __restrict__ per_block_results,
               const size_t n)
{
    extern __shared__ T sdata[];

    T x = 0.0;
    const T * p = &input[blockIdx.x * lda];
    // Accumulate per thread partial sum
    for(int i = threadIdx.x; i < n; i += blockDim.x) {
        x += p[i];
    }

    // load thread partial sum into shared memory
    sdata[threadIdx.x] = x;
    __syncthreads();

    for(int offset = blockDim.x/2; offset > 0; offset >>= 1) {
        if(threadIdx.x < offset) {
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0) {
        per_block_results[blockIdx.x] = sdata[0];
    }
}


int main(void)
{
    const int m = 10000, n = 16;
    float * a = new float[m*n];

    for(int i=0; i<(m*n); i++) { a[i] = (float)(i%10); }

    float *a_;
    size_t size_a = m * n * sizeof(float);
    cudaMalloc((void **)&a_, size_a);
    cudaMemcpy(a_, a, size_a, cudaMemcpyHostToDevice);

    float *b_;
    size_t size_b = n * sizeof(float);
    cudaMalloc((void **)&b_, size_b);

    // select the number of warps per block according to the size of the
    // column and launch one block per column. Probably makes sense
    // to have at least a 4:1 ratio of column size to block size
    dim3 blocksize(256);
    dim3 gridsize(n);
    size_t shmsize = sizeof(float) * (size_t)blocksize.x;
    kernelSum<float><<<gridsize, blocksize, shmsize>>>(a_, m, b_, m);

    float * b = new float[n];
    cudaMemcpy(b, b_, size_b, cudaMemcpyDeviceToHost);

    for(int i=0; i<n; i++) {
        std::cout << i << " " << b[i] << std::endl;
    }

    cudaDeviceReset();

    return 0;
}

You should experiment with the block size relative to your matrix to get the best performance, but in general, the more work per thread the kernel does, the better the overall performance will be (because the shared-memory reduction is quite expensive). You can see one approach to block and grid size heuristics for a similarly memory-bandwidth-bound problem in this answer.
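
One way to run that experiment is to sweep power-of-two block sizes (the reduction loop requires a power of two) and time each launch. A rough, untested sketch reusing the names from the example above (a_, b_, m, n):

// Sweep block sizes and report the elapsed time of each launch.
for (int bs = 64; bs <= 1024; bs *= 2) {
    cudaEvent_t t0, t1;
    cudaEventCreate(&t0); cudaEventCreate(&t1);
    cudaEventRecord(t0);
    kernelSum<float><<<n, bs, bs * sizeof(float)>>>(a_, m, b_, m);
    cudaEventRecord(t1);
    cudaEventSynchronize(t1);
    float ms = 0.f;
    cudaEventElapsedTime(&ms, t0, t1);
    std::cout << "block size " << bs << ": " << ms << " ms" << std::endl;
    cudaEventDestroy(t0); cudaEventDestroy(t1);
}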

+0

Sorry, could you provide a sample kernel invocation? I cannot see how it works. –

+1

@GiovanniAzua: see the code in the edit. Judging by your other question about that memset kernel, it seems you may need to spend some time studying the whole 'multiple data items per thread' design pattern. It is a very powerful technique for improving the performance of simple memory-bound operations like this one. – talonmies

+0

Thank you! Very helpful. –

2

As an alternative to the answer already provided by talonmies, here I report four other approaches to column reduction, three of them based on CUDA Thrust and one based on cublas<t>gemv(), as suggested in my comment above.

The CUDA Thrust approach is analogous to Reduce matrix rows with CUDA, with an implicit transposition obtained by

thrust::make_permutation_iterator(d_matrix.begin(),
        thrust::make_transform_iterator(thrust::make_counting_iterator(0),
                (_1 % Nrows) * Ncols + _1/Nrows))

The full code follows:

#include <cublas_v2.h> 

#include <thrust/host_vector.h> 
#include <thrust/device_vector.h> 
#include <thrust/generate.h> 
#include <thrust/reduce.h> 
#include <thrust/functional.h> 
#include <thrust/random.h> 
#include <thrust/sequence.h> 

#include <stdio.h> 
#include <iostream> 

#include "Utilities.cuh" 
#include "TimingGPU.cuh" 

using namespace thrust::placeholders; 

// --- Required for approach #2 
__device__ float *vals; 

/**************************************************************/ 
/* CONVERT LINEAR INDEX TO ROW INDEX - NEEDED FOR APPROACH #1 */ 
/**************************************************************/ 
template <typename T> 
struct linear_index_to_row_index : public thrust::unary_function<T,T> { 

    T Ncols; // --- Number of columns 

    __host__ __device__ linear_index_to_row_index(T Ncols) : Ncols(Ncols) {} 

    __host__ __device__ T operator()(T i) { return i/Ncols; } 
}; 

/******************************************/
/* COL_REDUCTION - NEEDED FOR APPROACH #2 */
/******************************************/
struct col_reduction { 

    const int Nrows; // --- Number of rows 
    const int Ncols; // --- Number of cols 

    col_reduction(int _Nrows, int _Ncols) : Nrows(_Nrows), Ncols(_Ncols) {} 

    __device__ float operator()(float& x, int& y) { 
     float temp = 0.f; 
     for (int i = 0; i<Nrows; i++) { 
      temp += vals[y + (i*Ncols)]; 
     } 
     return temp; 
    } 
}; 

/**************************/ 
/* NEEDED FOR APPROACH #3 */ 
/**************************/ 
template<typename T> 
struct MulC: public thrust::unary_function<T, T> 
{ 
    T C; 
    __host__ __device__ MulC(T c) : C(c) { } 
    __host__ __device__ T operator()(T x) { return x * C; } 
}; 

/********/ 
/* MAIN */ 
/********/ 
int main() 
{ 
    const int Nrows = 5;  // --- Number of rows 
    const int Ncols = 8;  // --- Number of columns 

    // --- Random uniform integer distribution between 10 and 99 
    thrust::default_random_engine rng; 
    thrust::uniform_int_distribution<int> dist(10, 99); 

    // --- Matrix allocation and initialization 
    thrust::device_vector<float> d_matrix(Nrows * Ncols); 
    for (size_t i = 0; i < d_matrix.size(); i++) d_matrix[i] = (float)dist(rng); 

    TimingGPU timerGPU; 

    /***************/ 
    /* APPROACH #1 */ 
    /***************/ 
    timerGPU.StartCounter(); 
    // --- Allocate space for column sums and indices
    thrust::device_vector<float> d_col_sums(Ncols); 
    thrust::device_vector<int> d_col_indices(Ncols); 

    // --- Compute column sums by summing values with equal column indices
    thrust::reduce_by_key(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)), 
          thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols), 
          thrust::make_permutation_iterator(
           d_matrix.begin(), 
           thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1/Nrows)), 
          d_col_indices.begin(), 
          d_col_sums.begin(), 
          thrust::equal_to<int>(), 
          thrust::plus<float>()); 

    //thrust::reduce_by_key(
    //    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)),
    //    thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols),
    //    thrust::make_permutation_iterator(
    //        d_matrix.begin(),
    //        thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1/Nrows)),
    //    thrust::make_discard_iterator(),
    //    d_col_sums.begin());

    printf("Timing for approach #1 = %f\n", timerGPU.GetCounter()); 

    // --- Print result 
    for(int j = 0; j < Ncols; j++) { 
     std::cout << "[ "; 
     for(int i = 0; i < Nrows; i++) 
      std::cout << d_matrix[i * Ncols + j] << " "; 
     std::cout << "] = " << d_col_sums[j] << "\n"; 
    } 

    /***************/ 
    /* APPROACH #2 */ 
    /***************/ 
    timerGPU.StartCounter(); 
    thrust::device_vector<float> d_col_sums_2(Ncols, 0); 
    float *s_vals = thrust::raw_pointer_cast(&d_matrix[0]); 
    gpuErrchk(cudaMemcpyToSymbol(vals, &s_vals, sizeof(float *))); 
    thrust::transform(d_col_sums_2.begin(), d_col_sums_2.end(), thrust::counting_iterator<int>(0), d_col_sums_2.begin(), col_reduction(Nrows, Ncols)); 

    printf("Timing for approach #2 = %f\n", timerGPU.GetCounter()); 

    for(int j = 0; j < Ncols; j++) { 
     std::cout << "[ "; 
     for(int i = 0; i < Nrows; i++) 
      std::cout << d_matrix[i * Ncols + j] << " "; 
     std::cout << "] = " << d_col_sums_2[j] << "\n"; 
    } 

    /***************/ 
    /* APPROACH #3 */ 
    /***************/ 

    timerGPU.StartCounter(); 
    thrust::device_vector<float> d_col_sums_3(Ncols, 0); 
    thrust::device_vector<float> d_temp(Nrows * Ncols); 
    thrust::inclusive_scan_by_key(
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)), 
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), linear_index_to_row_index<int>(Nrows)) + (Nrows*Ncols), 
       thrust::make_permutation_iterator(
         d_matrix.begin(), 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0),(_1 % Nrows) * Ncols + _1/Nrows)), 
       d_temp.begin()); 
    thrust::copy(
       thrust::make_permutation_iterator(
         d_temp.begin() + Nrows - 1, 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))), 
       thrust::make_permutation_iterator(
         d_temp.begin() + Nrows - 1, 
         thrust::make_transform_iterator(thrust::make_counting_iterator(0), MulC<int>(Nrows))) + Ncols, 
       d_col_sums_3.begin()); 

    printf("Timing for approach #3 = %f\n", timerGPU.GetCounter()); 

    for(int j = 0; j < Ncols; j++) { 
     std::cout << "[ "; 
     for(int i = 0; i < Nrows; i++) 
      std::cout << d_matrix[i * Ncols + j] << " "; 
     std::cout << "] = " << d_col_sums_3[j] << "\n"; 
    } 

    /***************/ 
    /* APPROACH #4 */ 
    /***************/ 
    cublasHandle_t handle; 

    timerGPU.StartCounter(); 
    cublasSafeCall(cublasCreate(&handle)); 

    thrust::device_vector<float> d_col_sums_4(Ncols); 
    thrust::device_vector<float> d_ones(Nrows, 1.f); 

    float alpha = 1.f; 
    float beta = 0.f; 
    cublasSafeCall(cublasSgemv(handle, CUBLAS_OP_N, Ncols, Nrows, &alpha, thrust::raw_pointer_cast(d_matrix.data()), Ncols, 
           thrust::raw_pointer_cast(d_ones.data()), 1, &beta, thrust::raw_pointer_cast(d_col_sums_4.data()), 1)); 

    printf("Timing for approach #4 = %f\n", timerGPU.GetCounter()); 

    for(int j = 0; j < Ncols; j++) { 
     std::cout << "[ "; 
     for(int i = 0; i < Nrows; i++) 
      std::cout << d_matrix[i * Ncols + j] << " "; 
     std::cout << "] = " << d_col_sums_4[j] << "\n"; 
    } 

    return 0; 
}
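
Note that the question actually asked for column means rather than sums; once the sums are available, a single elementwise division by the number of rows finishes the job. A minimal sketch using the names from approach #1:

// Divide each column sum by the number of rows to obtain the column means.
thrust::transform(d_col_sums.begin(), d_col_sums.end(),
        d_col_sums.begin(),
        _1/(float)Nrows);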