
I wrote a kernel that computes the minimum and maximum of roughly 100,000 floats using a reduction (see the code below). Each thread block reduces a chunk of 1024 values down to a single value in shared memory, and the final reduction across blocks is done on the CPU. The min/max over floats in CUDA is slower than the CPU version. Why?

I compared it against a purely serial computation on the CPU. The CUDA version takes 2.2 ms, while the CPU version takes 0.21 ms. Why is the CUDA version so much slower? Is the array not large enough to benefit from parallelism, or is my code simply not optimized?

This is part of an exercise from the Udacity parallel programming course. I run it through their website, so I don't know exactly what the hardware is, but they claim the code runs on a real GPU.

Here is the CUDA code:

__global__ void min_max_kernel(const float* const d_logLuminance, 
          const size_t length, 
          float* d_min_logLum, 
          float* d_max_logLum) { 
    // Shared working memory 
    extern __shared__ float sh_logLuminance[]; 

    int blockWidth = blockDim.x; 
    int x = blockDim.x * blockIdx.x + threadIdx.x; 

    float* min_logLuminance = sh_logLuminance; 
    float* max_logLuminance = sh_logLuminance + blockWidth; 

    // Copy this block's chunk of the data to shared memory 
    // We copy twice so we compute min and max at the same time 
    if (x < length) { 
     min_logLuminance[threadIdx.x] = d_logLuminance[x]; 
     max_logLuminance[threadIdx.x] = min_logLuminance[threadIdx.x]; 
    } 
    else { 
     // Pad if we're out of range 
     min_logLuminance[threadIdx.x] = FLT_MAX; 
     max_logLuminance[threadIdx.x] = -FLT_MAX; 
    } 

    __syncthreads(); 

    // Reduce 
    for (int s = blockWidth/2; s > 0; s /= 2) { 
     if (threadIdx.x < s) { 
      if (min_logLuminance[threadIdx.x + s] < min_logLuminance[threadIdx.x]) { 
       min_logLuminance[threadIdx.x] = min_logLuminance[threadIdx.x + s]; 
      } 

      if (max_logLuminance[threadIdx.x + s] > max_logLuminance[threadIdx.x]) { 
       max_logLuminance[threadIdx.x] = max_logLuminance[threadIdx.x + s]; 
      } 
     } 

     __syncthreads(); 
    } 

    // Write to global memory 
    if (threadIdx.x == 0) { 
     d_min_logLum[blockIdx.x] = min_logLuminance[0]; 
     d_max_logLum[blockIdx.x] = max_logLuminance[0]; 
    } 
} 

size_t get_num_blocks(size_t inputLength, size_t threadsPerBlock) { 
    return inputLength/threadsPerBlock + 
     ((inputLength % threadsPerBlock == 0) ? 0 : 1); 
} 

/* 
* Compute min, max over the data by first reducing on the device, then 
* doing the final reduction on the host. 
*/ 
void compute_min_max(const float* const d_logLuminance, 
        float& min_logLum, 
        float& max_logLum, 
        const size_t numRows, 
        const size_t numCols) { 
    // Compute min, max 
    printf("\n=== computing min/max ===\n"); 
    const size_t blockWidth = 1024; 
    const size_t numPixels = numRows * numCols; 
    size_t numBlocks = get_num_blocks(numPixels, blockWidth); 

    printf("Num min/max blocks = %d\n", numBlocks); 

    float* d_min_logLum; 
    float* d_max_logLum; 
    int alloc_size = sizeof(float) * numBlocks; 
    checkCudaErrors(cudaMalloc(&d_min_logLum, alloc_size)); 
    checkCudaErrors(cudaMalloc(&d_max_logLum, alloc_size)); 

    min_max_kernel<<<numBlocks, blockWidth, sizeof(float) * blockWidth * 2>>> 
     (d_logLuminance, numPixels, d_min_logLum, d_max_logLum); 

    float* h_min_logLum = (float*) malloc(alloc_size); 
    float* h_max_logLum = (float*) malloc(alloc_size); 
    checkCudaErrors(cudaMemcpy(h_min_logLum, d_min_logLum, alloc_size, cudaMemcpyDeviceToHost)); 
    checkCudaErrors(cudaMemcpy(h_max_logLum, d_max_logLum, alloc_size, cudaMemcpyDeviceToHost)); 

    min_logLum = FLT_MAX; 
    max_logLum = -FLT_MAX; 

    // Reduce over the block results 
    // (would be a bit faster to do it on the GPU, but it's just 96 numbers) 
    for (int i = 0; i < numBlocks; i++) { 
     if (h_min_logLum[i] < min_logLum) { 
      min_logLum = h_min_logLum[i]; 
     } 
     if (h_max_logLum[i] > max_logLum) { 
      max_logLum = h_max_logLum[i]; 
     } 
    } 

    printf("min_logLum = %.2f\nmax_logLum = %.2f\n", min_logLum, max_logLum); 

    checkCudaErrors(cudaFree(d_min_logLum)); 
    checkCudaErrors(cudaFree(d_max_logLum)); 
    free(h_min_logLum); 
    free(h_max_logLum); 
} 

Here is the host version:

void compute_min_max_on_host(const float* const d_logLuminance, size_t numPixels) { 
    int alloc_size = sizeof(float) * numPixels; 
    float* h_logLuminance = (float*) malloc(alloc_size); 
    checkCudaErrors(cudaMemcpy(h_logLuminance, d_logLuminance, alloc_size, cudaMemcpyDeviceToHost)); 
    float host_min_logLum = FLT_MAX; 
    float host_max_logLum = -FLT_MAX; 
    printf("HOST "); 
    for (int i = 0; i < numPixels; i++) { 
     if (h_logLuminance[i] < host_min_logLum) { 
      host_min_logLum = h_logLuminance[i]; 
     } 
     if (h_logLuminance[i] > host_max_logLum) { 
      host_max_logLum = h_logLuminance[i]; 
     } 
    } 
    printf("host_min_logLum = %.2f\nhost_max_logLum = %.2f\n", 
     host_min_logLum, host_max_logLum); 
    free(h_logLuminance); 
} 

Why not just run the code with different array sizes and see whether the relative performance difference changes? – talonmies
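A minimal sketch of how such a timing experiment could be set up, assuming the same launch configuration and checkCudaErrors helper as in the question; CUDA events time only the device-side work and exclude the PCI Express copies:

    // Time only the kernel with CUDA events (a host timer would also include
    // launch overhead and any copies issued around the kernel).
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCudaErrors(cudaEventRecord(start));
    min_max_kernel<<<numBlocks, blockWidth, sizeof(float) * blockWidth * 2>>>
        (d_logLuminance, numPixels, d_min_logLum, d_max_logLum);
    checkCudaErrors(cudaEventRecord(stop));
    checkCudaErrors(cudaEventSynchronize(stop));

    float elapsed_ms = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&elapsed_ms, start, stop));
    printf("kernel time = %.3f ms for %zu elements\n", elapsed_ms, numPixels);

    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));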

Answer

  1. As @talonmies suggests, the behavior may well be different at larger scales; 100,000 elements is really not that many: most of them fit in the combined L1 caches of a modern CPU's cores, and half of them fit in a single core's L2 cache.
  2. Transfers over PCI Express take time; in your case possibly about twice as long as necessary, since you are not using pinned memory. You are also not overlapping computation with PCI Express I/O (not that it would make much sense for only 100,000 elements). A sketch of using pinned host buffers follows this list.
  3. Your kernel is rather slow, for more than one reason; not the least of which is the extensive use of shared memory, most of which is unnecessary.
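For point 2, a minimal sketch of what pinned (page-locked) staging buffers might look like in compute_min_max, assuming the same alloc_size and checkCudaErrors helper as in the question; cudaMallocHost and cudaFreeHost replace malloc and free:

    // Page-locked host buffers transfer at full PCI Express bandwidth and are
    // required if you later want asynchronous (overlapped) copies.
    float* h_min_logLum;
    float* h_max_logLum;
    checkCudaErrors(cudaMallocHost(&h_min_logLum, alloc_size));
    checkCudaErrors(cudaMallocHost(&h_max_logLum, alloc_size));

    checkCudaErrors(cudaMemcpy(h_min_logLum, d_min_logLum, alloc_size, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_max_logLum, d_max_logLum, alloc_size, cudaMemcpyDeviceToHost));

    // ... final host-side reduction over the per-block results, as before ...

    checkCudaErrors(cudaFreeHost(h_min_logLum));
    checkCudaErrors(cudaFreeHost(h_max_logLum));

Pinned memory matters more as transfer sizes grow, and it is a prerequisite for overlapping copies with computation via cudaMemcpyAsync and streams.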

More generally: always profile your code with nvvp (or nvprof, for textual output you can analyze further).


Thanks! Could you give me some pointers on why my kernel is slow? I thought shared memory was very fast. I assume you mean I should somehow use registers instead of shared memory. Is that right? –


@ GuyGur-Ari:想想你的内核为每个输入元素执行的指令数量。你真的需要把它降下来。让每个线程自己处理大量更多的输入元素,并最终执行线程间交互。此外,使用基于洗牌的减少。 – einpoklum