
I am trying to achieve "3-way overlap" using 3 streams, as shown in the examples of the CUDA streams and concurrency webinar, but I could not manage it; it looks like the false dependency problem of the Fermi architecture.

I have a GeForce GT 550M (Fermi architecture with a single copy engine), and I am using Windows 7 (64 bit).

This is the code I have written:

#include <iostream> 

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

// includes, project 
#include "helper_cuda.h" 
#include "helper_functions.h" // helper utility functions 

#include <stdio.h> 

using namespace std; 

#define DATA_SIZE 6000000 
#define NUM_THREADS 32 
#define NUM_BLOCKS 16 
#define NUM_STREAMS 3 

__global__ void kernel(const int *in, int *out, int dataSize) 
{ 
    int start = blockIdx.x * blockDim.x + threadIdx.x; 
    int end = dataSize; 
    for (int i = start; i < end; i += blockDim.x * gridDim.x) 
    { 
     out[i] = in[i] * in[i]; 
    } 
} 

int main() 
{ 
    const int dataSize = DATA_SIZE; 
    int *h_in = new int[dataSize]; 
    int *h_out = new int[dataSize]; 
    int *h_groundTruth = new int[dataSize]; 

    // Input population 
    for(int i = 0; i < dataSize; i++) 
     h_in[i] = 5; 

    for(int i = 0; i < dataSize; i++) 
     h_out[i] = 0; 

    // CPU calculation for ground truth 
    for(int i = 0; i < dataSize; i++) 
     h_groundTruth[i] = h_in[i] * h_in[i]; 

    // Choose which GPU to run on, change this on a multi-GPU system. 
    checkCudaErrors(cudaSetDevice(0)); 

    int *d_in = 0; 
    int *d_out = 0; 
    int streamSize = dataSize/NUM_STREAMS; 
    size_t memSize = dataSize * sizeof(int); 
    size_t streamMemSize = memSize/NUM_STREAMS; 

    checkCudaErrors(cudaMalloc((void **)&d_in, memSize)); 
    checkCudaErrors(cudaMalloc((void **)&d_out, memSize)); 

    // registers host memory as page-locked (required for asynch cudaMemcpyAsync) 
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable)); 
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable)); 

    // set kernel launch config 
    dim3 nThreads = dim3(NUM_THREADS,1,1); 
    dim3 nBlocks = dim3(NUM_BLOCKS,1,1); 

    cout << "GPU Kernel Configuration : " << endl; 
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl; 
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl; 
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl; 

    // create cuda stream 
    cudaStream_t streams[NUM_STREAMS]; 
    for(int i = 0; i < NUM_STREAMS; i++) 
     checkCudaErrors(cudaStreamCreate(&streams[i])); 

    // create cuda event handles 
    cudaEvent_t start, stop; 
    checkCudaErrors(cudaEventCreate(&start)); 
    checkCudaErrors(cudaEventCreate(&stop)); 

    cudaEventRecord(start, 0); 

    // overlapped execution using version 2 

    for(int i = 0; i < NUM_STREAMS; i++) 
    { 
     int offset = i * streamSize; 
     cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice,  streams[i]); 
    } 

    //cudaMemcpy(d_in, h_in, memSize, cudaMemcpyHostToDevice); 

    for(int i = 0; i < NUM_STREAMS; i++) 
    { 
     int offset = i * streamSize; 
     dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x/2)); 

     //kernel<<<nBlocks, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize); 
     kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2); 
     kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2); 
    } 

    for(int i = 0; i < NUM_STREAMS; i++) 
    { 
     int offset = i * streamSize; 
     cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]); 
    } 



    for(int i = 0; i < NUM_STREAMS; i++) 
     checkCudaErrors(cudaStreamSynchronize(streams[i])); 

    cudaEventRecord(stop, 0); 

    checkCudaErrors(cudaStreamSynchronize(0)); 

    checkCudaErrors(cudaDeviceSynchronize()); 

    float gpu_time = 0; 
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop)); 


    // release resources 
    checkCudaErrors(cudaEventDestroy(start)); 
    checkCudaErrors(cudaEventDestroy(stop)); 
    checkCudaErrors(cudaHostUnregister(h_in)); 
    checkCudaErrors(cudaHostUnregister(h_out)); 
    checkCudaErrors(cudaFree(d_in)); 
    checkCudaErrors(cudaFree(d_out)); 

    for(int i = 0; i < NUM_STREAMS; i++) 
     checkCudaErrors(cudaStreamDestroy(streams[i])); 

    cudaDeviceReset(); 

    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl; 


    // GPU output check 
    int sum = 0; 
    for(int i = 0; i < dataSize; i++)  
     sum += h_groundTruth[i] - h_out[i]; 

    cout << "Error between CPU and GPU: " << sum << endl; 

    delete[] h_in; 
    delete[] h_out; 
    delete[] h_groundTruth; 

    return 0; 
} 

Profiling with Nsight, I get this result:

[Nsight timeline: 3 streams on the GT 550M]

This seems correct, but why does the D2H transfer of stream #1 start only with the last kernel launch of stream #2, and not before? I also tried to achieve the same 3-way overlap with 8 streams (just by changing NUM_STREAMS to 8); here is the result:

[Nsight timeline: 8 streams on the GT 550M]

Interestingly, when I use 8 streams, the overlap between computation and memory transfers seems to be much better.

What is the cause of this problem? Is it due to the WDDM driver, or is there something wrong with my program?
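(To check which driver model the card runs under, cudaDeviceProp::tccDriver can be queried; a minimal sketch, separate from the timing code above:)

// Minimal sketch: report whether device 0 runs under the TCC or the WDDM driver.
// cudaDeviceProp::tccDriver is 1 for a Tesla board using the TCC driver and 0
// otherwise (i.e. WDDM on Windows for a GeForce card such as the GT 550M).
#include <cstdio> 
#include "cuda_runtime.h" 

int main() 
{ 
    cudaDeviceProp prop; 
    cudaGetDeviceProperties(&prop, 0); 
    printf("%s: %s driver model\n", prop.name, prop.tccDriver ? "TCC" : "WDDM"); 
    return 0; 
} 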


I think this is a false dependency issue; see also the [Hyper-Q Example](http://docs.nvidia.com/cuda/samples/6_Advanced/simpleHyperQ/doc/HyperQ.pdf). – JackOLantern


Can you clarify what you mean by 3-way overlap? Do you want to see a D2H memcpy, a compute kernel, and an H2D memcpy all executing at the same time? I believe that is the usual meaning of 3-way overlap. –


@RobertCrovella If I comment out the second kernel call, my timeline (running on a GeForce 540M) shows that the D2H of the first stream starts right after the last H2D of the third stream (with no idle time between the memory transfers). I think the OP would like to observe the same behavior. However, in the timeline above, the D2H of the first stream starts only after the second kernel call of the second stream has ended, rather than right after the H2D of the third stream. This is my understanding of his post. – JackOLantern

Answer


From the comments above, it seems that the OP's problem is a false dependency issue, which affects the Fermi architecture and is solved by the Hyper-Q feature of the Kepler architecture. To summarize, the OP points out that the first D2H transfer (stream #1) does not start immediately after the last H2D (stream #3) has finished, although in principle it could. The time gap is highlighted by the red circle in the figure below (here and in the following, unless otherwise stated, all tests refer to a GeForce GT540M, which belongs to the Fermi family):

[Timeline on the GT540M: the gap before the first D2H is highlighted by a red circle]
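As a side note, Hyper-Q is available on GK110-class Kepler devices (compute capability 3.5 and above, such as the K20c used below), while Fermi cards like the GT540M/GT550M lack it; a minimal sketch of how availability can be inferred from the compute capability:

// Minimal sketch: infer Hyper-Q availability from the compute capability
// (Hyper-Q was introduced with GK110 Kepler GPUs, compute capability 3.5).
#include <cstdio> 
#include "cuda_runtime.h" 

int main() 
{ 
    cudaDeviceProp prop; 
    cudaGetDeviceProperties(&prop, 0); 
    bool hasHyperQ = (prop.major > 3) || (prop.major == 3 && prop.minor >= 5); 
    printf("%s (cc %d.%d): Hyper-Q %s\n", prop.name, prop.major, prop.minor, 
        hasHyperQ ? "available" : "not available"); 
    return 0; 
} 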

The OP's approach is a breadth-first one, which issues the operations according to the following scheme:

for(int i = 0; i < NUM_STREAMS; i++) 
    cudaMemcpyAsync(..., cudaMemcpyHostToDevice, streams[i]); 

for(int i = 0; i < NUM_STREAMS; i++) 
{ 
    kernel_launch_1<<<..., 0, streams[i]>>>(...); 
    kernel_launch_2<<<..., 0, streams[i]>>>(...); 
} 

for(int i = 0; i < NUM_STREAMS; i++) 
    cudaMemcpyAsync(..., cudaMemcpyDeviceToHost, streams[i]); 

With a depth-first approach, the operations are issued according to the following scheme:

for(int i = 0; i < NUM_STREAMS; i++) 
{ 
    cudaMemcpyAsync(...., cudaMemcpyHostToDevice, streams[i]); 

    kernel_launch_1<<<...., 0, streams[i]>>>(....); 
    kernel_launch_2<<<...., 0, streams[i]>>>(....); 

    cudaMemcpyAsync(...., cudaMemcpyDeviceToHost, streams[i]); 
} 

The depth-first approach does not seem to improve the situation, according to the timeline below (the depth-first code is reported at the bottom of this answer); if anything, it seems to show worse overlap:

[Timeline: depth-first approach on the GT540M]

Following the breadth-first approach, and commenting out the second kernel launch, the first D2H copy starts as soon as it can, as reported by the timeline below:

[Timeline: breadth-first approach with a single kernel launch per stream on the GT540M]
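For reference, a minimal sketch of this modified issue order (breadth-first with a single kernel launch per stream, reusing the names from the OP's code):

// Breadth-first issue order with one kernel launch per stream: on Fermi this
// lets the first D2H copy start right after the last H2D copy has finished.
for (int i = 0; i < NUM_STREAMS; i++) 
{ 
    int offset = i * streamSize; 
    cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]); 
} 

for (int i = 0; i < NUM_STREAMS; i++) 
{ 
    int offset = i * streamSize; 
    kernel<<<nBlocks, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize); 
} 

for (int i = 0; i < NUM_STREAMS; i++) 
{ 
    int offset = i * streamSize; 
    cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]); 
} 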

Finally, running the code on a Kepler K20c, the problem does not show up, as illustrated by the figure below:

[Timeline: the same code on a Kepler K20c]

Here is the code for the depth-first approach:

#include <iostream> 

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

// includes, project 
#include "helper_cuda.h" 
#include "helper_functions.h" // helper utility functions 

#include <stdio.h> 

using namespace std; 

#define DATA_SIZE 6000000 
#define NUM_THREADS 32 
#define NUM_BLOCKS 16 
#define NUM_STREAMS 3 

__global__ void kernel(const int *in, int *out, int dataSize) 
{ 
    int start = blockIdx.x * blockDim.x + threadIdx.x; 
    int end = dataSize; 
    for (int i = start; i < end; i += blockDim.x * gridDim.x) 
    { 
     out[i] = in[i] * in[i]; 
    } 
} 

int main() 
{ 
    const int dataSize = DATA_SIZE; 
    int *h_in = new int[dataSize]; 
    int *h_out = new int[dataSize]; 
    int *h_groundTruth = new int[dataSize]; 

    // Input population 
    for(int i = 0; i < dataSize; i++) 
     h_in[i] = 5; 

    for(int i = 0; i < dataSize; i++) 
     h_out[i] = 0; 

    // CPU calculation for ground truth 
    for(int i = 0; i < dataSize; i++) 
     h_groundTruth[i] = h_in[i] * h_in[i]; 

    // Choose which GPU to run on, change this on a multi-GPU system. 
    checkCudaErrors(cudaSetDevice(0)); 

    int *d_in = 0; 
    int *d_out = 0; 
    int streamSize = dataSize/NUM_STREAMS; 
    size_t memSize = dataSize * sizeof(int); 
    size_t streamMemSize = memSize/NUM_STREAMS; 

    checkCudaErrors(cudaMalloc((void **)&d_in, memSize)); 
    checkCudaErrors(cudaMalloc((void **)&d_out, memSize)); 

    // registers host memory as page-locked (required for asynch cudaMemcpyAsync) 
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable)); 
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable)); 

    // set kernel launch config 
    dim3 nThreads = dim3(NUM_THREADS,1,1); 
    dim3 nBlocks = dim3(NUM_BLOCKS,1,1); 

    cout << "GPU Kernel Configuration : " << endl; 
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl; 
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl; 
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl; 

    // create cuda stream 
    cudaStream_t streams[NUM_STREAMS]; 
    for(int i = 0; i < NUM_STREAMS; i++) 
     checkCudaErrors(cudaStreamCreate(&streams[i])); 

    // create cuda event handles 
    cudaEvent_t start, stop; 
    checkCudaErrors(cudaEventCreate(&start)); 
    checkCudaErrors(cudaEventCreate(&stop)); 

    cudaEventRecord(start, 0); 

    for(int i = 0; i < NUM_STREAMS; i++) 
    { 
     int offset = i * streamSize; 

     cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice,  streams[i]); 

     dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x/2)); 

     kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2); 
     kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2); 

     cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]); 
    } 



    for(int i = 0; i < NUM_STREAMS; i++) 
     checkCudaErrors(cudaStreamSynchronize(streams[i])); 

    cudaEventRecord(stop, 0); 

    checkCudaErrors(cudaStreamSynchronize(0)); 

    checkCudaErrors(cudaDeviceSynchronize()); 

    float gpu_time = 0; 
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop)); 


    // release resources 
    checkCudaErrors(cudaEventDestroy(start)); 
    checkCudaErrors(cudaEventDestroy(stop)); 
    checkCudaErrors(cudaHostUnregister(h_in)); 
    checkCudaErrors(cudaHostUnregister(h_out)); 
    checkCudaErrors(cudaFree(d_in)); 
    checkCudaErrors(cudaFree(d_out)); 

    for(int i = 0; i < NUM_STREAMS; i++) 
     checkCudaErrors(cudaStreamDestroy(streams[i])); 

    cudaDeviceReset(); 

    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl; 


    // GPU output check 
    int sum = 0; 
    for(int i = 0; i < dataSize; i++)  
     sum += h_groundTruth[i] - h_out[i]; 

    cout << "Error between CPU and GPU: " << sum << endl; 

    delete[] h_in; 
    delete[] h_out; 
    delete[] h_groundTruth; 

    return 0; 
}