I am trying to achieve "3-way overlap" using 3 streams, as shown in the examples of the CUDA streams and concurrency webinar, but I could not achieve it; it looks like a false-dependency issue on the Fermi architecture.
I have a GeForce GT 550M (Fermi architecture with a single copy engine) and I am using Windows 7 (64 bit).
Here is the code I have written:
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions
#include <stdio.h>
using namespace std;
#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
__global__ void kernel(const int *in, int *out, int dataSize)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int end = dataSize;

    // grid-stride loop: square every element of this chunk of the input
    for (int i = start; i < end; i += blockDim.x * gridDim.x)
    {
        out[i] = in[i] * in[i];
    }
}
int main()
{
    const int dataSize = DATA_SIZE;

    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];
    int *h_groundTruth = new int[dataSize];

    // Input population
    for(int i = 0; i < dataSize; i++)
        h_in[i] = 5;

    for(int i = 0; i < dataSize; i++)
        h_out[i] = 0;

    // CPU calculation for ground truth
    for(int i = 0; i < dataSize; i++)
        h_groundTruth[i] = h_in[i] * h_in[i];

    // Choose which GPU to run on; change this on a multi-GPU system.
    checkCudaErrors(cudaSetDevice(0));

    int *d_in = 0;
    int *d_out = 0;

    int streamSize = dataSize / NUM_STREAMS;
    size_t memSize = dataSize * sizeof(int);
    size_t streamMemSize = memSize / NUM_STREAMS;

    checkCudaErrors(cudaMalloc((void **)&d_in, memSize));
    checkCudaErrors(cudaMalloc((void **)&d_out, memSize));

    // Register host memory as page-locked (required for async cudaMemcpyAsync)
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));

    // Set kernel launch configuration
    dim3 nThreads = dim3(NUM_THREADS, 1, 1);
    dim3 nBlocks = dim3(NUM_BLOCKS, 1, 1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // Create CUDA streams
    cudaStream_t streams[NUM_STREAMS];
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamCreate(&streams[i]));

    // Create CUDA event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    cudaEventRecord(start, 0);

    // Overlapped execution using version 2: issue breadth-first
    // (all H2D copies, then all kernels, then all D2H copies)
    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
    }
    //cudaMemcpy(d_in, h_in, memSize, cudaMemcpyHostToDevice);

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

        //kernel<<<nBlocks, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize / 2);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize / 2], &d_out[offset + streamSize / 2], streamSize / 2);
    }

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
    }

    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamSynchronize(streams[i]));

    cudaEventRecord(stop, 0);
    checkCudaErrors(cudaStreamSynchronize(0));
    checkCudaErrors(cudaDeviceSynchronize());

    float gpu_time = 0;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    // Release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaHostUnregister(h_in));
    checkCudaErrors(cudaHostUnregister(h_out));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));

    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamDestroy(streams[i]));

    cudaDeviceReset();

    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;

    // GPU output check
    int sum = 0;
    for(int i = 0; i < dataSize; i++)
        sum += h_groundTruth[i] - h_out[i];

    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_groundTruth;

    return 0;
}
Profiling with Nsight, I get this result:

This seems correct, but why does the D2H transfer of stream #1 start only when the last kernel of stream #2 is launched, and not before? I also tried to achieve the same "3-way overlap" with 8 streams (just by changing NUM_STREAMS to 8), and here is the result:

Interestingly, when I use 8 streams, the overlap between computation and memory transfers seems to be much better.

What is the reason for this problem? Is it due to the WDDM driver, or is there something wrong with my program?
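In case it is relevant: one experiment I have seen suggested for ruling out WDDM command batching (this is my assumption; the timeline itself does not confirm it) is to force the driver to submit its queued work by calling cudaStreamQuery after each asynchronous issue, e.g. in the H2D loop:

for (int i = 0; i < NUM_STREAMS; i++)
{
    int offset = i * streamSize;
    cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
    cudaStreamQuery(streams[i]); // returns immediately; reportedly flushes the WDDM command buffer
}

I have not yet verified whether this changes the timelines above.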
I think it is a false dependency issue; see also the [Hyper-Q Example](http://docs.nvidia.com/cuda/samples/6_Advanced/simpleHyperQ/doc/HyperQ.pdf). – JackOLantern
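To illustrate the false-dependency idea from that document (a sketch reusing the names from the code above, not the loop actually used in the question, which issues breadth-first): on Fermi, all issued commands feed a single hardware work queue, so a depth-first issue order like this makes each stream's H2D copy queue up behind the previous stream's work:

for (int i = 0; i < NUM_STREAMS; i++)
{
    int offset = i * streamSize;
    // depth-first: H2D, kernel, and D2H of stream i are all issued
    // before anything from stream i+1, so they sit adjacently in the
    // one hardware queue and serialize across streams
    cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
    kernel<<<nBlocks, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize);
    cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
}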
Can you be explicit about what you mean by 3-way overlap? Do you want to see a D2H memcpy, a compute kernel, and an H2D memcpy all executing at the same time? I believe that is the usual meaning of 3-way overlap. – Robert Crovella
@RobertCrovella If I comment out the second kernel call, my timeline (running on a GeForce 540M) shows that the D2H of the first stream begins exactly after the last H2D of the third stream (no idle time in the memory transfers). I think the OP would like to observe the same behavior. However, from the timeline above, the D2H of the first stream begins only after the second kernel call of the second stream has finished, rather than immediately after the H2D of the third stream. That is my understanding of his post. – JackOLantern
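For reference, "commenting out the second kernel call" means falling back to the single launch that is already present (commented out) in the question's kernel loop, roughly:

for (int i = 0; i < NUM_STREAMS; i++)
{
    int offset = i * streamSize;
    // one kernel launch per stream, covering the stream's whole chunk
    kernel<<<nBlocks, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize);
}

With only one kernel between consecutive H2D copies in the issue order, this is consistent with the D2H of stream #1 starting earlier, as described above.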