CUDA C - 使用clock（）和cudaEvent的CPU和GPU执行时间，是否正确？

我写了一个程序来添加两个二维数组来检查CPU和GPU的性能。我使用clock（）函数来测量CPU执行，并使用cudaEvent来测量GPU中的核心执行时间。由于我是Udacity下学习CUDA，我想他们的服务器上执行的程序，发现的结果，CUDA C - 使用clock（）和cudaEvent的CPU和GPU执行时间，是否正确？

Output: 
GPU: 0.001984 ms 
CPU : 30.000000 ms

现在到我真正的问题，我觉得这些结果是惊人的快的GPU和现在我对这些结果是否准确或者是否在程序中犯了错误持怀疑态度？

这是我的计划：

#include "stdio.h" 
#include<time.h> 
#define COLUMNS 900 
#define ROWS 900 
long a[ROWS][COLUMNS], b[ROWS][COLUMNS], c[ROWS][COLUMNS],d[ROWS][COLUMNS]; 
__global__ void add(long *a, long *b, long *c,long *d) 
{ 
int x = blockIdx.x; 
int y = blockIdx.y; 
int i = (COLUMNS*y) + x; 
c[i] = a[i] + b[i]; 
a[i]=d[i]; 
} 

int main() 
{ 
    long *dev_a, *dev_b, *dev_c,*dev_d; 
    float ms; 
    clock_t startc, end; 
    double cpu_time_used; 
    cudaEvent_t start,stop; 


cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int)); 
cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int)); 
cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int)); 
cudaMalloc((void **) &dev_d, ROWS*COLUMNS*sizeof(int)); 

startc = clock(); 
for (long y = 0; y < ROWS; y++) // Fill Arrays 
for (long x = 0; x < COLUMNS; x++) 
{ 
    a[y][x] = x; 
    b[y][x] = y; 
    d[y][x]=rand()%4; 
    c[y][x]=a[y][x]+b[y][x]; 
} 
end = clock(); 

cpu_time_used = ((double) (end - startc))/CLOCKS_PER_SEC; 
cpu_time_used*=1000; 


cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int), 
cudaMemcpyHostToDevice); 
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int), 
cudaMemcpyHostToDevice); 
cudaMemcpy(dev_d, d, ROWS*COLUMNS*sizeof(int), 
cudaMemcpyHostToDevice); 


cudaEventCreate(&start); 
cudaEventCreate(&stop); 
cudaEventRecord(start, 0); 
cudaEventRecord(stop, 0); 


add<<<dim3(1024,1024),dim3(128,128)>>>(dev_a, dev_b, dev_c,dev_d); 

cudaEventSynchronize(stop); 
cudaEventElapsedTime(&ms, start, stop); 
cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int),cudaMemcpyDeviceToHost); 
cudaEventDestroy(start); 
cudaEventDestroy(stop); 




printf("GPU: %f ms",ms); 
printf("\n CPU : %f ms",cpu_time_used); 

return 0; 
}

谢谢大家提供给我的查询答案，这里是我的代码和更新的结果所做的更改，

更新的代码：

#include "stdio.h" 
#include <time.h> 
#include <sys/time.h> 
#include <unistd.h> 
#define COLUMNS 500 
#define ROWS 500 
long a[ROWS][COLUMNS], b[ROWS][COLUMNS], c[ROWS][COLUMNS],d[ROWS][COLUMNS]; 



__global__ void add(long *a, long *b, long *c,long *d) 
{ 
int x = blockIdx.x; 
int y = blockIdx.y; 
int i = (COLUMNS*y) + x; 
c[i] = a[i] + b[i]; 
a[i]=d[i]; 
} 
int main() 
{ 
long *dev_a, *dev_b, *dev_c,*dev_d; 
struct timeval startc, end; 
float ms; 
long mtime, seconds, useconds; 
// clock_t startc, end; 
// double cpu_time_used; 
long ns; 
cudaEvent_t start,stop; 


cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int)); 
cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int)); 
cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int)); 
cudaMalloc((void **) &dev_d, ROWS*COLUMNS*sizeof(int)); 

gettimeofday(&startc, NULL); 
for (long y = 0; y < ROWS; y++) // Fill Arrays 
for (long x = 0; x < COLUMNS; x++) 
{ 
    a[y][x] = x; 
    b[y][x] = y; 
    d[y][x]=rand()%4; 
    c[y][x]=a[y][x]+b[y][x]; 
} 
    gettimeofday(&end, NULL); 

seconds = end.tv_sec - startc.tv_sec; 
useconds = end.tv_usec - startc.tv_usec; 
mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5; 


for (long y = ROWS-1; y < ROWS; y++) // Output Arrays 
{ 
for (long x = COLUMNS-1; x < COLUMNS; x++) 
{ 
    // printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]); 
    // printf("[%d][%d]=%d ",y,x,d[y][x]); 
} 
printf("\n"); 
} 



cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int), 
cudaMemcpyHostToDevice); 
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int), 
cudaMemcpyHostToDevice); 
cudaMemcpy(dev_d, d, ROWS*COLUMNS*sizeof(int), 
cudaMemcpyHostToDevice); 


cudaEventCreate(&start); 
cudaEventCreate(&stop); 
cudaEventRecord(start, 0); 



add<<<dim3(1024,1024),dim3(128,128)>>>(dev_a, dev_b, dev_c,dev_d); 

cudaThreadSynchronize(); 
cudaEventRecord(stop, 0); 
cudaEventSynchronize(stop); 
cudaEventElapsedTime(&ms, start, stop); 

cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int),cudaMemcpyDeviceToHost); 
cudaEventDestroy(start); 
cudaEventDestroy(stop); 



//cpu_time_used = ((double) (end - start))/CLOCKS_PER_SEC; 
printf("GPU: %f ms",ms); 
printf("\n CPU : %ld ms",mtime); 
for (long y = ROWS-1; y < ROWS; y++) // Output Arrays 
{ 
    for (long x = COLUMNS-1; x < COLUMNS; x++) 
    { 
     // printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]); 
     // printf("[%d][%d]=%d ",y,x,d[y][x]); 
    } 
    printf("\n"); 
} 
return 0; 
}

输出：

GPU: 0.011040 ms 
CPU : 9 ms

现在我可以安全地告诉它是否正确？

来源

2015-02-24 Avinash

为[定时CUDA运算]的答案（HTTP ：//stackoverflow.com/questions/7876624/timing-cuda-operations）可能是你的兴趣。 – JackOLantern 2015-02-24 21:06:18

你认为加速太快，CPU时间太长是正确的。使用此方法计时CPU C++ obtaining milliseconds time on Linux -- clock() doesn't seem to work properly也可能需要将cudaEventRecord(stop, 0);移至内核之后。

我看到5在您的内核读取和写入。服用5*4Bytes*500*500/(1024^3*0.009)你的记忆中有大约0.517 GB/s，这是可用的一小部分。我会说你的CPU版本需要一些工作。相比之下，你的GPU在5*4Bytes*500*500/(1024^3*0.01104e-3)约为421GB/s。我会说你不是那里。

那么，这么多的错误....

#include "stdio.h" 
#include <time.h> 
#include <sys/time.h> 
#include <unistd.h> 
#include <cuda.h> 
#include <cuda_runtime.h> 

#define COLUMNS 500 
#define ROWS 500 
long a[ROWS*COLUMNS], b[ROWS*COLUMNS], c[ROWS*COLUMNS],d[ROWS*COLUMNS]; 



__global__ void add(long *a, long *b, long *c,long *d) 
{ 
int x = blockIdx.x; 
int y = blockIdx.y; 
int i = (COLUMNS*y) + x; 
c[i] = a[i] + b[i]; 
a[i]=d[i]; 
} 
int main() 
{ 
long *dev_a, *dev_b, *dev_c,*dev_d; 
struct timeval startc, end; 
float ms; 
long seconds, useconds; 
double mtime; 
cudaEvent_t start,stop; 


for(int i=0; i<ROWS*COLUMNS; i++) 
    d[i]=rand()%4; 

for(int i=0; i<ROWS; i++){ 
    for(int j=0; j<COLUMNS; j++){ 
     a[i*COLUMNS+j]=j; 
     b[i*COLUMNS+j]=i; 
    } 
} 

cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int)); 
cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int)); 
cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int)); 
cudaMalloc((void **) &dev_d, ROWS*COLUMNS*sizeof(int)); 



gettimeofday(&startc, NULL); 
for (long i = 0; i < ROWS*COLUMNS; i++){ // Fill Arrays 
    c[i]=a[i]+b[i]; 
    a[i]=d[i]; 
} 
    gettimeofday(&end, NULL); 

seconds = end.tv_sec - startc.tv_sec; 
useconds = end.tv_usec - startc.tv_usec; 
mtime = useconds; 
mtime/=1000; 
mtime+=seconds*1000; 

for (long y = ROWS-1; y < ROWS; y++) // Output Arrays 
{ 
for (long x = COLUMNS-1; x < COLUMNS; x++) 
{ 
    // printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]); 
    // printf("[%d][%d]=%d ",y,x,d[y][x]); 
} 
printf("\n"); 
} 



cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int), 
cudaMemcpyHostToDevice); 
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int), 
cudaMemcpyHostToDevice); 
cudaMemcpy(dev_d, d, ROWS*COLUMNS*sizeof(int), 
cudaMemcpyHostToDevice); 


cudaEventCreate(&start); 
cudaEventCreate(&stop); 
cudaEventRecord(start, 0); 



add<<<dim3(1024,1024),dim3(128,128)>>>(dev_a, dev_b, dev_c,dev_d); 



cudaEventRecord(stop, 0); 
cudaEventSynchronize(stop); 
cudaEventElapsedTime(&ms, start, stop); 

cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int),cudaMemcpyDeviceToHost); 
cudaEventDestroy(start); 
cudaEventDestroy(stop); 

printf("GPUassert: %s\n", cudaGetErrorString(cudaGetLastError())); 

//cpu_time_used = ((double) (end - start))/CLOCKS_PER_SEC; 
double memXFers=5*4*COLUMNS*ROWS; 
memXFers/=1024*1024*1024; 


printf("GPU: %f ms bandwidth %g GB/s",ms, memXFers/(ms/1000.0)); 
printf("\n CPU : %g ms bandwidth %g GB/s",mtime, memXFers/(mtime/1000.0)); 
for (long y = ROWS-1; y < ROWS; y++) // Output Arrays 
{ 
    for (long x = COLUMNS-1; x < COLUMNS; x++) 
    { 
     // printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]); 
     // printf("[%d][%d]=%d ",y,x,d[y][x]); 
    } 
    printf("\n"); 
} 

return 0; 
}

我现在的结果的方式（显然不是正确的）...

GPU: 0.001792 ms bandwidth 2598.56 GB/s 
CPU : 0.567 ms bandwidth 8.21272 GB/s

来源

2015-02-24 18:42:59

您*必须*将'cudaEventRecord（stop，0）;'移到内核之后。 – ArchaeaSoftware 2015-02-25 08:35:44

首先感谢你的真棒回复，是的，我已经移动了'cudaEventRecord（stop，0）;'在执行完毕后。 – Avinash 2015-02-25 15:00:22

也感谢计时器功能，我试图在我的代码中实现它，并发现结果是promiising。 – Avinash 2015-02-25 15:37:12

CUDA C - 使用clock（）和cudaEvent的CPU和GPU执行时间，是否正确？

回答

相关问题