我写了一个程序来添加两个二维数组来检查CPU和GPU的性能。 我使用clock()函数来测量CPU执行,并使用cudaEvent来测量GPU中的核心执行时间。由于我是Udacity下学习CUDA,我想他们的服务器上执行的程序,发现的结果,CUDA C - 使用clock()和cudaEvent的CPU和GPU执行时间,是否正确?
Output:
GPU: 0.001984 ms
CPU : 30.000000 ms
现在到我真正的问题,我觉得这些结果是惊人的快的GPU和现在我对这些结果是否准确或者是否在程序中犯了错误持怀疑态度?
这是我的计划:
#include "stdio.h"
#include<time.h>
#define COLUMNS 900
#define ROWS 900
long a[ROWS][COLUMNS], b[ROWS][COLUMNS], c[ROWS][COLUMNS],d[ROWS][COLUMNS];
__global__ void add(long *a, long *b, long *c,long *d)
{
int x = blockIdx.x;
int y = blockIdx.y;
int i = (COLUMNS*y) + x;
c[i] = a[i] + b[i];
a[i]=d[i];
}
int main()
{
long *dev_a, *dev_b, *dev_c,*dev_d;
float ms;
clock_t startc, end;
double cpu_time_used;
cudaEvent_t start,stop;
cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_d, ROWS*COLUMNS*sizeof(int));
startc = clock();
for (long y = 0; y < ROWS; y++) // Fill Arrays
for (long x = 0; x < COLUMNS; x++)
{
a[y][x] = x;
b[y][x] = y;
d[y][x]=rand()%4;
c[y][x]=a[y][x]+b[y][x];
}
end = clock();
cpu_time_used = ((double) (end - startc))/CLOCKS_PER_SEC;
cpu_time_used*=1000;
cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_d, d, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
cudaEventRecord(stop, 0);
add<<<dim3(1024,1024),dim3(128,128)>>>(dev_a, dev_b, dev_c,dev_d);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);
cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int),cudaMemcpyDeviceToHost);
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("GPU: %f ms",ms);
printf("\n CPU : %f ms",cpu_time_used);
return 0;
}
谢谢大家提供给我的查询答案,这里是我的代码和更新的结果所做的更改,
更新的代码:
#include "stdio.h"
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#define COLUMNS 500
#define ROWS 500
long a[ROWS][COLUMNS], b[ROWS][COLUMNS], c[ROWS][COLUMNS],d[ROWS][COLUMNS];
__global__ void add(long *a, long *b, long *c,long *d)
{
int x = blockIdx.x;
int y = blockIdx.y;
int i = (COLUMNS*y) + x;
c[i] = a[i] + b[i];
a[i]=d[i];
}
int main()
{
long *dev_a, *dev_b, *dev_c,*dev_d;
struct timeval startc, end;
float ms;
long mtime, seconds, useconds;
// clock_t startc, end;
// double cpu_time_used;
long ns;
cudaEvent_t start,stop;
cudaMalloc((void **) &dev_a, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_b, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_c, ROWS*COLUMNS*sizeof(int));
cudaMalloc((void **) &dev_d, ROWS*COLUMNS*sizeof(int));
gettimeofday(&startc, NULL);
for (long y = 0; y < ROWS; y++) // Fill Arrays
for (long x = 0; x < COLUMNS; x++)
{
a[y][x] = x;
b[y][x] = y;
d[y][x]=rand()%4;
c[y][x]=a[y][x]+b[y][x];
}
gettimeofday(&end, NULL);
seconds = end.tv_sec - startc.tv_sec;
useconds = end.tv_usec - startc.tv_usec;
mtime = ((seconds) * 1000 + useconds/1000.0) + 0.5;
for (long y = ROWS-1; y < ROWS; y++) // Output Arrays
{
for (long x = COLUMNS-1; x < COLUMNS; x++)
{
// printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]);
// printf("[%d][%d]=%d ",y,x,d[y][x]);
}
printf("\n");
}
cudaMemcpy(dev_a, a, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(dev_d, d, ROWS*COLUMNS*sizeof(int),
cudaMemcpyHostToDevice);
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
add<<<dim3(1024,1024),dim3(128,128)>>>(dev_a, dev_b, dev_c,dev_d);
cudaThreadSynchronize();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&ms, start, stop);
cudaMemcpy(c, dev_c, ROWS*COLUMNS*sizeof(int),cudaMemcpyDeviceToHost);
cudaEventDestroy(start);
cudaEventDestroy(stop);
//cpu_time_used = ((double) (end - start))/CLOCKS_PER_SEC;
printf("GPU: %f ms",ms);
printf("\n CPU : %ld ms",mtime);
for (long y = ROWS-1; y < ROWS; y++) // Output Arrays
{
for (long x = COLUMNS-1; x < COLUMNS; x++)
{
// printf("\n[%ld][%ld]=%ld ",y,x,c[y][x]);
// printf("[%d][%d]=%d ",y,x,d[y][x]);
}
printf("\n");
}
return 0;
}
输出:
GPU: 0.011040 ms
CPU : 9 ms
现在我可以安全地告诉它是否正确?
为[定时CUDA运算]的答案(HTTP ://stackoverflow.com/questions/7876624/timing-cuda-operations)可能是你的兴趣。 – JackOLantern 2015-02-24 21:06:18