我正在GPU上实现Harris角点检测,并观察到CPU性能的异常行为:调用GPU内核之后,CPU端代码的性能下降了。
以下是我的Main.cpp文件。如果我把对GPU函数的调用注释掉(该函数会依次调用GPU上的各个核函数),即注释掉"Harris_Algo(a,d_a,d_g,dx_My,dy_My,dxdy_My,suppressed,corner_response,Res,Height,length,SIZE);"这一行再运行,
那么读取下一帧并转换为灰度图的两个函数调用(cap.read(Masrc) 和 cvtColor(Masrc,src,CV_BGR2GRAY))平均每帧分别耗时0.003秒和0.004秒。
令人惊讶的是,当我取消注释、恢复对GPU函数"Harris_Algo"的调用之后,同样的两个CPU函数(cap.read(Masrc) 和 cvtColor(Masrc,src,CV_BGR2GRAY))平均每帧分别需要0.009秒和0.008秒。
由于在我的应用程序中时间非常关键,这种变化抵消了我们使用GPU获得的优势。这两个函数调用与GPU无关,但只要我调用了GPU函数(内核),它们就需要更多时间。
我的猜测是:调用GPU函数(内核)增加了CPU的开销,使CPU利用率上升、性能下降。但这种变化幅度太大了。有没有其他更合适的方法来做这件事?
任何帮助表示赞赏。
我正在使用Jetson TK1 GPU板。
Main.cpp的文件
#include <iostream>
#include <time.h>
#include <fstream>
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/highgui/highgui.hpp"
using namespace std;
using namespace cv;
// Defined in Kernal.cu: calls cudaFree on each of the seven device buffers.
void Cuda_Free(unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response);
// Defined in Kernal.cu: copies the host frame `a` into d_a, launches the four
// kernels in sequence and copies `suppressed` back into the host buffer `Res`.
void Harris_Algo(unsigned char *a,unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response,int *Res, int Height,int length,int SIZE);
// Defined in Kernal.cu: cudaMalloc's every device buffer with SIZE elements and
// returns the device pointers through the reference parameters.
void MemAlloc(unsigned char *&d_a,unsigned char *&d_g,int *&dx_My,int *&dy_My,int *&dxdy_My,int *&suppressed,int *&corner_response,int SIZE);
// Entry point. Opens the video named by argv[1], then for every frame:
// reads it, converts it to grayscale, runs Harris_Algo on the grayscale data
// and prints the time spent in each stage.
//
// Device buffers and the host result buffer are allocated once, on the first
// frame, and released after the loop.
//
// Returns -1 when no file name was given or the file cannot be opened.
//
// NOTE(review): all timings use clock(), which reports processor time consumed
// by the process rather than elapsed wall-clock time. CPU time spent by other
// threads of this process may therefore be included in the per-stage numbers.
int main(int argc, char** argv)
{
    // argv[1] is only valid when an argument was actually supplied.
    if (argc < 2)
    {
        std::cout << "!!! Usage: <program> <video file>" << std::endl;
        return -1;
    }

    cv::VideoCapture cap(argv[1]);
    if (!cap.isOpened())
    {
        std::cout << "!!! Failed to open file: " << argv[1] << std::endl;
        return -1;
    }

    double time_spent;
    clock_t begin3, end3, begin4;
    bool start = false;          // becomes true once the buffers are allocated
    Mat src;                     // grayscale version of the current frame
    Mat Masrc;                   // current frame as read from the capture

    // Initialised to NULL so the cleanup below is well defined even when the
    // video yields no frame and MemAlloc is never reached.
    unsigned char *a = NULL, *d_a = NULL, *d_g = NULL;
    int *dx_My = NULL, *Res = NULL;
    int *dy_My = NULL;
    int *dxdy_My = NULL;
    int *suppressed = NULL;
    int *corner_response = NULL;
    int length = 0;
    int Height = 0;
    int SIZE = 0;

    for (;;)
    {
        begin4 = clock();

        // Stage 1: grab the next frame; stop at end of stream.
        begin3 = clock();
        if (!cap.read(Masrc))
            break;
        end3 = clock();
        time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
        cout << "Read Frame : " << time_spent << endl;

        // Stage 2: colour to grayscale.
        begin3 = clock();
        cvtColor(Masrc, src, CV_BGR2GRAY);
        end3 = clock();
        time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
        cout << "Gray Convert : " << time_spent << endl;

        // Stage 3: one-time allocation sized from the first frame.
        begin3 = clock();
        if (start == false)
        {
            length = src.cols;
            Height = src.rows;
            cout << "Width" << length << endl;
            cout << "Height" << Height << endl;
            SIZE = ((length) * (Height));
            Res = new int[SIZE];
            MemAlloc(d_a, d_g, dx_My, dy_My, dxdy_My, suppressed, corner_response, SIZE);
            start = true;
        }
        a = src.data;
        end3 = clock();
        time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
        cout << "Initial Processsing Time : " << time_spent << endl;

        // Stage 4: GPU processing (prints its own timing).
        Harris_Algo(a, d_a, d_g, dx_My, dy_My, dxdy_My, suppressed, corner_response, Res, Height, length, SIZE);

        // Stage 5: display (currently disabled).
        begin3 = clock();
        // imshow("Harris_OUT", Masrc);
        // char key = cvWaitKey(1);
        // if (key == 27) // ESC
        // break;
        end3 = clock();
        time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
        cout << "Time After Displaying image on Output : " << time_spent << endl;
        time_spent = (double)(end3 - begin4) / CLOCKS_PER_SEC;
        cout << "Overall Time of entire program exec : " << time_spent << endl;
        cout << "-----------------------------------------------------------------------------" << endl;
    }

    // Only touch the CUDA runtime if buffers were actually allocated.
    if (start)
    {
        Cuda_Free(d_a, d_g, dx_My, dy_My, dxdy_My, suppressed, corner_response);
    }
    // Res came from new[], so it must be released with delete[].
    delete[] Res;
    cvWaitKey(0);
    return 0;
}
Kernal.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <time.h>
#include <fstream>
using namespace std;
// Kernel launched by Harris_Algo with SIZE/512+1 blocks of 512 threads,
// receiving (corner_response, dx_My, dy_My, dxdy_My, Height, length, SIZE).
// NOTE(review): the body is elided in this listing; presumably it writes a
// Harris corner response into corner_response from the three gradient
// buffers -- confirm against the full source.
__global__ void Harris_Mat(int *corner_response,int* dx_My,int* dy_My,int* dxdy_My,int rows, int cols,int Size)
{
/*...*/
}
// Kernel launched by Harris_Algo with SIZE/512+1 blocks of 512 threads,
// receiving (corner_response, suppressed, Height, length, SIZE); the host
// copies `output` (suppressed) back as the final result.
// NOTE(review): the body is elided in this listing; presumably it performs
// neighbourhood (non-maximum) suppression of `input` into `output` -- confirm
// against the full source.
__global__ void Supress_Neighbour(int *input,int *output, int rows, int cols, int Size)
{
/* ... */
}
// Smoothing kernel over a flat 8-bit image buffer.
//
// D       input pixels, size_m elements
// M       output pixels, size_m elements
// Length  number of pixels per row (i % Length is used as the column index)
// size_m  total number of pixels
//
// Each thread handles the element at its flat global index
// blockIdx.x * blockDim.x + threadIdx.x, so a 1-D launch covering size_m
// threads is expected. Elements on the first/last row or first/last column
// get M[i] = 0.
//
// NOTE(review): element 0 is excluded by the `i > 0` test, so M[0] is never
// written by this kernel -- confirm whether that is intended.
// NOTE(review): the neighbourhood sum is computed but never stored to M in
// this listing; the remainder of the kernel appears to have been cut from the
// excerpt. The first two column terms are also identical (D[i], not D[i-1]).
__global__ void VectorGauss(unsigned char *D, unsigned char *M,int Length, int size_m)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Logical && instead of the bitwise & used previously (same result for
    // these boolean operands, but states the intent and short-circuits).
    if (i > 0 && i < size_m)
    {
        if ((i % Length) == 0 || (i % Length) == (Length - 1) || (i < Length) || (i > (size_m - Length)))
        {
            M[i] = 0;
        }

        // The farthest element read below is D[2*Length + i + 1]; skip the sum
        // when that index would fall outside the size_m-element buffer.
        if (i < size_m - 2 * Length - 1)
        {
            float Val = ((D[i] + D[Length + i] + D[2 * Length + i])
                       + (D[i] + D[Length + i] + D[2 * Length + i])
                       + (D[i + 1] + D[i + Length + 1] + D[2 * Length + i + 1]));
            (void)Val;  // result is not consumed in this excerpt (see note above)
        }
    }
}
// Kernel launched by Harris_Algo with SIZE/512+1 blocks of 512 threads,
// receiving (d_g, dx_My, dy_My, dxdy_My, length, SIZE), i.e. the output of
// VectorGauss as D and the three int buffers as dx/dy/dxdy.
// NOTE(review): the body is elided in this listing; presumably it fills the
// gradient-product buffers from D -- confirm against the full source.
__global__ void VectorAdd(unsigned char *D,int* dx,int* dy,int* dxdy,int Length, int size_m)
{
/* ... */
}
// Allocates `count` elements of T on the device and stores the pointer in
// `ptr`. On failure the error is reported on stderr and `ptr` is left NULL,
// so a later cudaFree(ptr) is a harmless no-op. Returns true on success.
template <typename T>
static bool allocDeviceBuffer(T *&ptr, int count, const char *label)
{
    ptr = NULL;
    const cudaError_t status = cudaMalloc((void **)&ptr, (size_t)count * sizeof(T));
    if (status != cudaSuccess)
    {
        cerr << "MemAlloc: cudaMalloc failed for " << label << ": "
             << cudaGetErrorString(status) << endl;
        ptr = NULL;
        return false;
    }
    return true;
}

// Allocates all device buffers used by Harris_Algo, each holding SIZE
// elements (unsigned char for d_a/d_g, int for the others). The device
// pointers are returned through the reference parameters.
//
// Previously every cudaMalloc return code was ignored; a failed allocation is
// now reported on stderr and leaves the corresponding pointer NULL.
__host__ void MemAlloc(unsigned char *&d_a,unsigned char *&d_g,int *&dx_My,int *&dy_My,int *&dxdy_My,int *&suppressed,int *&corner_response,int SIZE)
{
    allocDeviceBuffer(d_a, SIZE, "d_a");
    allocDeviceBuffer(d_g, SIZE, "d_g");
    allocDeviceBuffer(dx_My, SIZE, "dx_My");
    allocDeviceBuffer(dy_My, SIZE, "dy_My");
    allocDeviceBuffer(dxdy_My, SIZE, "dxdy_My");
    allocDeviceBuffer(suppressed, SIZE, "suppressed");
    allocDeviceBuffer(corner_response, SIZE, "corner_response");
}
// Reports a failed CUDA runtime call made by Harris_Algo on stderr.
// Returns true when `status` is cudaSuccess.
static bool harrisCudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    cerr << "Harris_Algo: " << what << " failed: " << cudaGetErrorString(status) << endl;
    return false;
}

// Runs the per-frame GPU pipeline:
//   1. copy the SIZE-byte host image `a` into d_a,
//   2. VectorGauss      d_a -> d_g,
//   3. VectorAdd        d_g -> dx_My, dy_My, dxdy_My,
//   4. Harris_Mat       dx_My, dy_My, dxdy_My -> corner_response,
//   5. Supress_Neighbour corner_response -> suppressed,
//   6. copy SIZE ints from `suppressed` back into the host buffer `Res`.
// All work is issued on the default stream; both copies are blocking
// cudaMemcpy calls. Every kernel is launched with SIZE/512+1 blocks of 512
// threads.
//
// Each runtime call and kernel launch is now checked; on the first failure the
// error is printed to stderr and the function returns without printing the
// timing line.
//
// NOTE(review): the timing uses clock(), which reports processor time used by
// the process, not elapsed wall-clock time.
__host__ void Harris_Algo(unsigned char *a,unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response,int *Res, int Height,int length,int SIZE)
{
    const int threadsPerBlock = 512;
    const int blocks = SIZE / threadsPerBlock + 1;   // same grid size as before

    double time_spent;
    clock_t begin3, end3;
    begin3 = clock();

    if (!harrisCudaOk(cudaMemcpy(d_a, a, SIZE * sizeof(unsigned char), cudaMemcpyHostToDevice),
                      "host-to-device copy"))
    {
        return;
    }

    // A kernel launch returns no status itself; launch-configuration errors
    // are picked up with cudaGetLastError() right after each launch.
    VectorGauss<<<blocks, threadsPerBlock>>>(d_a, d_g, length, SIZE);
    if (!harrisCudaOk(cudaGetLastError(), "VectorGauss launch"))
    {
        return;
    }

    VectorAdd<<<blocks, threadsPerBlock>>>(d_g, dx_My, dy_My, dxdy_My, length, SIZE);
    if (!harrisCudaOk(cudaGetLastError(), "VectorAdd launch"))
    {
        return;
    }

    Harris_Mat<<<blocks, threadsPerBlock>>>(corner_response, dx_My, dy_My, dxdy_My, Height, length, SIZE);
    if (!harrisCudaOk(cudaGetLastError(), "Harris_Mat launch"))
    {
        return;
    }

    Supress_Neighbour<<<blocks, threadsPerBlock>>>(corner_response, suppressed, Height, length, SIZE);
    if (!harrisCudaOk(cudaGetLastError(), "Supress_Neighbour launch"))
    {
        return;
    }

    // Blocking copy: waits for the kernels above and also surfaces any error
    // raised while they executed.
    if (!harrisCudaOk(cudaMemcpy(Res, suppressed, SIZE * sizeof(int), cudaMemcpyDeviceToHost),
                      "device-to-host copy"))
    {
        return;
    }

    end3 = clock();
    time_spent = (double)(end3 - begin3) / CLOCKS_PER_SEC;
    cout << "Processsing Time of Algorithm : " << time_spent << endl;
}
// Releases the seven device buffers obtained from MemAlloc.
//
// The buffers are freed in the same order as before. The cudaFree return
// codes were previously ignored; a failure is now reported on stderr and the
// remaining buffers are still freed.
__host__ void Cuda_Free(unsigned char *d_a,unsigned char *d_g,int *dx_My,int *dy_My,int *dxdy_My,int *suppressed,int *corner_response)
{
    void *buffers[] = { d_a, d_g, dx_My, dy_My, dxdy_My, corner_response, suppressed };
    const char *labels[] = { "d_a", "d_g", "dx_My", "dy_My", "dxdy_My", "corner_response", "suppressed" };
    const int bufferCount = sizeof(buffers) / sizeof(buffers[0]);

    for (int k = 0; k < bufferCount; ++k)
    {
        const cudaError_t status = cudaFree(buffers[k]);
        if (status != cudaSuccess)
        {
            cerr << "Cuda_Free: cudaFree failed for " << labels[k] << ": "
                 << cudaGetErrorString(status) << endl;
        }
    }
}
我试过只用NVCC编译,也试过同时使用NVCC和g++两者编译,但结果相同。
使用以下命令编译:
g++-4.8 -c Main.cpp
nvcc -c Kernal.cu
g++-4.8 -o Output Main.o Kernal.o -L/usr/local/cuda/lib -lcudart -lcuda `pkg-config opencv --cflags --libs`
我认为是你的CUDA内存分配造成了开销。我这样认为,是因为你只分配一次内存,并且只释放一次。为什么不尝试对每一帧都执行cudaMalloc和cudaFree(放在函数Harris_Algo中)?因为算法并不需要前一帧的信息。 –
是的,CUDA内存分配是很昂贵的,这是我只做一次处理整个视频帧的原因。如果我为每一帧做它,那么我的开销将是巨大的。 我也试过,但总体时间会更多,如果我们为每一帧分配内存。 –
你的整个时间测量方法是错误的。请阅读clock()的手册页。你*不能*像现在这样用clock()来计时。CPU时间的秒数和实际经过的秒数不是一回事 – talonmies