您好,我必须使用cuFFT对两个信号[pulseMatrixRow[i]和pulse[i]]做卷积。我的代码是 int main(int argc, char** argv) 如何在CUDA中执行零填充以进行卷积
{
    /*
     * FFT-based convolution of pulseMatrixRow with pulse using cuFFT.
     *
     * Steps: build zero-padded complex host buffers, copy them to the
     * device, forward-transform both, multiply the spectra point-wise,
     * inverse-transform, and write the result to "OutputData1.txt".
     *
     * Zero padding happens on the host, BEFORE the host-to-device copy:
     * every element past SIGNAL_SIZE (resp. FILTER_signal_SIZE) is set to
     * zero so the FFT never sees uninitialized stack contents.
     *
     * NOTE: a product of FFTs yields a circular convolution. For a linear
     * convolution without wrap-around, NX must be at least
     * SIGNAL_SIZE + FILTER_signal_SIZE - 1 (counting only the non-zero
     * part of the filter); this is not enforced here.
     *
     * Returns 0 on success and 1 if any CUDA/cuFFT/file operation fails.
     */
    FILE *fileWritePtr = NULL;
    cufftComplex h_signal[NX*BATCH];
    cufftComplex h_filter_signal[NX*BATCH];
    cufftComplex hf_signal[NX*BATCH];

    const size_t totalCount = (size_t)(NX) * (size_t)(BATCH);
    const size_t totalBytes = sizeof(cufftComplex) * totalCount;
    bool planCreated = false;
    int exitCode = 1;

    // Load the signal and zero-pad the tail. The input is real-valued, so
    // the imaginary part is zero everywhere.
    for (size_t k = 0; k < totalCount; ++k)
    {
        const bool hasSample = k < (size_t)(SIGNAL_SIZE);
        h_signal[k].x = hasSample ? pulseMatrixRow[k] : 0.0f;
        h_signal[k].y = 0.0f;
    }

    // Load the filter the same way: real samples first, zeros afterwards.
    for (size_t k = 0; k < totalCount; ++k)
    {
        const bool hasSample = k < (size_t)(FILTER_signal_SIZE);
        h_filter_signal[k].x = hasSample ? pulse[k] : 0.0f;
        h_filter_signal[k].y = 0.0f;
    }

    // Start from null so the cleanup below is safe even if allocation fails
    // (cudaFree on a null pointer is a no-op).
    d_signal = NULL;
    d_filter_signal = NULL;

    do
    {
        // Device memory allocation.
        if (cudaMalloc((void**)&d_signal, totalBytes) != cudaSuccess ||
            cudaMalloc((void**)&d_filter_signal, totalBytes) != cudaSuccess)
        {
            fprintf(stderr, "cudaMalloc failed\n");
            break;
        }

        // Transfer the padded buffers to device memory.
        if (cudaMemcpy(d_signal, h_signal, totalBytes, cudaMemcpyHostToDevice) != cudaSuccess ||
            cudaMemcpy(d_filter_signal, h_filter_signal, totalBytes, cudaMemcpyHostToDevice) != cudaSuccess)
        {
            fprintf(stderr, "cudaMemcpy (host to device) failed\n");
            break;
        }

        // CUFFT plan: BATCH transforms of NX points each.
        if (cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH) != CUFFT_SUCCESS)
        {
            fprintf(stderr, "cufftPlan1d failed\n");
            break;
        }
        planCreated = true;

        // Transform signal and filter in place.
        printf("Transforming signal cufftExecC2C\n");
        if (cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_FORWARD) != CUFFT_SUCCESS)
        {
            fprintf(stderr, "forward FFT of signal failed\n");
            break;
        }
        printf("Transforming filter_signal cufftExecC2C\n");
        if (cufftExecC2C(plan, (cufftComplex *)d_filter_signal, (cufftComplex *)d_filter_signal, CUFFT_FORWARD) != CUFFT_SUCCESS)
        {
            fprintf(stderr, "forward FFT of filter failed\n");
            break;
        }

        // Multiply the coefficients together. cuFFT transforms are
        // unnormalized, so a forward+inverse pair of NX-point transforms
        // scales the data by NX; the compensation is 1/NX regardless of
        // BATCH (the previous "1.0f/NX*BATCH" evaluated to BATCH/NX).
        // NOTE(review): only NX elements are passed to the kernel, as
        // before; confirm this is intended when BATCH > 1.
        ComplexPointwiseMulAndScale<<<blocksPerGrid, threadsPerBlock>>>(d_signal, d_filter_signal, NX, 1.0f / NX);
        if (cudaGetLastError() != cudaSuccess)
        {
            fprintf(stderr, "ComplexPointwiseMulAndScale launch failed\n");
            break;
        }

        // Transform signal back.
        printf("Transforming signal back cufftExecC2C\n");
        if (cufftExecC2C(plan, (cufftComplex *)d_signal, (cufftComplex *)d_signal, CUFFT_INVERSE) != CUFFT_SUCCESS)
        {
            fprintf(stderr, "inverse FFT failed\n");
            break;
        }

        // Transfer results from GPU memory; a failure in the preceding
        // asynchronous work is reported by this blocking copy.
        if (cudaMemcpy(hf_signal, d_signal, totalBytes, cudaMemcpyDeviceToHost) != cudaSuccess)
        {
            fprintf(stderr, "cudaMemcpy (device to host) failed\n");
            break;
        }

        // Write the first NX output samples as "index real imag".
        fileWritePtr = fopen("OutputData1.txt","w+");
        if (fileWritePtr == NULL)
        {
            fprintf(stderr, "could not open OutputData1.txt for writing\n");
            break;
        }
        for (int k = 0; k < NX; k++)
        {
            fprintf(fileWritePtr,"%d %f %f\n", k, hf_signal[k].x, hf_signal[k].y);
        }
        fclose(fileWritePtr);

        exitCode = 0;
    } while (0);

    // Destroy CUFFT context (only if it was actually created).
    if (planCreated)
    {
        cufftDestroy(plan);
    }

    // Cleanup device memory. The host buffers are automatic arrays and must
    // not be passed to free().
    cudaFree(d_signal);
    cudaFree(d_filter_signal);

    return exitCode;
}
我的pulseMatrix由MATLAB生成,
代码如下:
% One 500-sample pulse as a row vector: 50 ones followed by 450 zeros.
pulse = [ones(1, 50), zeros(1, 500 - 50)];
% Place the pulse as a column and tile it 10 times side by side (500-by-10).
pulseMatrix = repmat(pulse.', 1, 10);
% Stack the columns into a single 5000-by-1 column vector.
pulseMatrixRow = reshape(pulseMatrix, [], 1);
但我每次只处理pulseMatrixRow中的1000个样本,其余样本也按每组1000个依次处理。由于我的FFT长度是1024,请告诉我应该如何、以及在哪个阶段在输入信号的末尾填充零;我的滤波器信号同样需要填充,它的定义很简单,如下所示: pulse = [ones(1,50) zeros(1,500-50)];
这似乎是一个信号处理/ FFT/DFT理论的问题,而不是编程/ CUDA库的问题。我认为你可以在做fft前将'd_signal'的尾部24个元素和'd_filter_signal'的尾部524个元素设置为零。 – kangshiyin
是的,先生,但我是编程新手,请告诉我如何将这些元素设置为零,以及应该在哪个阶段进行……  – Ani