
I think I'm misunderstanding something very fundamental here. The code below tries to compute the convolution of two signals using the FFT method. The problem I'm running into is that I sometimes get wrong/strange output. When I try running each step of my convolution function (at line 104) explicitly, it works. And now, when I run the code normally through the convolution function, it works too! Once I get the output I expect, I can't recreate the setup that gave me the wrong answer. I'm at a loss as to how this can happen. (The strange/wrong CUDA output is shown below.)
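
(For reference, the intended identity is the standard FFT convolution: zero-pad both signals to at least s1 + s2 - 1 samples, then conv(a, b) = IFFT(FFT(a) · FFT(b)), where · denotes pointwise complex multiplication.)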

EDIT - the code block below includes the data.

#include <stdio.h> 
#include <stdlib.h> 
#include <math.h> 

#include <cuda_runtime.h> 
#include <cufft.h> 
#include <cuda.h> 

typedef enum signaltype {REAL, COMPLEX} signal; 

typedef float2 Complex; 

void
printData(Complex *a, int size, char *msg) {

    // Note: msg == "" would compare pointers, not contents, so test
    // the first character to detect an empty label.
    if (msg[0] == '\0') printf("\n");
    else printf("%s\n", msg);

    for (int i = 0; i < size; i++)
        printf("%f %f\n", a[i].x, a[i].y);
}

void
normData(Complex *a, int size, float norm) {

    for (int i = 0; i < size; i++) {
        a[i].x /= norm;
        a[i].y /= norm;
    }
}

// Fills the signal with random real-valued samples when flag == REAL.
void
randomFill(Complex *h_signal, int size, int flag) {

    // Real signal.
    if (flag == REAL) {
        for (int i = 0; i < size; i++) {
            h_signal[i].x = rand() / (float) RAND_MAX;
            h_signal[i].y = 0;
        }
    }
}

// FFT a signal that's on the _DEVICE_.
void
signalFFT(Complex *d_signal, int signal_size) {

    // Handle type used to store and execute CUFFT plans;
    // creating the plan allocates the resources the FFT needs.
    cufftHandle plan;
    if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS) {
        printf("Failed to plan FFT\n");
        exit(0);
    }

    // Execute the plan.
    if (cufftExecC2C(plan, (cufftComplex *) d_signal, (cufftComplex *) d_signal, CUFFT_FORWARD) != CUFFT_SUCCESS) {
        printf("Failed Executing FFT\n");
        exit(0);
    }

    // Release the plan's resources.
    cufftDestroy(plan);
}


// Reverse of the signalFFT(.) function.
void
signalIFFT(Complex *d_signal, int signal_size) {

    cufftHandle plan;
    if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS) {
        printf("Failed to plan IFFT\n");
        exit(0);
    }

    if (cufftExecC2C(plan, (cufftComplex *) d_signal, (cufftComplex *) d_signal, CUFFT_INVERSE) != CUFFT_SUCCESS) {
        printf("Failed Executing IFFT\n");
        exit(0);
    }

    cufftDestroy(plan);
}


// Pointwise Multiplication Kernel.
__global__ void
pwProd(Complex *signal1, int size1, Complex *signal2, int size2) {

    int threadsPerBlock, blockId, globalIdx;

    // Flatten the 2D grid of 2D blocks into a single linear index.
    threadsPerBlock = blockDim.x * blockDim.y;
    blockId = blockIdx.x + (blockIdx.y * gridDim.x);
    globalIdx = (blockId * threadsPerBlock) + threadIdx.x + (threadIdx.y * blockDim.x);

    if (globalIdx <= size1) {

        signal1[globalIdx].x = (signal1[globalIdx].x * signal2[globalIdx].x - signal1[globalIdx].y * signal2[globalIdx].y);
        signal1[globalIdx].y = (signal1[globalIdx].x * signal2[globalIdx].y + signal1[globalIdx].y * signal2[globalIdx].x);
    }

}

void 
cudaConvolution(Complex *d_signal1, int size1, Complex *d_signal2, 
       int size2, dim3 blockSize, dim3 gridSize) { 

    signalFFT(d_signal1, size1); 
    signalFFT(d_signal2, size2); 

    pwProd<<<gridSize, blockSize>>>(d_signal1, size1, d_signal2, size2); 

    //signalIFFT(d_signal1, size1); 
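    // NB: cuFFT's CUFFT_INVERSE transform is unnormalized; once the
    // IFFT above is re-enabled, the result must be divided by the
    // transform length (see the commented-out normData call in main).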

} 


int allocateAndPad(Complex **a, int s1, Complex **b, int s2) {

    int oldsize, newsize, i;

    // Linear convolution needs at least s1 + s2 - 1 samples; round up
    // to the next power of two ((x & (x - 1)) == 0 only for powers of two).
    newsize = s1 + s2 - 1;

    while (!((newsize != 0) && !(newsize & (newsize - 1)))) {
        newsize++;
    }

    oldsize = s1;
    *a = (Complex *) malloc(sizeof(Complex) * newsize);
    for (i = oldsize; i < newsize; i++) {
        (*a)[i].x = 0;
        (*a)[i].y = 0;
    }

    oldsize = s2;
    *b = (Complex *) malloc(sizeof(Complex) * newsize);
    for (i = oldsize; i < newsize; i++) {
        (*b)[i].x = 0;
        (*b)[i].y = 0;
    }

    return newsize;
}

int main() 
{ 

    Complex *h1, *h2, *d1, *d2; 

    int s1, s2, newsize, i, dim; 

    int deviceCount; 
    cudaError_t e = cudaGetDeviceCount(&deviceCount); 
    if (e != cudaSuccess) { 
    return -1; 
    } 

    dim = 1; 

    s1 = 16; 
    s2 = 16; 

    for (i = 0; i < dim; i++) { 

     newsize = allocateAndPad(&h1, s1, &h2, s2); 

     /*h1 = (Complex *) malloc(sizeof(Complex) * s1); 
     h2 = (Complex *) malloc(sizeof(Complex) * s2); 
     newsize = 16;*/ 

     randomFill(h1, s1, REAL); 
     randomFill(h2, s2, REAL); 

     // Kernel Block and Grid Size. 
     const dim3 blockSize(16, 16, 1); 
     const dim3 gridSize(newsize/16 + 1, newsize/16 + 1, 1); 

     printData(h1, newsize, "H Signal 1"); 
     printData(h2, newsize, "H Signal 2"); 

     cudaMalloc(&d1, sizeof(Complex) * newsize); 
     cudaMalloc(&d2, sizeof(Complex) * newsize); 
     cudaMemcpy(d1, h1, sizeof(Complex) * newsize, cudaMemcpyHostToDevice); 
     cudaMemcpy(d2, h2, sizeof(Complex) * newsize, cudaMemcpyHostToDevice); 

     cudaConvolution(d1, newsize, d2, newsize, blockSize, gridSize); 

     // Explicit code run below, 

     /*signalFFT(d1, newsize); 
     cudaMemcpy(h1, d1, sizeof(Complex) * newsize, cudaMemcpyDeviceToHost); 
     printData(h1, newsize, "1 FFT"); 
     cudaMemcpy(d1, h1, sizeof(Complex) * newsize, cudaMemcpyHostToDevice); 
     signalFFT(d2, newsize); 
     cudaMemcpy(h2, d2, sizeof(Complex) * newsize, cudaMemcpyDeviceToHost); 
     printData(h2, newsize, "2 FFT"); 
     cudaMemcpy(d2, h2, sizeof(Complex) * newsize, cudaMemcpyHostToDevice); 

     pwProd<<<gridSize, blockSize>>>(d1, newsize, d2, newsize); 

     signalIFFT(d1, newsize);*/ 

     cudaDeviceSynchronize(); 

     cudaMemcpy(h1, d1, sizeof(Complex) * newsize, cudaMemcpyDeviceToHost); 

     //normData(h1, newsize, newsize); 

     printData(h1, newsize, "PwProd"); 

     free(h1); free(h2); 
     cudaFree(d1); cudaFree(d2); 

     cudaDeviceReset(); 
    } 

    return 0; 
} 


EDIT: Required output data.

H Signal 1
0.840188 0.000000 
0.394383 0.000000 
0.783099 0.000000 
0.798440 0.000000 
0.911647 0.000000 
0.197551 0.000000 
0.335223 0.000000 
0.768230 0.000000 
0.277775 0.000000 
0.553970 0.000000 
0.477397 0.000000 
0.628871 0.000000 
0.364784 0.000000 
0.513401 0.000000 
0.952230 0.000000 
0.916195 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000

H Signal 2
0.635712 0.000000 
0.717297 0.000000 
0.141603 0.000000 
0.606969 0.000000 
0.016301 0.000000 
0.242887 0.000000 
0.137232 0.000000 
0.804177 0.000000 
0.156679 0.000000 
0.400944 0.000000 
0.129790 0.000000 
0.108809 0.000000 
0.998924 0.000000 
0.218257 0.000000 
0.512932 0.000000 
0.839112 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000 
0.000000 0.000000

PwProd
64.765198 0.000000 
-20.097927 72.754028 
1.797580 1.074046 
-5.184547 7.412243 
0.148326 0.121253 
-3.457163 3.253345 
0.834668 -0.752979 
-0.414450 0.328347 
-1.268492 0.297919 
1.634082 -2.054814 
0.542893 0.087469 
0.244198 -1.392576 
0.680159 -0.110084 
0.938037 1.743742 
1.318125 -2.269666 
-1.448638 1.534995 
-0.207152 -0.000000 
-1.448638 -1.534995 
1.318125 2.269666 
0.938037 -1.743742 
0.680159 0.110084 
0.244198 1.392576 
0.542893 -0.087469 
1.634082 2.054814 
-1.268492 -0.297919 
-0.414450 -0.328347 
0.834668 0.752980 
-3.457164 -3.253347 
0.148326 -0.121253 
-5.184546 -7.412243 
1.797580 -1.074046 
-20.097923 -72.754013 

In the bad output, the second half of the PwProd result (the last 16 rows) is different, as if H Signal 2's zero-padding were not there.

I don't think you paid attention [when I suggested](http://stackoverflow.com/questions/16781653/cuda-inverse-fft-bug) that you use `cufftComplex`, and I also suggested that you post your actual data along with the data you expect. Nobody knows what you mean by "wrong/strange".

Sorry. I've added those now.

The thing is, the CUDA samples that ship with the 5.0 toolkit seem to use Complex without any problem. I doubt that this is the issue, but I'll change it once my code runs correctly.

Answer


You should do proper cuda error checking on all CUDA API calls and kernel launches (you are already error-checking the cufft API calls).
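
For reference, a minimal sketch of such a check (the `cudaCheck` macro name is just an illustration, not a library API):

#define cudaCheck(call) do {                                      \
    cudaError_t err_ = (call);                                    \
    if (err_ != cudaSuccess) {                                    \
        fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                cudaGetErrorString(err_), __FILE__, __LINE__);    \
        exit(EXIT_FAILURE);                                       \
    }                                                             \
} while (0)

// Wrap every runtime call:
cudaCheck(cudaMalloc(&d1, sizeof(Complex) * newsize));
cudaCheck(cudaMemcpy(d1, h1, sizeof(Complex) * newsize, cudaMemcpyHostToDevice));

// After a kernel launch, check both the launch and the execution:
pwProd<<<gridSize, blockSize>>>(d1, newsize, d2, newsize);
cudaCheck(cudaGetLastError());       // launch-configuration errors
cudaCheck(cudaDeviceSynchronize());  // errors raised while the kernel ran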

Another useful tool is cuda-memcheck. When I ran your code through cuda-memcheck (e.g. `cuda-memcheck ./t171`), I got several errors, the first of which points to your pwProd kernel:

========= Invalid __global__ read of size 8 
=========  at 0x00000088 in pwProd(float2*, int, float2*, int) 
=========  by thread (0,2,0) in block (0,0,0) 
=========  Address 0x400200300 is out of bounds 
=========  Saved host backtrace up to driver entry point at kernel launch time 
=========  Host Frame:/usr/lib64/libcuda.so (cuLaunchKernel + 0x3dc) [0xc9edc] 
=========  Host Frame:/usr/local/cuda/lib64/libcudart.so.5.0 [0xf513] 
=========  Host Frame:/usr/local/cuda/lib64/libcudart.so.5.0 (cudaLaunch + 0x183) [0x30f13] 
=========  Host Frame:./t171 [0x13e1] 
=========  Host Frame:./t171 (__gxx_personality_v0 + 0x2d2) [0xdea] 
=========  Host Frame:./t171 (__gxx_personality_v0 + 0x2fd) [0xe15] 
=========  Host Frame:./t171 [0x108b] 
=========  Host Frame:./t171 [0x1322] 
=========  Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf4) [0x1d994] 
=========  Host Frame:./t171 (__gxx_personality_v0 + 0x51) [0xb69] 

I then noticed that the kernel's thread check looks like this:

if (globalIdx <= size1) { 

I think it should be this:

if (globalIdx < size1) { 

When I made that change, all the cuda-memcheck errors disappeared.
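
For completeness, here is a sketch of the kernel with that bounds fix applied. As a separate observation (not something cuda-memcheck flags), the original body also overwrites signal1[globalIdx].x before reading it to compute the imaginary part, so a correct complex product needs the operands saved first:

__global__ void
pwProd(Complex *signal1, int size1, Complex *signal2, int size2) {

    int threadsPerBlock = blockDim.x * blockDim.y;
    int blockId = blockIdx.x + (blockIdx.y * gridDim.x);
    int globalIdx = (blockId * threadsPerBlock) + threadIdx.x + (threadIdx.y * blockDim.x);

    if (globalIdx < size1) {
        // Copy the operands so the imaginary part is computed from
        // the original values, not the just-updated real part.
        float ax = signal1[globalIdx].x, ay = signal1[globalIdx].y;
        float bx = signal2[globalIdx].x, by = signal2[globalIdx].y;
        signal1[globalIdx].x = ax * bx - ay * by;
        signal1[globalIdx].y = ax * by + ay * bx;
    }
}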