Cuda产生的数组乘法错误

我想乘两个数组，我的Cuda RunTime API有问题。有一个错误：cuda运行时API错误11：无效的参数。什么是论据？救救我，请代码：Cuda产生的数组乘法错误

// includes, system 
#include <stdlib.h> 
#include <stdio.h> 
#include <string.h> 
#include <math.h> 
#include <fstream> 

// includes, project 
#include <cufft.h> 
#include <cuda.h> 
#include <cutil_inline.h> 
#include <shrQATest.h> 

#define SIGNAL_SIZE  8192 
#define SIGNAL_COUNT 384 

// Complex data type 
typedef float2 Complex;  

static __device__ __host__ inline Complex ComplexMul(Complex, Complex); 
static __device__ __host__ inline Complex ComplexAdd(Complex); 
static __global__ void ComplexPointwiseMA(Complex* , Complex* , Complex*); 

// Complex addition  
static __device__ __host__ inline Complex ComplexAdd(Complex a,Complex b) 
{ 
    Complex c; 
    c.x = a.x + b.x; 
    c.y = a.y + b.y; 
    return c; 
} 

// Complex multiplication 
static __device__ __host__ inline Complex ComplexMul(Complex a, Complex b) 
{ 
    Complex c; 
    c.x = a.x * b.x - a.y * b.y; 
    c.y = a.x * b.y + a.y * b.x; 
    return c; 
} 

// Complex Mul 'n Add function 
static __global__ void ComplexPointwiseMA(Complex* mas1, Complex* mas2, Complex*mas_tmp) 
{ 
    const int numThreads = blockDim.x * gridDim.x; 
    const int threadID = blockIdx.x * blockDim.x + threadIdx.x; 
    int j=0, k=0;  
    Complex c; 
    int size=SIGNAL_COUNT*SIGNAL_SIZE; 
    for (int i = threadID; i <size; i += numThreads) 
    { 
        if((i%SIGNAL_SIZE==0)&&(j<SIGNAL_COUNT)) 
            j++; 
        c=ComplexMul(mas1[i], mas2[j]); 
        mas_tmp[k]=ComplexAdd(mas_tmp[k],c); 
        if(k>=SIGNAL_SIZE-1) 
            k=0; 
        else 
            k++; 
    } 
} 

int main(int argc, char** argv) 
{ 
    int threads_per_block=256; 
    int blocks_per_grid=16; 
    //dim3 dimBlock(SIGNAL_COUNT, SIGNAL_SIZE); // threads  
    //dim3 dimGrid(SIGNAL_COUNT); // 384 blocks in a grid 
    cudaEvent_t start, stop; 
    float elapsedTime; 
    //shrQAStart(argc, argv); 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 

    // For random 
    srand(1001); 

    // Allocate host memory for the mass 
    Complex* mas_tmp=NULL; 
    mas_tmp =new Complex [SIGNAL_COUNT*SIGNAL_SIZE]; 

    Complex* mas1=NULL; 
    mas1 =new Complex [SIGNAL_COUNT*SIGNAL_SIZE]; 
    for(unsigned int i = 0; i <SIGNAL_COUNT*SIGNAL_SIZE; ++i) 
    { 
        mas1[i].x = rand()/(float)RAND_MAX; 
        mas1[i].y = rand()/(float)RAND_MAX; 
    } 

    Complex* mas2=NULL; 
    mas2 =new Complex [SIGNAL_COUNT]; 
    for(unsigned int i = 0; i < SIGNAL_COUNT; ++i) 
    { 
        mas2[i].x = rand()/(float)RAND_MAX; 
        mas2[i].y = rand()/(float)RAND_MAX; 
    } 

    // Timer 
    cudaEventRecord(start,0); 

    // Allocate device memory for mass 
    Complex* mastmp_=NULL; // temporary  
    cutilSafeCall(cudaMalloc(&mastmp_, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex))); 
    Complex* mas1_=NULL; // signal 
    cutilSafeCall(cudaMalloc(&mas1_, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex))); 
    Complex* mas2_=NULL; // coefficient 
    cutilSafeCall(cudaMalloc(&mas2_, SIGNAL_COUNT*sizeof(Complex))); 

    // Copy host data to device 
    cutilSafeCall(cudaMemcpy(mas1_, mas1, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex),    cudaMemcpyHostToDevice)); 
    cutilSafeCall(cudaMemcpy(mas2_, mas2, SIGNAL_COUNT*sizeof(Complex), cudaMemcpyHostToDevice)); 

    // Calling  Mul 'n Add function 
    ComplexPointwiseMA<<<blocks_per_grid, threads_per_block>>>(mas1_, mas2_, mastmp_); 

    // Check if kernel execution generated and error 
    cutilCheckMsg("Kernel execution failed [ ComplexPointwiseMA ]"); 

    // Copy device memory to host 
    cutilSafeCall(cudaMemcpy(mastmp_, mas_tmp, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex), cudaMemcpyDeviceToHost)); 

    cudaEventRecord(stop,0); 
    cudaEventSynchronize(stop); 
    cudaEventElapsedTime(&elapsedTime, start,stop); 
    printf("Time %3.10f ms\n", elapsedTime); 

    // Free memory 
    free(mas1); 
    free(mas2); 
    free(mas_tmp); 
    cudaEventDestroy(start); 
    cudaEventDestroy(stop); 

    cutilSafeCall(cudaFree(mas1_)); 
    cutilSafeCall(cudaFree(mas2_)); 
    cutilSafeCall(cudaFree(mastmp_)); 

    cutilDeviceReset(); 
    system ("pause"); 
    return 0; 
}

来源

2013-01-09 Andrew Listopadoff

请更加注意你的代码在未来的格式。你可以在[这个meta stackoverflow问题]（http://meta.stackexchange.com/q/22186/163653）上看到关于发布和格式化代码约定的提示。 – talonmies

此外，您还没有解释代码中发生错误的位置。 – talonmies

你可以使用'cublasCgemv（）'做矩阵向量乘法来简化你的代码并使性能最大化。 – kangshiyin

在你的代码行会产生的Invalid Argument错误。

// Copy device memory to host 
    cutilSafeCall(cudaMemcpy(mastmp_, mas_tmp, SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex), cudaMemcpyDeviceToHost));

在功能cudaMemcpy，第一个参数是目标指针和第二参数是源指针。您已指定标志cudaMemcpyDeviceToHost，但主机（目标）和设备（源）指针已互换。

你应该做这个（交换第2个参数）：

cutilSafeCall(cudaMemcpy(mas_tmp, mastmp_,SIGNAL_SIZE*SIGNAL_COUNT*sizeof(Complex), cudaMemcpyDeviceToHost));

来源

2013-01-09 09:34:42 sgarizvi

Cuda产生的数组乘法错误

回答

相关问题