CUDA矩阵乘法 - 再次

我觉得有点不好使一个已经有10个同名的论坛线程，但是在检查完所有线程以及大多数指南后，我仍然无法确定问题。CUDA矩阵乘法 - 再次

我有一个char数组[40090] [11]，我想对它的两个元素（我认为整个11字节串作为一个元素）的每个可能的组合进行自定义操作。我明白这是一种矩阵乘法，矩阵是一列和一行。

遵循SDK手册我想每个输出元素有1个线程。由于40090 = 19 * 2110，我使用：

dim3 threadsperblock(19,19); 
dim3 blocksingrid(2110,2110); 
xkernel<<<blocksingrid, threadsperblock>>>(dev_b2);

问题1：这个罚款？

好吧，然后，我想想我正在关注SDK的maunal示例（不是使用共享内存的示例）。但是，每当我敢于在数据上进行我想要的操作的一部分时，我就会得到一个大量无用的错误30：未知的错误。所以，问题2：我做错了什么？注意：忽略内核不保存任何地方。

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 
#include <cstdlib> 
#include <iostream> 
#include <fstream> 
#include <iomanip> 
#include <ctime> 
#include <stdio.h> 
using namespace std; 

cudaError_t cudafunct(void); 
__global__ void xkernel(char * dev_b2); 
__device__ unsigned char typecheck(unsigned char type1,unsigned char type2); 


#define b2c 40090 
unsigned char block2[b2c][11];// 
//unsigned int i,b1,b2,counter=0;//Block(2),Piece,Rotation,Type(of block2),InterconnectinTriangle 
//unsigned char *block4,type=0; 
ofstream ofile; 




int main() 
{ 
    ifstream block2file("2.blk",ios::binary); 
    block2file.read((char*)(&block2),b2c*11); 
    block2file.close(); 
    //block4=new unsigned char[200000000];//200MB will do, better than doing constant reallocs 

    cudaError_t cudaStatus = cudafunct(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudafunct failed!"); 
     system("PAUSE"); 
     return 1; 
    } 
    /* 

    // cudaDeviceReset must be called before exiting in order for profiling and 
    // tracing tools such as Nsight and Visual Profiler to show complete traces. 
    cudaStatus = cudaDeviceReset(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaDeviceReset failed!"); 
     return 1; 
    }*/ 
    cout<<"Sequence end. Saving to file...\n";  
    //ofile.open("blk4.et2",ios::binary); 
    //ofile.write((char*)block4,17*counter); 
    //ofile.close(); 
    int t=clock(); 
    //cout<<"\nFound a total of "<<counter<<" block4s.\nTime elapsed: "<<t<<" clocks/"<<(double)t/(double)CLOCKS_PER_SEC<<" seconds\n"; 
    system("PAUSE"); 
} 

// Helper function for using CUDA to add vectors in parallel. 
cudaError_t cudafunct(void) 
{ 
    char *dev_b2 = 0; 
    cudaError_t cudaStatus; 

    cudaStatus = cudaMalloc((void**)&dev_b2, sizeof(block2)); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMalloc failed!"); 
     goto Error; 
    } 

    cudaStatus = cudaMemcpy(dev_b2, block2, sizeof(block2), cudaMemcpyHostToDevice); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    } 

    dim3 threadsperblock(19,19); 
    dim3 blocksingrid(2110,2110); 
    xkernel<<<blocksingrid, threadsperblock>>>(dev_b2); 

    // cudaDeviceSynchronize waits for the kernel to finish, and returns 
    // any errors encountered during the launch. 
    cudaStatus = cudaDeviceSynchronize(); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching xkernel!\n", cudaStatus); 
     goto Error; 
    } 
    /* 
    // Copy output vector from GPU buffer to host memory. 
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost); 
    if (cudaStatus != cudaSuccess) { 
     fprintf(stderr, "cudaMemcpy failed!"); 
     goto Error; 
    }*/ 

Error: 
    cudaFree(dev_b2); 
    return cudaStatus; 
} 


__global__ void xkernel(char *dev_b2) 
{ 
     int i = blockIdx.x * blockDim.x + threadIdx.x; 
     int j = blockIdx.y * blockDim.y + threadIdx.y; 
     /*for(int k=0;k<11;k++) 
     { 
      lb2[0][k]=dev_b2[i*b2c+k]; 
      lb2[1][k]=dev_b2[j*b2c+k]; 
     }*/ 
     int b00; 
     b00=dev_b2[i*b2c]; 

     //int type=typecheck(dev_b2[i*b2c+4],dev_b2[j*b2c+4]); 
     //if(!j && !(i % 100))cout<<setw(6)<<i<<"/"<<jc<<" ("<<setw(10)<<(float)100*i/jc<<" %)"<<endl;  
     /*if(
      (dev_b2[i*b2c+7]!=dev_b2[j*b2c+9])||//SW~NW  
      (dev_b2[i*b2c+6]!=dev_b2[j*b2c+10])//SE~NE                       
     ) return; 
     if((type=typecheck(dev_b2[i*b2c+4],dev_b2[j*b2c+4])) ==255) return;*/ 
     /*if(
      (dev_b2[i*b2c+0]==dev_b2[j*b2c+0])||//1st=3rd 
      (dev_b2[i*b2c+0]==dev_b2[j*b2c+2])||//1st=4th 
      (dev_b2[i*b2c+2]==dev_b2[j*b2c+0])||//2nd=3rd 
      (dev_b2[i*b2c+2]==dev_b2[j*b2c+2])//2nd=4th 
     ) return;*/ 
     /* 
     *(block4+counter*17+0)=b2[i][0];//1st piece 
     *(block4+counter*17+1)=b2[i][1];//1st rotation 
     *(block4+counter*17+2)=b2[i][2];//2nd piece 
     *(block4+counter*17+3)=b2[i][3];//2nd rotation 
     *(block4+counter*17+4)=b2[j][0];//3rd piece 
     *(block4+counter*17+5)=b2[j][1];//3rd rotation 
     *(block4+counter*17+6)=b2[j][2];//4th piece 
     *(block4+counter*17+7)=b2[j][3];//4th rotation 
     *(block4+counter*17+8)=type; 
     *(block4+counter*17+9)=b2[i][5];//Right frame colours, down->up 
     *(block4+counter*17+10)=b2[j][5]; 
     *(block4+counter*17+11)=b2[j][6];//Up frame colours, right->left 
     *(block4+counter*17+12)=b2[j][7]; 
     *(block4+counter*17+13)=b2[j][8];//Left frame colours, up->down 
     *(block4+counter*17+14)=b2[i][8]; 
     *(block4+counter*17+15)=b2[i][9];//Down frame colours, left->right 
     *(block4+counter++*17+16)=b2[i][10];*/ 
} 



__device__ unsigned char typecheck(unsigned char type1,unsigned char type2) 
{//Warning! Previous error! First partenthesis is t*2* = upper piece! 
     if((type1==4) && (type2==0)) return 0; 
     if((type1==6) && (type2==1)) return 1; 
     if((type1==2) && (type2==6)) return 2; 
     if((type1==3) && (type2==4)) return 3; 
     if((type1==4) && (type2==4)) return 4; 
     if((type1==8) && (type2==5)) return 5; 
     if((type1==6) && (type2==6)) return 6; 
     if((type1==7) && (type2==8)) return 7; 
     if((type1==8) && (type2==8)) return 8; 
     if((type1==9) && (type2==8)) return 9; 
     if((type1==10) && (type2==8)) return 10; 
     if((type1==8) && (type2==11)) return 11; 
     if((type1==8) && (type2==12)) return 12; 
     if((type1==8) && (type2==13)) return 13; 
     return 255; 
}

来源

2012-10-11 user1058795

您确定CUDA驱动程序正在运行？请从SDK中测试bandwidthTest或deviceQuery。 – ahmad

带宽测试工作正常。 – user1058795

我中有你从你的dev_b2阵列读出界外的感觉。 blockIdx.x在[0..2110]的范围内，所以变量i在[0..]的范围内。但是，你将它乘以b2c。因此，您读取的最高地址将是b2c*= 930488900。

但是dev_b2只有大小b2c*11 = 440990。

来源

2012-10-11 08:57:22 CygnusX1

我不认为这些是范围。正如我发布的，blockIdx.x的范围是2110，而线程equivelant是19.另一个有趣的事情：我发布的代码实际上工作。但是，如果不是int b00，我使用int b [0] [0]并尝试将相同的值赋给b [0] [0]，这就是我得到错误的地方。 – user1058795

这可能会更好，如果你发布的代码实际上失败了。我不太清楚你的意思是什么“然而，如果不是int b00，我做了一个int b [0] [0] ...在内核中你有int b00; b00 = dev_b2 [i @ b2c];我将其更改为int b [1] [1]; b [0] [0] = dev_b2 [i * b2c];它的编译和运行方式与更改前相同。是32的倍数，warp大小 –

@ user1058795是的，我把b2x和gridDim.x混合在一起，后者有点小，但即使如此，你仍然走出界限，我用数字来解决我的反应。你发布的代码实际上什么都不做，CUDA将通过死代码消除产生一个空内核。 – CygnusX1

CUDA矩阵乘法 - 再次

回答

相关问题