-3

I am working on a CUDA project, but I have run into some serious trouble for which I cannot find a solution: the CUDA kernel crashes for no apparent reason with 20k+ threads.

I implemented the project using an NVIDIA Quadro K2000M and it runs on my PC (pA). But when I deploy the project on a cluster that has an NVIDIA Tesla GPU, and also on another PC (pB) with an NVIDIA GTX 960M, it fails to execute!

The interesting thing is that when I run it on pB (the second PC) under the Nsight debugger in Visual Studio, it executes instead of showing the error: unspecified launch failure.

Here is the code of the first kernel:

__global__ void calcKernel(float *dev_calcMatrix, 
         int *documentarray, 
         int *documentTermArray, 
         int *distincttermsarray, 
         int *distinctclassarray, 
         int *startingPointOfClassDoc, 
         int *endingPOintOfClassDoc, 
         int sizeOfDistinctClassarray, 
         int sizeOfTerms) 
{ 

int index = blockIdx.x * blockDim.x + threadIdx.x; 

int term = distincttermsarray[index]; 

if (index <= sizeOfTerms) { 

    for (int i = 0; i < sizeOfDistinctClassarray; i++) 
    { 
     int save = (index * sizeOfDistinctClassarray) + i; 
     bool test = false; 
     for (int j = startingPointOfClassDoc[i]; j <= endingPOintOfClassDoc[i]; j++) 
     { 
      if (term == documentarray[j]) 
      { 
       printf("%i \t", index); 
       dev_calcMatrix[save] = dev_calcMatrix[save] + documentTermArray[j]; 

       //printf("TermArray: documentTermArray[j] %d\n", dev_calcMatrix[save], documentTermArray[j]); 

       test = true; 
      } 
     } 

     if (!test) dev_calcMatrix[save] = 0; 


    } 
} 
} 

Here is the code I use to create the threads and blocks:

float blockNotFinal = data.sizeOfDistinctTerms/1024; 
int threads = 0; 
int blocks = (int)floor(blockNotFinal); 

dim3 dimGrid((blocks + 1), 1, 1); 
if (data.sizeOfDistinctTerms < 1024) 
{ 
    threads = data.sizeOfDistinctTerms; 
} 
else 
{ 
    threads = 1024; 
} 
dim3 dimBlock(threads, 1, 1); 

So I need to create 23,652 threads. What I am doing is 23,652 / 1024 = 23.09. After getting the value 23.09, I round it down to 23 and add 1 = 24 blocks. So I create 24 blocks * 1024 threads: 24,576 threads in total.
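
For reference, the same rounding is usually written with integer ceiling division, which avoids the float/floor round trip. A minimal sketch, assuming data.sizeOfDistinctTerms holds the total number of work items (23,652 here):

// Common integer ceiling-division idiom for the grid size.
const int threadsPerBlock = 1024;
int totalItems = data.sizeOfDistinctTerms;
int blocks = (totalItems + threadsPerBlock - 1) / threadsPerBlock;  // 24 for 23,652 items
dim3 dimGrid(blocks, 1, 1);
dim3 dimBlock(threadsPerBlock, 1, 1);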

I know that some threads will be created even though they will not be used, which is why I put this if statement at the beginning of the kernel:

int index = blockIdx.x * blockDim.x + threadIdx.x; 

if (index <= sizeOfTerms) { .... }   // sizeOfTerms is 23,652

The problem is that I added some printf() calls before the if statement and inside the if statement.

Before the crash, the highest thread index printed before the if statement was 24479, and the highest thread index printed inside the if statement was 23487.

So, from the information above, the number of threads does not reach the maximum. Also, on the cluster it gives me a different error: an illegal memory access was encountered. I know this error means that some index goes out of bounds, but I allocated the arrays with sizes equal to the number of threads.

Below is the code where I allocate memory on the GPU:

cudaStatus = cudaSetDevice(0); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); 
    goto Error; 
} 

cout << "\n Allocated GPU buffers"; 
// Allocate GPU buffers for input and output vectors 
cudaStatus = cudaMalloc((void**)&dev_calcMatrix, data.sizeOfDistinctTerms * data.sizeOfDistinctClassarray * sizeof(float)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    goto Error; 
} 
cudaStatus = cudaMalloc((void**)&dev_probMatrix, data.sizeOfDistinctTerms * data.sizeOfDistinctClassarray * sizeof(float)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    goto Error; 
} 

cudaStatus = cudaMalloc((void**)&classSummationTerms, data.sizeOfDistinctClassarray * sizeof(int)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    goto Error; 
} 

cudaStatus = cudaMalloc((void**)&documentarray, data.sizeOfTotalTermsDocsFreq * sizeof(int)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    goto Error; 
} 

cudaStatus = cudaMalloc((void**)&documentTermArray, data.sizeOfTotalTermsDocsFreq * sizeof(int)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    goto Error; 
} 

cudaStatus = cudaMalloc((void**)&distincttermsarray, data.sizeOfDistinctTerms * sizeof(int)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    goto Error; 
} 

cudaStatus = cudaMalloc((void**)&distinctclassarray, data.sizeOfDistinctClassarray * sizeof(int)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    goto Error; 
} 

cudaStatus = cudaMalloc((void**)&startingPointOfClassDoc, data.sizeOfDistinctClassarray * sizeof(int)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    goto Error; 
} 

cudaStatus = cudaMalloc((void**)&endingPOintOfClassDoc, data.sizeOfDistinctClassarray * sizeof(int)); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMalloc failed!"); 
    goto Error; 
} 

cout << "\n Copied input vectors from host to GPU"; 
// Copy input vectors from host memory to GPU buffers. 
cudaStatus = cudaMemcpy(documentarray, data.documentarray, data.sizeOfTotalTermsDocsFreq * sizeof(int), cudaMemcpyHostToDevice); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemcpy failed!"); 
    goto Error; 
} 

cudaStatus = cudaMemcpy(documentTermArray, data.documentTermArray, data.sizeOfTotalTermsDocsFreq * sizeof(int), cudaMemcpyHostToDevice); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemcpy failed!"); 
    goto Error; 
} 

cudaStatus = cudaMemcpy(distincttermsarray, data.distincttermsarray, data.sizeOfDistinctTerms * sizeof(int), cudaMemcpyHostToDevice); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemcpy failed!"); 
    goto Error; 
} 

cudaStatus = cudaMemcpy(classSummationTerms, data.classSummationTerms, data.sizeOfDistinctClassarray * sizeof(int), cudaMemcpyHostToDevice); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemcpy failed!"); 
    goto Error; 
} 

cudaStatus = cudaMemcpy(distinctclassarray, data.distinctclassarray, data.sizeOfDistinctClassarray * sizeof(int), cudaMemcpyHostToDevice); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemcpy failed!"); 
    goto Error; 
} 

cudaStatus = cudaMemcpy(startingPointOfClassDoc, data.startingPointOfClassDoc, data.sizeOfDistinctClassarray * sizeof(int), cudaMemcpyHostToDevice); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemcpy failed!"); 
    goto Error; 
} 

cudaStatus = cudaMemcpy(endingPOintOfClassDoc, data.endingPOintOfClassDoc, data.sizeOfDistinctClassarray * sizeof(int), cudaMemcpyHostToDevice); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "cudaMemcpy failed!"); 
    goto Error; 
} 


cout << "\n Now we call the CALCKERNL()"; 
// Launch a kernel on the GPU with one thread for each element. 
calcKernel <<<dimGrid, dimBlock >>>(dev_calcMatrix, 
          documentarray, 
          documentTermArray, 
          distincttermsarray, 
          distinctclassarray, 
          startingPointOfClassDoc, 
          endingPOintOfClassDoc, 
          sizi, 
          sizeOfTerms); 

//// cudaDeviceSynchronize waits for the kernel to finish, and returns 
//// any errors encountered during the launch. 
//cudaStatus = cudaDeviceSynchronize(); 
//if (cudaStatus != cudaSuccess) { 
// fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus); 
// goto Error; 
//} 

cudaStatus = cudaStreamSynchronize(0); 
if (cudaStatus != cudaSuccess) { 
    //fprintf(stderr, "calcKernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 
    cout << "\n Synchronization failed: " << cudaGetErrorString(cudaStatus); 
    goto Error; 
} 
// Check for any errors launching the kernel 
cudaStatus = cudaGetLastError(); 
if (cudaStatus != cudaSuccess) { 
    fprintf(stderr, "calcKernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); 
    goto Error; 
} 

Any idea why this is happening?

+0

No, 23,652 would be fine. The problem is that he is running 23,653 threads. – tera

+0

I think you will have a hard time making a [mcve]. – user4581301

+0

Is the number of threads limited on the GPU? Is that limit different for different GPUs? – Ripi2

Answers

1

Without a Minimal, Complete, and Verifiable example, or even the complete code, this is impossible to answer. But the very beginning of the kernel already has two flaws that can lead to out-of-bounds memory accesses:

int index = blockIdx.x * blockDim.x + threadIdx.x; 

int term = distincttermsarray[index]; 

if (index <= sizeOfTerms) { 

First, it is not safe to use index as an array index before checking that it is within the required range. Second, if sizeOfTerms is the number of array elements, the check needs to be index < sizeOfTerms (not <=).
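
A minimal sketch of the start of the kernel with both points applied (illustrative only, reusing the names from the question): the bounds check comes first and uses a strict comparison, and distincttermsarray is only read afterwards.

int index = blockIdx.x * blockDim.x + threadIdx.x; 

// Reject out-of-range threads before touching any array; valid indices are 0 .. sizeOfTerms - 1. 
if (index >= sizeOfTerms) return; 

// Safe now that index has been range-checked. 
int term = distincttermsarray[index]; 

// ... rest of the kernel body unchanged ... 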

+0

Wow, I really had some bad bugs :( I fixed the ones you pointed out, but I still have the same problem :( I did a clean build in Visual Studio, because it sometimes gives me really hard-to-explain problems, but it still did not work. I also compiled and ran it on the cluster I mentioned, but still the same problem :( – user3774470

+1

Run your code under cuda-memcheck and fix the problems it points out. Prepare an MCVE, then we will see. – tera

0

A simple way to find this error and fix it is to run the code under cuda-memcheck, as suggested by @tera, and with the CUDA debugger, without setting any breakpoints. The debugger should stop at the instant the error occurs.
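
For example (the executable name is only a placeholder), cuda-memcheck is run from the command line and reports the kernel, the thread/block, and the address of the first invalid access:

cuda-memcheck myApp.exe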

My suggestion is Nsight + Visual Studio with TDR turned off, so it is not a problem if the illegal access takes a while to occur.