我正在CUDA项目中工作,但我遇到了一些严重的麻烦,我找不到解决方案。CUDA内核崩溃没有任何理由20k +线程
我使用NVIDIA Quadro K2000m实现了该项目,并在我的PC(pA)中运行。但是当我在一个拥有Nvidia Tesla GPU的集群上部署该项目时,以及在另一台PC(pB)(NVIDIA GTX 960m)上部署该项目时,该项目将无法执行!
有趣的是,当我在Visual Studio中使用Nsight调试器上的pB(第二PC),它会执行,而不是显示错误:未指定发射失败
这是代码第一核心:
__global__ void calcKernel(float *dev_calcMatrix,
int *documentarray,
int *documentTermArray,
int *distincttermsarray,
int *distinctclassarray,
int *startingPointOfClassDoc,
int *endingPOintOfClassDoc,
int sizeOfDistinctClassarray,
int sizeOfTerms)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int term = distincttermsarray[index];
if (index <= sizeOfTerms) {
for (int i = 0; i < sizeOfDistinctClassarray; i++)
{
int save = (index * sizeOfDistinctClassarray) + i;
bool test = false;
for (int j = startingPointOfClassDoc[i]; j <= endingPOintOfClassDoc[i]; j++)
{
if (term == documentarray[j])
{
printf("%i \t", index);
dev_calcMatrix[save] = dev_calcMatrix[save] + documentTermArray[j];
//printf("TermArray: documentTermArray[j] %d\n", dev_calcMatrix[save], documentTermArray[j]);
test = true;
}
}
if (!test) dev_calcMatrix[save] = 0;
}
}
}
这是我使用来创建线程和块代码:
float blockNotFinal = data.sizeOfDistinctTerms/1024;
int threads = 0;
int blocks = (int)floor(blockNotFinal);
dim3 dimGrid((blocks + 1), 1, 1);
if (data.sizeOfDistinctTerms < 1024)
{
threads = data.sizeOfDistinctTerms;
}
else
{
threads = 1024;
}
dim3 dimBlock(threads, 1, 1);
所以,我需要创建23,652个线程。我正在做的是23,652/1024 = 23.09。在获得23.09值后,我将它舍入到23并加上+ 1 = 24块。所以我创建了24个块* 1024个线程:24,576个线程。
我知道,即使他们不会使用一些线程将被创建,这就是为什么我说这个if语句中的内核beggining:
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index <= sizeOfTerms (23,652 is the size)) { .... }
的问题是,我加了一些PRINTF( )在IF语句之前和IF语句之后。
IF语句坠毁前线程的最大指数前:24479 里面的IF语句的最大线程指数坠毁之前曾是:23487.
所以,从上面的信息,数量线程不会达到最大值。另外,在群集上,它给了我另一个错误:遇到非法内存访问。我知道这个错误意味着它有一个索引超出限制,但是我给出了与线程数相等的数组大小。
下面是代码,我在GPU分配内存:
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
cout << "\n Allocated GPU buffers";
// Allocate GPU buffers for input and output vectors
cudaStatus = cudaMalloc((void**)&dev_calcMatrix, data.sizeOfDistinctTerms * data.sizeOfDistinctClassarray * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_probMatrix, data.sizeOfDistinctTerms * data.sizeOfDistinctClassarray * sizeof(float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&classSummationTerms, data.sizeOfDistinctClassarray * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&documentarray, data.sizeOfTotalTermsDocsFreq * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&documentTermArray, data.sizeOfTotalTermsDocsFreq * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&distincttermsarray, data.sizeOfDistinctTerms * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&distinctclassarray, data.sizeOfDistinctClassarray * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&startingPointOfClassDoc, data.sizeOfDistinctClassarray * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&endingPOintOfClassDoc, data.sizeOfDistinctClassarray * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cout << "\n Copied input vectors from host to GPU";
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(documentarray, data.documentarray, data.sizeOfTotalTermsDocsFreq * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(documentTermArray, data.documentTermArray, data.sizeOfTotalTermsDocsFreq * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(distincttermsarray, data.distincttermsarray, data.sizeOfDistinctTerms * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(classSummationTerms, data.classSummationTerms, data.sizeOfDistinctClassarray * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(distinctclassarray, data.distinctclassarray, data.sizeOfDistinctClassarray * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(startingPointOfClassDoc, data.startingPointOfClassDoc, data.sizeOfDistinctClassarray * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(endingPOintOfClassDoc, data.endingPOintOfClassDoc, data.sizeOfDistinctClassarray * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cout << "\n Now we call the CALCKERNL()";
// Launch a kernel on the GPU with one thread for each element.
calcKernel <<<dimGrid, dimBlock >>>(dev_calcMatrix,
documentarray,
documentTermArray,
distincttermsarray,
distinctclassarray,
startingPointOfClassDoc,
endingPOintOfClassDoc,
sizi,
sizeOfTerms);
//// cudaDeviceSynchronize waits for the kernel to finish, and returns
//// any errors encountered during the launch.
//cudaStatus = cudaDeviceSynchronize();
//if (cudaStatus != cudaSuccess) {
// fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
// goto Error;
//}
cudaStatus = cudaStreamSynchronize(0);
if (cudaStatus != cudaSuccess) {
//fprintf(stderr, "calcKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
cout << "\n Synchronization failed: " << cudaGetErrorString(cudaStatus);
goto Error;
}
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "calcKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
任何想法,为什么发生这种情况?
没有,23,652本可以。问题是,他正在运行23,653个线程。 – tera
我想你会很难制造[mcve]。 – user4581301
线程数限制在GPU上吗?这个限制对于不同的GPU是不同的? – Ripi2