我的CUDA程序使用 -arch=sm_20 编译,但内核文件中的printf没有显示任何内容
// Forward declaration of the kernel launched from DeviceFunc; it takes two
// float pointers and an int.
// NOTE(review): the declaration carries both __device__ and __global__. The
// CUDA documentation states these two specifiers cannot be used together;
// __global__ alone is presumably what is intended -- confirm.
__device__ __global__ void Kernel(float *, float * ,int);
// Host-side wrapper that launches Kernel and prints a marker before and after
// the launch. Parts of the body are elided ("....") in this excerpt, so
// dimGrid, dimBlock, a_d and b_d are presumably set up in the elided code.
//   temp_h, temp1_h : not referenced in the visible lines
//   numvar          : forwarded to the kernel as its third argument
void DeviceFunc(float *temp_h , int numvar , float *temp1_h)
{ .....
// Kernel call
printf("calling kernel\n");
Kernel<<<dimGrid , dimBlock>>>(a_d , b_d , numvar);
// NOTE(review): a kernel launch returns to the host immediately, and
// device-side printf output is only flushed at synchronization points (for
// example cudaDeviceSynchronize()). No such call and no cudaGetLastError()
// check is visible between the launch and the host printf below -- confirm
// whether the elided code provides them.
printf("kernel called\n");
....
}
// Program entry point (excerpt). Prints a marker, calls DeviceFunc, then
// prints a second marker. a_h, numvar and b_h are presumably prepared in the
// elided ("....") code -- not visible here.
int main(int argc , char **argv)
{ ....
printf("beforeDeviceFunc\n\n");
DeviceFunc(a_h , numvar , b_h); //Showing the data
printf("after DeviceFunc\n\n");
....
}
另外,我在Kernel.cu中增加了一些printf语句,代码如下:
#include<cuda.h>
#include <stdio.h>
// Kernel (excerpt): each thread copies one element of a_d into a 16x16
// shared-memory tile and prints its thread indices and size. The rest of the
// body is elided ("....").
//   a_d  : input array, read with a row stride of (size + 1)
//   b_d  : not referenced in the visible lines
//   size : used in the row stride and printed by each thread
// NOTE(review): the definition carries both __device__ and __global__. The
// CUDA documentation states these two specifiers cannot be used together;
// __global__ alone is presumably what is intended -- confirm.
__device__ __global__ void Kernel(float *a_d , float *b_d ,int size)
{
// Only threadIdx is used, so every block addresses the same idx/idy range.
int idx = threadIdx.x ;
int idy = threadIdx.y ;
// Allocate a 16x16 tile in the shared memory of the device
__shared__ float temp[16][16];
// Copy the data to the shared memory
// NOTE(review): idx and idy index temp directly, so this assumes blockDim.x
// and blockDim.y are at most 16 -- TODO confirm against the launch
// configuration; no bounds check on the a_d read is visible either.
temp[idy][idx] = a_d[(idy * (size+1)) + idx] ;
// Per-thread debug output; the format string has no trailing newline.
printf("idx=%d, idy=%d, size=%d", idx, idy, size);
....
}
然后我使用 -arch=sm_20 进行编译,
命令如下:
nvcc -c -arch sm_20 main.cu
nvcc -c -arch sm_20 Kernel.cu
nvcc -arch sm_20 main.o Kernel.o -o main
现在,当我运行程序时,我看到:
beforeDeviceFunc
calling kernel
kernel called
after DeviceFunc
因此,内核中的printf不会被打印。我该如何解决这个问题?
在我的情况下,`cudaDeviceSynchronize()` 没有返回任何错误。但我注意到线程块尺寸太大(32 x 32),改用更小的线程块解决了问题。 – atoMerz