Cuda程序不能正常工作

我是cuda编程的初学者。我正在尝试一个简单的代码，但它不工作，我不知道还有什么要做。Cuda程序不能正常工作

我的代码：

#include <mpi.h> 
#include <cuda.h> 
#include <stdio.h> 
#include <sys/wait.h> 
// Prototypes 
__global__ void helloWorld(char*); 
__device__ int getGlobalIdx_2D_2D(); 

// Host function 

int main(int argc, char** argv) 
{ 
    unsigned int i, N, gridX, gridY, blockX, blockY; 
    N = 4096000; 

    char *str = (char *) malloc(N*sizeof(char)); 
    for(i=0; i < N; i++) str[i]='c'; 

    MPI_Init (&argc, &argv); 

    char *d_str; 
    size_t size = (size_t) N*sizeof(char); 
    cudaMalloc((void**)&d_str, size); 
    cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice); 

    gridX = 100; 
    gridY = 10; 
    blockX = blockY = 64; 
    dim3 dimGrid(gridX, gridY); // 4096 chars per block 
    dim3 dimBlock(blockX, blockY); // one thread per character, 2D 
    printf("dimGrid(%d, %d)\t", gridX, gridY); 
    printf("dimBlock(%d, %d)\t", blockX, blockY); 
    helloWorld<<< dimGrid, dimBlock >>>(d_str); 

    cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost); 
    cudaThreadSynchronize(); 

    MPI_Barrier (MPI_COMM_WORLD); 

    cudaFree(d_str); 

    printf("\nRes:\n"); 
    for(i = 0; i < N; i++) printf("\t[%u] %c\n", i, str[i]); 

    MPI_Finalize(); 

    free(str); 
    return 0.0; 
} 

// Device kernel 
__global__ void helloWorld(char* str) 
{ 
    // determine where in the thread grid we are 
    int pos = getGlobalIdx_2D_2D(); 
    if (pos % 2 == 0) str[pos] -= 2; 
    else str[pos] += 8; 
} 

__device__ int getGlobalIdx_2D_2D() 
{ 
    int blockId = blockIdx.x + blockIdx.y * gridDim.x; 
    int threadId = blockId * (blockDim.x * blockDim.y) + 
        (threadIdx.y * blockDim.x) + threadIdx.x; 
    return threadId; 
}

我所需的输出是：jajajajajajaja ... x4096000

我读过，“％”的操作效率不高，但我不认为这是问题那里。

谢谢！

来源

2016-01-12 Genís Moreno

啊，它输出的是'ccccccc ...'x4096000，作为初始化，所以char指针数组没有被修改过。 –

只是好奇，为什么4096000次？ –

请将您的程序当前（不正确）输出置于问题文本中。 – Lav

你正在执行绝对没有CUDA error checking，这样做真的很有益。一旦启用它，你会发现块尺寸64×64是无效的，因为它在一个块内产生4096个线程，这不是一个有效的配置。

来源

2016-01-12 09:49:46

我在HelloWorld例子中使用了cudaThreadSynchronize，所以我没有任何理由。我开始使用CUDA错误检查，问题是每个块的64x64线程不是一个有效的配置，如本主题中所述（http://stackoverflow.com/questions/16125389/invalid-configuration-argument-error-for-the-呼叫的-CUDA内核） –

Cuda程序不能正常工作

回答

相关问题