无效配置参数：线程块数量超过 16 位范围

以下代码工作正常：

#include <stdio.h> 
#define N 1000 // <-- Works for values < 2^16 

// Doubles each element of `a` into `b`, one element per thread block:
// the block index along x selects the element; indices >= N are skipped.
__global__
void add(int *a, int *b) {
    const int idx = blockIdx.x;
    if (idx >= N) {
        return;
    }
    b[idx] = a[idx] * 2;
}
// Host driver: fills ha with 0..N-1, doubles it on the device with add(),
// copies the result into hb and reports the largest value and its index.
// NOTE: CUDA API return codes and the launch status are not checked here,
// so any device-side failure passes silently.
int main() {
    // max_value[0] = largest value seen so far, max_value[1] = its index.
    // The index starts at 0 so it is well defined even when the maximum is
    // element 0 (the scan below only writes it on a strictly larger value).
    int max_value[2] = {0, 0};
    int ha[N], hb[N];
    int *da, *db;
    cudaMalloc((void **)&da, N*sizeof(int));
    cudaMalloc((void **)&db, N*sizeof(int));
    for (int i = 0; i<N; ++i) {
        ha[i] = i;
    }
    cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice);
    // One block per element, one thread per block.
    add<<<N, 1>>>(da, db);
    cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost);
    max_value[0] = hb[0];
    for (int i = 1; i < N; i++) {
        if (hb[i] > max_value[0]) {
            max_value[0] = hb[i];
            max_value[1] = i;
        }
    }
    cudaFree(da);
    cudaFree(db);
    printf("Max number %d, from value:%d \n", max_value[0], max_value[1]);
    getchar();
    return 0;
}

但是，当我把 N（数组中的元素个数）从 1000 改为大于 2^16 − 1 的值时，程序崩溃。

我以为这是主机（host）端的溢出，于是把 ha 和 hb 数组的声明移到了 BSS 段（静态存储区），并把 N 改为 100 万。

#include <stdio.h> 
#define N 1000000 // <---- 

// Writes b[k] = 2 * a[k], where k is this block's x index in the grid.
// Blocks whose index falls outside [0, N) do nothing.
__global__
void add(int *a, int *b) {
    const int k = blockIdx.x;
    if (k >= N) {
        return;
    }
    b[k] = a[k] * 2;
}
// Host buffers have static storage duration (instead of living on main's
// stack) so that a large N cannot overflow the stack.
static int ha[N]; // input values 0..N-1
static int hb[N]; // results copied back from the device

// Host driver: doubles ha on the device with add(), copies the result into
// hb and reports the largest value and its index.
// NOTE: CUDA API return codes and the launch status are not checked here,
// so a failed launch goes unnoticed and hb keeps its zero-initialised
// contents.
int main() {
    // max_value[0] = largest value seen so far, max_value[1] = its index.
    // The index starts at 0: the scan below only writes it on a strictly
    // larger value, so without this the printf would read an uninitialised
    // int whenever hb[0] is already the maximum (e.g. hb is all zero).
    int max_value[2] = {0, 0};
    int *da, *db;
    cudaMalloc((void **)&da, N*sizeof(int));
    cudaMalloc((void **)&db, N*sizeof(int));
    for (int i = 0; i<N; ++i) {
        ha[i] = i;
    }
    cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice);
    // One block per element, one thread per block.
    add<<<N, 1>>>(da, db);
    cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost);
    max_value[0] = hb[0];
    for (int i = 1; i < N; i++) {
        if (hb[i] > max_value[0]) {
            max_value[0] = hb[i];
            max_value[1] = i;
        }
    }
    cudaFree(da);
    cudaFree(db);
    printf("Max number %d, from value:%d \n", max_value[0], max_value[1]);
    getchar();
    return 0;
}

现在我没有得到一个错误，但hb数组为空。
我的代码有什么问题？
如何分配大数组到设备并获得有效结果？

更新：我已经插入错误检查代码，
我得到的错误是 - >“无效的配置参数”。
更新代码是：

#include <stdio.h> 
#include <time.h> 
#include <math.h> 
#include <thrust/system_error.h> 
#include <thrust/system/cuda/error.h> 
#include <sstream> 
const int N = 70000; 

// Prints the failing file/line plus the CUDA error string and exits with
// status 1 when `error` is not cudaSuccess.
// `error` is evaluated exactly once: it is normally the CUDA call expression
// itself, so expanding it a second time would re-run the call. The
// do/while(0) wrapper makes the macro behave as a single statement.
#define checkCudaErrors(error) do {\
    cudaError_t checkedErr_ = (error);\
    if (checkedErr_ != cudaSuccess) {\
        printf("CUDA Error - %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(checkedErr_));\
        exit(1);\
    }\
} while (0)

// Element-wise doubling: the block at x index `pos` stores 2 * a[pos] in
// b[pos]. Indices at or beyond N are ignored.
__global__
void add(int *a, int *b) {
    const int pos = blockIdx.x;
    if (pos >= N) {
        return;
    }
    b[pos] = a[pos] * 2;
}
// Host buffers with static storage duration (kept off main's stack).
static int ha[N]; // input values 0..N-1
static int hb[N]; // results copied back from the device

// Host driver with error checking: resets the device, doubles ha on the
// device with add(), copies the result into hb and scans it for the maximum.
int main() {
    // max_value[0] = largest value seen so far, max_value[1] = its index;
    // the index starts at 0 so it is defined when hb[0] is the maximum.
    int max_value[2] = {0, 0};

    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    cudaError_t err=cudaDeviceReset();
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    printf("Device count: %d \n", deviceCount);

    for (int i = 0; i<N; ++i) { ha[i] = i; }
    int *da, *db;
    checkCudaErrors(cudaMalloc((void **)&da, N*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&db, N*sizeof(int)));
    checkCudaErrors(cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice));
    add<<<N, 1>>>(da, db); // <--- Invalid configuration error
    // A kernel launch returns nothing, so query the launch status right
    // away, before the results are copied back and used.
    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        getchar();
        exit(-1);
    }
    checkCudaErrors(cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost));
    max_value[0] = hb[0];
    for (int i = 1; i < N; i++) {
        if (hb[i] > max_value[0]) {
            max_value[0] = hb[i];
            max_value[1] = i;
        }
    }
    // Release the device buffers allocated above.
    checkCudaErrors(cudaFree(da));
    checkCudaErrors(cudaFree(db));
    getchar();
    return 0;
}

设备是一块 GeForce GTX 470，我使用以下命令编译：
nvcc -o foo new.cu

来源

2017-07-12 Panos Kal.

'cudaMalloc'和'cudaMemcpy'都返回'cudaError_t'类型的值 - 可能值得先检查一下。 – iehrlich

谢谢@iehrlich我会检查这个 –

也请看看[这个问题](https://stackoverflow.com/questions/34655893/cuda-large-input-arrays)，以及更一般的[搜索结果](https://www.google.ru/search?q=cuda+large+array)，可能会给你一些提示。祝你好运！ – iehrlich

您的设备（GTX 470）是 cc2.0（计算能力 2.0）设备。

“无效配置参数”错误的原因是：cc2.0 设备的一维网格中线程块数量上限为 65535。此信息可在 CUDA 编程指南（“线程块网格的最大 x 维度”一项）中查到，也可以通过运行 CUDA 示例代码（如 deviceQuery）获得。所以，你在这里选择的 N 过大：

add<<<N, 1>>>(da, db); 
    ^

在 cc2.0 设备上解决这个问题的通常办法是创建一个多维的线程块网格，这样可以容纳多得多的线程块。内核启动参数实际上可以是 dim3 变量，用来指定多维的线程块网格或多维的线程块（由线程组成）。

要正确执行此操作，您还需要更改内核代码，以便从可用的多维变量中创建适当的全局唯一线程ID。

下面这个可运行的示例以尽可能少的改动演示了这一思路，在我这里运行结果正确：

$ cat t363.cu 
#include <stdio.h> 
#include <time.h> 
#include <math.h> 
#include <thrust/system_error.h> 
#include <thrust/system/cuda/error.h> 
#include <sstream> 
const int N = 70000; 

// Prints the failing file/line plus the CUDA error string and exits with
// status 1 when `error` is not cudaSuccess.
// `error` is evaluated exactly once: it is normally the CUDA call expression
// itself, so expanding it a second time would re-run the call. The
// do/while(0) wrapper makes the macro behave as a single statement.
#define checkCudaErrors(error) do {\
    cudaError_t checkedErr_ = (error);\
    if (checkedErr_ != cudaSuccess) {\
        printf("CUDA Error - %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(checkedErr_));\
        exit(1);\
    }\
} while (0)

// Doubles each element of `a` into `b` using a 2D grid of blocks.
// The block's (x, y) coordinates are flattened row by row
// (y * gridDim.x + x) into a single element index; indices >= N are skipped,
// so the grid may contain more blocks than there are elements.
__global__
void add(int *a, int *b) {
    const int idx = blockIdx.y * gridDim.x + blockIdx.x;
    if (idx >= N) {
        return;
    }
    b[idx] = a[idx] * 2;
}
// Host buffers with static storage duration (kept off main's stack).
static int ha[N]; // input values 0..N-1
static int hb[N]; // results copied back from the device

// Host driver: doubles ha on the device with add() using a 2D grid of
// single-thread blocks, copies the result into hb and prints the largest
// value together with its index.
int main() {
    // max_value[0] = largest value seen so far, max_value[1] = its index;
    // the index starts at 0 so it is defined when hb[0] is the maximum.
    int max_value[2] = {0, 0};

    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    cudaError_t err=cudaDeviceReset();
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    printf("Device count: %d \n", deviceCount);

    for (int i = 0; i<N; ++i) { ha[i] = i; }
    int *da, *db;
    checkCudaErrors(cudaMalloc((void **)&da, N*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&db, N*sizeof(int)));
    checkCudaErrors(cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice));

    // Split the N blocks over gridRows rows so that each grid dimension
    // stays small. The x extent is rounded up so that every element is
    // covered even when N is not a multiple of gridRows; the kernel's
    // `i < N` guard discards the surplus blocks.
    const int gridRows = 10;
    dim3 mygrid((N + gridRows - 1) / gridRows, gridRows);
    add<<<mygrid, 1>>>(da, db);
    // A kernel launch returns nothing, so query the launch status right
    // away, before the results are copied back and printed.
    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        getchar();
        exit(-1);
    }
    checkCudaErrors(cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost));

    max_value[0] = hb[0];
    for (int i = 1; i < N; i++) {
        if (hb[i] > max_value[0]) {
            max_value[0] = hb[i];
            max_value[1] = i;
        }
    }
    printf("max_value[0] = %d, max_value[1] = %d\n", max_value[0], max_value[1]);

    // Release the device buffers allocated above.
    checkCudaErrors(cudaFree(da));
    checkCudaErrors(cudaFree(db));
    return 0;
}
$ nvcc -arch=sm_20 -o t363 t363.cu 
nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning). 
$ ./t363 
Device count: 4 
max_value[0] = 139998, max_value[1] = 69999 
$

注：

如果您在 cc3.0 或更高版本的设备上运行原来的代码，就不会出现该错误。较新的 CUDA 设备把一维网格的上限提高到了 2^31-1。但如果线程块数量要超过这个值（约 20 亿），仍然必须使用多维网格。

cc2.0 设备在 CUDA 8 中已被标记为弃用，即将发布的 CUDA 9 将移除对它们的支持。

来源

2017-07-13 19:42:54

非常感谢罗伯特，为你的时间和你的真棒回复！ –

无效配置参数：线程块数量超过 16 位范围

回答

相关问题