CUDA推力阵列长度

我的CUDA内核正在使用推力，按键排序和减少。当我使用阵列超过460它开始显示不正确的结果。CUDA推力阵列长度

任何人都可以解释这种行为？或者它与我的机器有关？

尽管尺寸很大，排序仍然正常，但是，REDUCE_BY_KEY运行不正常。并返回不正确的结果。我有4个数组 1）输入键被定义为wholeSequenceArray。 2）在内核中定义的初始值为1的输入值。 3）输出键用于保存输入键的不同值 4）输出值用于保存对应于相同输入的输入值之和关键。

有关reduce_by_key更多介绍请访问此页： https://thrust.github.io/doc/group__reductions.html#gad5623f203f9b3fdcab72481c3913f0e0

这里是我的代码：

#include <cstdlib> 
#include <stdlib.h> 
#include <stdio.h> 
#include <iostream> 
#include <vector> 
#include <fstream> 
#include <string> 
#include <cuda.h> 
#include <cuda_runtime.h> 
#include <thrust/device_vector.h> 
#include <thrust/host_vector.h> 
#include <thrust/sort.h> 
#include <thrust/reduce.h> 
#include <thrust/execution_policy.h> 

using namespace std; 
#define size 461 

__global__ void calculateOccurances(unsigned int *input_keys, 
      unsigned int *output_Values) { 
    int tid = threadIdx.x; 

    const int N = size; 
    __shared__ unsigned int input_values[N]; 

    unsigned int outputKeys[N]; 

    int i = tid; 
    while (i < N) { 
      if (tid < N) { 
        input_values[tid] = 1; 
      } 
      i += blockDim.x; 
    } 
    __syncthreads(); 

    thrust::sort(thrust::device, input_keys, input_keys + N); 

    thrust::reduce_by_key(thrust::device, input_keys, input_keys + N, 
        input_values, outputKeys, output_Values); 

    if (tid == 0) { 
      for (int i = 0; i < N; ++i) { 
        printf("%d,", output_Values[i]); 
      } 
    } 

} 

int main(int argc, char** argv) { 

    unsigned int wholeSequenceArray[size] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
        11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 
        9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 
        7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 
        6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 
        5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 
        4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 
        3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 
        2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
        20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 
        19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 
        18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 
        17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
        16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
        15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
        14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
        13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
        12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
        11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 
        9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,1 }; 

    cout << "wholeSequenceArray:" << endl; 
    for (int i = 0; i < size; i++) { 
      cout << wholeSequenceArray[i] << ","; 
    } 

    cout << "\nStart C++ Array New" << endl; 
    cout << "Size of Input:" << size << endl; 

    cudaDeviceProp prop; 
    cudaGetDeviceProperties(&prop, 0); 
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock); 

    unsigned int counts[size]; 
    unsigned int *d_whole; 
    unsigned int *d_counts; 

    cudaMalloc((void**) &d_whole, size * sizeof(unsigned int)); 
    cudaMalloc((void**) &d_counts, size * sizeof(unsigned int)); 

    cudaMemcpy(d_whole, wholeSequenceArray, size * sizeof(unsigned int), 
        cudaMemcpyHostToDevice); 

    calculateOccurances<<<1, size>>>(d_whole, d_counts); 

    cudaMemcpy(counts, d_counts, size * sizeof(unsigned int), 
        cudaMemcpyDeviceToHost); 

    cout << endl << "Counts" << endl << endl; 
    for (int i = 0; i < size; ++i) { 
      cout << counts[i] << ","; 
    } 
    cout << endl; 

    cudaFree(d_whole); 
}

来源

2017-02-12 Emad R

当[检查CUDA错误]时你会得到任何错误（http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using -THE-CUDA的运行时API）？ –

不，它运行平稳，我删除了cuda错误代码只是为了使代码更小:) –

我不认为你明白如何在设备代码中使用'thrust'工作。你有461个线程，每个线程都是自己做的，**分开**在相同的地方对相同的数据进行排序。这可能不是一个有用的算法。这些461个线程将在彼此移动数据时进行排序。我不清楚你在这里需要一个CUDA内核。您所描述的算法可以通过以普通方式（即从主机代码）使用推力来完成。该工作仍将在设备上完成。 –

当你在内核调用一个推力算法，即推力算法在派遣整个来自每个CUDA线程。因此，您的代码正在同一地点对同一数据（每个CUDA内核线程一次）执行461次排序操作。这意味着每个线程在分类操作过程中移动数据时都会相互移动。

如果您只是想使用您在问题中概述的方法来统计数字的出现次数（有效直方图），并且您想使用推力，则根本不需要编写CUDA内核。

如果您确实想从CUDA内核中正确执行此操作，那么您需要将推力操作（sort和reduce_by_key）限制为仅从单个线程执行操作。（甚至这种方法将被限制在一个块中）。

我真的不认为第二种方法（CUDA内核）有意义，但为了完整性，我修改了代码以包含每种方法的正确示例。需要注意的是，一旦你进行还原，不再有在打印出的每个阵列中的所有461项的任何一点，所以我已经限制了打印到第一25个条目，每个阵列其中为了清楚：

$ cat t91.cu 
#include <cstdlib> 
#include <stdlib.h> 
#include <stdio.h> 
#include <iostream> 
#include <vector> 
#include <fstream> 
#include <string> 
#include <cuda.h> 
#include <cuda_runtime.h> 
#include <thrust/device_vector.h> 
#include <thrust/host_vector.h> 
#include <thrust/sort.h> 
#include <thrust/reduce.h> 
#include <thrust/execution_policy.h> 
#include <thrust/iterator/constant_iterator.h> 

using namespace std; 
#define size 461 

__global__ void calculateOccurances(unsigned int *input_keys, 
      unsigned int *output_Values) { 
    int tid = threadIdx.x; 

    const int N = size; 
    __shared__ unsigned int input_values[N]; 

    unsigned int outputKeys[N]; 

    int i = tid; 
    while (i < N) { 
      if (tid < N) { 
        input_values[tid] = 1; 
      } 
      i += blockDim.x; 
    } 
    __syncthreads(); 
    if (tid == 0){ 
     thrust::sort(thrust::device, input_keys, input_keys + N); 

     thrust::reduce_by_key(thrust::device, input_keys, input_keys + N, 
        input_values, outputKeys, output_Values); 
     } 

    if (tid == 0) { 
    printf("from kernel:\n"); 
      for (int i = 0; i < 25; ++i) { 
        printf("%d,", output_Values[i]); 
      } 
    } 

} 

int main(int argc, char** argv) { 

    unsigned int wholeSequenceArray[size] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
        11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 
        9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 
        7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 
        6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 
        5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 
        4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 
        3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 
        2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
        20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 
        19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 
        18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 
        17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
        16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
        15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
        14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
        13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
        12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
        11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 
        9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,1 }; 

    cout << "wholeSequenceArray:" << endl; 
    for (int i = 0; i < size; i++) { 
      cout << wholeSequenceArray[i] << ","; 
    } 

    cout << "\nStart C++ Array New" << endl; 
    cout << "Size of Input:" << size << endl; 

    cudaDeviceProp prop; 
    cudaGetDeviceProperties(&prop, 0); 
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock); 

//just using thrust 

    thrust::device_vector<int> d_seq(wholeSequenceArray, wholeSequenceArray+size); 
    thrust::device_vector<int> d_val_out(size); 
    thrust::device_vector<int> d_key_out(size); 

    thrust::sort(d_seq.begin(), d_seq.end()); 
    int rsize = thrust::get<0>(thrust::reduce_by_key(d_seq.begin(), d_seq.end(), thrust::constant_iterator<int>(1), d_key_out.begin(), d_val_out.begin())) - d_key_out.begin(); 
    std::cout << "rsize:" << rsize << std::endl; 
    std::cout << "Thrust keys:" << std::endl; 
    thrust::copy_n(d_key_out.begin(), rsize, std::ostream_iterator<int>(std::cout, ",")); 
    std::cout << std::endl << "Thrust vals:" << std::endl; 
    thrust::copy_n(d_val_out.begin(), rsize, std::ostream_iterator<int>(std::cout, ",")); 
    std::cout << std::endl; 


// in a cuda kernel 


    unsigned int counts[size]; 
    unsigned int *d_whole; 
    unsigned int *d_counts; 

    cudaMalloc((void**) &d_whole, size * sizeof(unsigned int)); 
    cudaMalloc((void**) &d_counts, size * sizeof(unsigned int)); 

    cudaMemcpy(d_whole, wholeSequenceArray, size * sizeof(unsigned int), 
        cudaMemcpyHostToDevice); 

    calculateOccurances<<<1, size>>>(d_whole, d_counts); 

    cudaMemcpy(counts, d_counts, size * sizeof(unsigned int), 
        cudaMemcpyDeviceToHost); 

    std::cout << "from Host:" << std::endl; 
    cout << endl << "Counts" << endl << endl; 
    for (int i = 0; i < 25; ++i) { 
      cout << counts[i] << ","; 
    } 
    cout << endl; 

    cudaFree(d_whole); 
} 
$ nvcc -arch=sm_61 -o t91 t91.cu 
$ ./t91 
wholeSequenceArray: 
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1, 
Start C++ Array New 
Size of Input:461 
Max threads per block: 1024 
rsize:20 
Thrust keys: 
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20, 
Thrust vals: 
24,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23, 
from kernel: 
24,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,526324,526325,526325,526327,526329,from Host: 

Counts 

24,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,526324,526325,526325,526327,526329, 
$

注意事项：

我在推力示例中包含了一个方法，因此您可以准确知道输出数组的大小。
推力方法应该独立于size参数工作正常 - 受GPU的限制（如内存大小）的限制。 CUDA内核方法实际上只是从单个线程执行推力代码，因此运行超过1个块并不明智。
您可能希望参考this question/answer以获取有关使用CUDA内核推力的更多讨论。

来源

2017-02-13 02:22:23

CUDA推力阵列长度

回答

相关问题