2017-02-12 137 views
0

我的CUDA内核正在使用推力,按键排序和减少。 当我使用阵列超过460它开始显示不正确的结果。CUDA推力阵列长度

任何人都可以解释这种行为?或者它与我的机器有关?

尽管尺寸很大,排序仍然正常,但是,REDUCE_BY_KEY运行不正常。并返回不正确的结果。我有4个数组 1)输入键被定义为wholeSequenceArray。 2)在内核中定义的初始值为1的输入值。 3)输出键用于保存输入键的不同值 4)输出值用于保存对应于相同输入的输入值之和关键。

有关reduce_by_key更多介绍请访问此页: https://thrust.github.io/doc/group__reductions.html#gad5623f203f9b3fdcab72481c3913f0e0

这里是我的代码:

#include <cstdlib> 
#include <stdlib.h> 
#include <stdio.h> 
#include <iostream> 
#include <vector> 
#include <fstream> 
#include <string> 
#include <cuda.h> 
#include <cuda_runtime.h> 
#include <thrust/device_vector.h> 
#include <thrust/host_vector.h> 
#include <thrust/sort.h> 
#include <thrust/reduce.h> 
#include <thrust/execution_policy.h> 

using namespace std; 
#define size 461 

__global__ void calculateOccurances(unsigned int *input_keys, 
      unsigned int *output_Values) { 
    int tid = threadIdx.x; 

    const int N = size; 
    __shared__ unsigned int input_values[N]; 

    unsigned int outputKeys[N]; 

    int i = tid; 
    while (i < N) { 
      if (tid < N) { 
        input_values[tid] = 1; 
      } 
      i += blockDim.x; 
    } 
    __syncthreads(); 

    thrust::sort(thrust::device, input_keys, input_keys + N); 

    thrust::reduce_by_key(thrust::device, input_keys, input_keys + N, 
        input_values, outputKeys, output_Values); 

    if (tid == 0) { 
      for (int i = 0; i < N; ++i) { 
        printf("%d,", output_Values[i]); 
      } 
    } 

} 

int main(int argc, char** argv) { 

    unsigned int wholeSequenceArray[size] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
        11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 
        9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 
        7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 
        6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 
        5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 
        4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 
        3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 
        2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
        20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 
        19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 
        18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 
        17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
        16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
        15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
        14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
        13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
        12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
        11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 
        9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,1 }; 

    cout << "wholeSequenceArray:" << endl; 
    for (int i = 0; i < size; i++) { 
      cout << wholeSequenceArray[i] << ","; 
    } 

    cout << "\nStart C++ Array New" << endl; 
    cout << "Size of Input:" << size << endl; 

    cudaDeviceProp prop; 
    cudaGetDeviceProperties(&prop, 0); 
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock); 

    unsigned int counts[size]; 
    unsigned int *d_whole; 
    unsigned int *d_counts; 

    cudaMalloc((void**) &d_whole, size * sizeof(unsigned int)); 
    cudaMalloc((void**) &d_counts, size * sizeof(unsigned int)); 

    cudaMemcpy(d_whole, wholeSequenceArray, size * sizeof(unsigned int), 
        cudaMemcpyHostToDevice); 

    calculateOccurances<<<1, size>>>(d_whole, d_counts); 

    cudaMemcpy(counts, d_counts, size * sizeof(unsigned int), 
        cudaMemcpyDeviceToHost); 

    cout << endl << "Counts" << endl << endl; 
    for (int i = 0; i < size; ++i) { 
      cout << counts[i] << ","; 
    } 
    cout << endl; 

    cudaFree(d_whole); 
} 
+0

当[检查CUDA错误]时你会得到任何错误(http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using -THE-CUDA的运行时API)? –

+0

不,它运行平稳,我删除了cuda错误代码只是为了使代码更小:) –

+1

我不认为你明白如何在设备代码中使用'thrust'工作。你有461个线程,每个线程都是自己做的,**分开**在相同的地方对相同的数据进行排序。这可能不是一个有用的算法。这些461个线程将在彼此移动数据时进行排序。我不清楚你在这里需要一个CUDA内核。您所描述的算法可以通过以普通方式(即从主机代码)使用推力来完成。该工作仍将在设备上完成。 –

回答

1

当你在内核调用一个推力算法,即推力算法在派遣整个来自每个CUDA线程。因此,您的代码正在同一地点对同一数据(每个CUDA内核线程一次)执行461次排序操作。这意味着每个线程在分类操作过程中移动数据时都会相互移动。

如果您只是想使用您在问题中概述的方法来统计数字的出现次数(有效直方图),并且您想使用推力,则根本不需要编写CUDA内核。

如果您确实想从CUDA内核中正确执行此操作,那么您需要将推力操作(sort和reduce_by_key)限制为仅从单个线程执行操作。 (甚至这种方法将被限制在一个块中)。

我真的不认为第二种方法(CUDA内核)有意义,但为了完整性,我修改了代码以包含每种方法的正确示例。需要注意的是,一旦你进行还原,不再有在打印出的每个阵列中的所有461项的任何一点,所以我已经限制了打印到第一25个条目,每个阵列其中为了清楚:

$ cat t91.cu 
#include <cstdlib> 
#include <stdlib.h> 
#include <stdio.h> 
#include <iostream> 
#include <vector> 
#include <fstream> 
#include <string> 
#include <cuda.h> 
#include <cuda_runtime.h> 
#include <thrust/device_vector.h> 
#include <thrust/host_vector.h> 
#include <thrust/sort.h> 
#include <thrust/reduce.h> 
#include <thrust/execution_policy.h> 
#include <thrust/iterator/constant_iterator.h> 

using namespace std; 
#define size 461 

__global__ void calculateOccurances(unsigned int *input_keys, 
      unsigned int *output_Values) { 
    int tid = threadIdx.x; 

    const int N = size; 
    __shared__ unsigned int input_values[N]; 

    unsigned int outputKeys[N]; 

    int i = tid; 
    while (i < N) { 
      if (tid < N) { 
        input_values[tid] = 1; 
      } 
      i += blockDim.x; 
    } 
    __syncthreads(); 
    if (tid == 0){ 
     thrust::sort(thrust::device, input_keys, input_keys + N); 

     thrust::reduce_by_key(thrust::device, input_keys, input_keys + N, 
        input_values, outputKeys, output_Values); 
     } 

    if (tid == 0) { 
    printf("from kernel:\n"); 
      for (int i = 0; i < 25; ++i) { 
        printf("%d,", output_Values[i]); 
      } 
    } 

} 

int main(int argc, char** argv) { 

    unsigned int wholeSequenceArray[size] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
        11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 
        9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 
        7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 
        6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 
        5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 
        4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 
        3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 
        2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
        20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 
        19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 
        18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 
        17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 
        16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 
        15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 
        14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
        13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
        12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
        11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 
        10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 
        9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 
        8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,1 }; 

    cout << "wholeSequenceArray:" << endl; 
    for (int i = 0; i < size; i++) { 
      cout << wholeSequenceArray[i] << ","; 
    } 

    cout << "\nStart C++ Array New" << endl; 
    cout << "Size of Input:" << size << endl; 

    cudaDeviceProp prop; 
    cudaGetDeviceProperties(&prop, 0); 
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock); 

//just using thrust 

    thrust::device_vector<int> d_seq(wholeSequenceArray, wholeSequenceArray+size); 
    thrust::device_vector<int> d_val_out(size); 
    thrust::device_vector<int> d_key_out(size); 

    thrust::sort(d_seq.begin(), d_seq.end()); 
    int rsize = thrust::get<0>(thrust::reduce_by_key(d_seq.begin(), d_seq.end(), thrust::constant_iterator<int>(1), d_key_out.begin(), d_val_out.begin())) - d_key_out.begin(); 
    std::cout << "rsize:" << rsize << std::endl; 
    std::cout << "Thrust keys:" << std::endl; 
    thrust::copy_n(d_key_out.begin(), rsize, std::ostream_iterator<int>(std::cout, ",")); 
    std::cout << std::endl << "Thrust vals:" << std::endl; 
    thrust::copy_n(d_val_out.begin(), rsize, std::ostream_iterator<int>(std::cout, ",")); 
    std::cout << std::endl; 


// in a cuda kernel 


    unsigned int counts[size]; 
    unsigned int *d_whole; 
    unsigned int *d_counts; 

    cudaMalloc((void**) &d_whole, size * sizeof(unsigned int)); 
    cudaMalloc((void**) &d_counts, size * sizeof(unsigned int)); 

    cudaMemcpy(d_whole, wholeSequenceArray, size * sizeof(unsigned int), 
        cudaMemcpyHostToDevice); 

    calculateOccurances<<<1, size>>>(d_whole, d_counts); 

    cudaMemcpy(counts, d_counts, size * sizeof(unsigned int), 
        cudaMemcpyDeviceToHost); 

    std::cout << "from Host:" << std::endl; 
    cout << endl << "Counts" << endl << endl; 
    for (int i = 0; i < 25; ++i) { 
      cout << counts[i] << ","; 
    } 
    cout << endl; 

    cudaFree(d_whole); 
} 
$ nvcc -arch=sm_61 -o t91 t91.cu 
$ ./t91 
wholeSequenceArray: 
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1, 
Start C++ Array New 
Size of Input:461 
Max threads per block: 1024 
rsize:20 
Thrust keys: 
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20, 
Thrust vals: 
24,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23, 
from kernel: 
24,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,526324,526325,526325,526327,526329,from Host: 

Counts 

24,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,23,526324,526325,526325,526327,526329, 
$ 

注意事项:

  1. 我在推力示例中包含了一个方法,因此您可以准确知道输出数组的大小。

  2. 推力方法应该独立于size参数工作正常 - 受GPU的限制(如内存大小)的限制。 CUDA内核方法实际上只是从单个线程执行推力代码,因此运行超过1个块并不明智。

  3. 您可能希望参考this question/answer以获取有关使用CUDA内核推力的更多讨论。