CUDA - 如何让内核中的父线程等待其子内核执行完成

我想使用 CUDA 动态并行（在内核中递归启动内核，需要 sm_35 及以上的设备）实现一个非常简单的归并排序，但我找不到办法让父线程并发启动它的两个子内核并等待它们计算完成，因为 cudaEventSynchronize() 和 cudaStreamSynchronize() 只能在主机端调用。__syncthreads() 也达不到所需的效果，因为父线程的下一行代码只应在子内核完成全部计算之后才执行。

/*
 * Recursive merge sort of data[begin, end) using CUDA dynamic parallelism.
 *
 * Parameters:
 *   data    - array to sort in place (device memory).
 *   dataAux - scratch array indexed with the same indices as `data`; the
 *             range [begin, end) is overwritten during the merge.
 *   begin   - first index of the range to sort (inclusive).
 *   end     - one past the last index of the range to sort (exclusive).
 *   depth   - current recursion depth; pass 0 (or the caller's depth) at the
 *             top-level launch.
 *
 * Launch layout: the kernel does not use threadIdx/blockIdx, so every thread
 * of the grid would perform the same work on the same range. Launch it with a
 * single thread (<<<1, 1>>>), as the recursive launches below do.
 *
 * Requirements:
 *   - Compute capability 3.5+ and relocatable device code
 *     (nvcc -rdc=true ... -lcudadevrt).
 *   - Device-side cudaDeviceSynchronize() is the legacy dynamic-parallelism
 *     (CDP1) way for a parent to wait for its children. It is deprecated
 *     since CUDA 11.6; with CUDA 12+ it needs -DCUDA_FORCE_CDP1_IF_SUPPORTED
 *     and is unavailable when compiling for compute_90+.
 *   - NOTE(review): the host should raise cudaLimitDevRuntimeSyncDepth
 *     (cudaDeviceSetLimit) to cover MAX_DEPTH, otherwise the device-side
 *     synchronize fails at deeper levels -- confirm against the host code.
 */
__global__ void simple_mergesort(int* data,int *dataAux,int begin,int end, int depth){
    const int n = end - begin;

    // Empty and single-element ranges are already sorted.
    if (n < 2) {
        return;
    }

    // If we're too deep or there are few elements left, sort this range
    // sequentially with the selection_sort helper instead of recursing.
    if (depth >= MAX_DEPTH || n <= INSERTION_SORT) {
        selection_sort(data, begin, end);
        return;
    }

    // Same value as (begin + end) / 2 for 0 <= begin <= end, but cannot overflow.
    const int middle = begin + n / 2;

    // Streams created in device code must use cudaStreamNonBlocking.
    // Two distinct streams let the two halves be sorted concurrently.
    cudaStream_t leftStream, rightStream;
    if (cudaStreamCreateWithFlags(&leftStream, cudaStreamNonBlocking) != cudaSuccess) {
        // Nothing has been launched yet, so a sequential sort is safe.
        selection_sort(data, begin, end);
        return;
    }
    if (cudaStreamCreateWithFlags(&rightStream, cudaStreamNonBlocking) != cudaSuccess) {
        cudaStreamDestroy(leftStream);
        selection_sort(data, begin, end);
        return;
    }

    // Launches a new block to sort the left part.
    simple_mergesort<<< 1, 1, 0, leftStream >>>(data, dataAux, begin, middle, depth + 1);
    const cudaError_t leftErr = cudaGetLastError();

    // Launches a new block to sort the right part.
    simple_mergesort<<< 1, 1, 0, rightStream >>>(data, dataAux, middle, end, depth + 1);
    const cudaError_t rightErr = cudaGetLastError();

    // Waits until both children (and their descendants) have finished.
    // cudaStreamSynchronize()/cudaEventSynchronize() are host-only; in device
    // code the parent waits with cudaDeviceSynchronize(), after which the
    // children's writes to global memory are visible to this thread.
    const cudaError_t syncErr = cudaDeviceSynchronize();

    cudaStreamDestroy(leftStream);
    cudaStreamDestroy(rightStream);

    if (syncErr != cudaSuccess) {
        // Children may still be running; touching the data now would race.
        return;
    }
    if (leftErr != cudaSuccess || rightErr != cudaSuccess) {
        // A child could not be launched (e.g. launch limits exhausted), so at
        // least one half is unsorted: fall back to sorting the whole range here.
        selection_sort(data, begin, end);
        return;
    }

    // Merge the two sorted halves into the scratch buffer. Taking from the
    // left half on ties keeps the merge stable.
    int left = begin;
    int right = middle;
    for (int index = begin; index < end; index++) {
        if (left < middle && (right >= end || data[left] <= data[right])) {
            dataAux[index] = data[left];
            left++;
        } else {
            dataAux[index] = data[right];
            right++;
        }
    }

    // Copy the merged range back so the caller finds the result in `data`.
    for (int index = begin; index < end; index++) {
        data[index] = dataAux[index];
    }
}

我应该如何修改我的代码，才能达到预期的效果？

感谢您的阅读。

来源

2014-12-01 Eugênio Fonseca