2011-11-16 39 views
0

当我直接运行我的代码时,它可以完整、正常地执行;但是当我尝试在 Visual Profiler 中运行它时,第一次运行正常,而 Profiler 似乎要把该程序运行七次,第二次运行时就会出现“未指定的启动失败”(unspecified launch failure)。为什么会发生这种情况?我的代码如下所示,我的错误检查告诉我错误发生在 cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost); 这一行(在代码中搜索 memcpy11 很容易找到,出错的就是它上面的那一行)。
Cuda Visual Profiler 未指定的启动失败

我想不出程序为什么会在第二次运行时失败,而第一次却不会;如果我在终端上多次运行它,则完全正常。有人能想到可能是什么原因吗?谢谢!

/*
 * fillin: host-side wrapper around the fillinBoth kernel.
 *
 * Copies the numsteps and siteset arrays of `left` and `rt` to the device,
 * launches fillinBoth over `chars` sites, and copies the kernel's output
 * (which it writes into the right-hand device buffers) back into
 * p->numsteps and p->siteset.
 *
 * p    - node that receives the combined numsteps / siteset arrays
 * left - node supplying the left-hand inputs
 * rt   - node supplying the right-hand inputs
 *
 * `chars` is read from enclosing scope. Errors are reported through
 * checkCUDAError (defined elsewhere; presumably it inspects the last CUDA
 * error and reports it under the given label).
 */
void fillin(node *p, node *left, node *rt)
{
    size_t stepsize;
    size_t sitesize;
    /* Device buffers. lsteps was used without a visible declaration; it is
       declared here alongside its three siblings. */
    steptr lsteps;
    seqptr lsites;
    steptr rsteps;
    seqptr rsites;
    int block_size;
    int n_blocks;

    /* Nothing to compute, and a launch with zero blocks would be rejected
       as an invalid configuration. */
    if (chars <= 0)
        return;

    stepsize = chars * sizeof(long);
    sitesize = chars * sizeof(sitearray);

    /* Left-hand inputs. */
    cudaMalloc((void **) &lsteps, stepsize);
    checkCUDAError("malloc");
    cudaMalloc((void **) &lsites, sitesize);
    checkCUDAError("malloc");
    cudaMemcpy(lsteps, left->numsteps, stepsize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy7");
    cudaMemcpy(lsites, left->siteset, sitesize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy8");

    /* Right-hand inputs; the kernel overwrites these with its results. */
    cudaMalloc((void **) &rsteps, stepsize);
    checkCUDAError("malloc");
    cudaMalloc((void **) &rsites, sitesize);
    checkCUDAError("malloc");
    cudaMemcpy(rsteps, rt->numsteps, stepsize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy9");
    cudaMemcpy(rsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
    checkCUDAError("memcpy");

    /* Launch configuration: one block per site, one thread per block. */
    block_size = 1;
    n_blocks = chars;
    fillinBoth <<<n_blocks, block_size>>> (lsteps, lsites, rsteps, rsites, chars);
    /* A kernel launch returns no status. Check for launch-configuration
       errors right away, then wait for the kernel so that a fault raised
       while it executes is reported here instead of being attributed to
       the device-to-host copies below. */
    checkCUDAError("kernel launch");
    cudaDeviceSynchronize();
    checkCUDAError("kernel execution");

    /* Results live in the right-hand device buffers. */
    cudaMemcpy(p->numsteps, rsteps, stepsize, cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy10");
    cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost);
    checkCUDAError("memcpy11");

    cudaFree(rsites); cudaFree(rsteps);
    cudaFree(lsites); cudaFree(lsteps);
    checkCUDAError("free");
}

}

/*
 * fillinBoth: combines the left and right site sets for one site per thread.
 *
 * For its site index the thread:
 *   1. loads the three words of lsite[idx] and rsite[idx];
 *   2. starts the step count at lsteps[idx] + rsteps[idx];
 *   3. walks six AND/OR combinations of the left and right words; each
 *      leading combination that is zero adds cudaWeight[idx] to the step
 *      count, and from the first non-zero combination on, up to three
 *      consecutive results are stored;
 *   4. for every bit between `ala` and `stop` set in stored word i
 *      (i = 0, 1), ORs cudaTranslate[bit][j - i] into each later word j;
 *   5. writes the step count to rsteps[idx] and the three words to
 *      rsite[idx] (the right-hand buffers double as the output).
 *
 * lsteps, lsite - left-hand inputs (read only)
 * rsteps, rsite - right-hand inputs, overwritten with the results
 * max           - number of sites; threads with idx >= max do nothing
 *
 * Launch layout: 1-D grid, 1-D blocks of any size, covering at least `max`
 * threads. The scratch arrays are per-thread locals, so no shared memory
 * or barrier is needed. With one-thread blocks the index reduces to
 * blockIdx.x.
 */
__global__ void fillinBoth (steptr lsteps, seqptr lsite, steptr rsteps, seqptr rsite, long max){
    boolean counted;
    aas aa;
    long s = 0;
    long i, j, k, n;
    long idx = (long)blockIdx.x * blockDim.x + threadIdx.x;
    /* Per-thread scratch. These were __shared__, which makes every thread of
       a block read and write the same three slots and is only correct when
       a block has exactly one thread. */
    long ls[3];
    long rs[3];
    /* Zero-initialised: fewer than three words are stored below when the
       first non-zero combination appears late (or never), and the remaining
       slots are still read and written back. */
    long qs[3] = {0, 0, 0};

    if (idx >= max)
        return;

    for (i = 0; i < 3; i++) {
        rs[i] = rsite[idx][i];
        ls[i] = lsite[idx][i];
    }
    n = lsteps[idx] + rsteps[idx];
    counted = false;
    k = 0;
    /* Stop as soon as three words have been stored. */
    for (i = 0; i <= 5 && k < 3; i++) {
        switch (i) {

            case 0:
                s = ls[0] & rs[0];
                break;

            case 1:
                s = (ls[0] & rs[1]) | (ls[1] & rs[0]);
                break;

            case 2:
                s = (ls[0] & rs[2]) | (ls[1] & rs[1]) | (ls[2] & rs[0]);
                break;

            case 3:
                s = ls[0] | (ls[1] & rs[2]) | (ls[2] & rs[1]) | rs[0];
                break;

            case 4:
                s = ls[1] | (ls[2] & rs[2]) | rs[1];
                break;

            case 5:
                s = ls[2] | rs[2];
                break;
        }
        if (counted || s != 0) {
            qs[k] = s;
            k++;
            counted = true;
        } else {
            /* Still before the first non-zero combination. */
            n += cudaWeight[idx];
        }
    }
    for (i = 0; i <= 1; i++) {
        for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1)) {
            if (((1L << ((long)aa)) & qs[i]) != 0) {
                for (j = i + 1; j <= 2; j++)
                    qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
            }
        }
    }
    rsteps[idx] = n;
    for (i = 0; i < 3; i++)
        rsite[idx][i] = qs[i];
}

+1

尝试用 cuda-memcheck 运行你的代码,看看它是否报告了越界内存访问。Profiler 每次运行的配置都有所不同;如果代码中存在“潜在”的越界访问,就可能破坏该次分析所特有的芯片资源配置。 – talonmies

+0

我用 cuda-memcheck 运行了它,没有得到任何错误:“========= ERROR SUMMARY: 0 errors”。还有其他可能出问题的地方吗? – Izri

回答

1

尝试在 Profiler 的会话设置中禁用所有计数器。另外,尝试从工作文件夹中删除所有类似“temp_compute_profiler_1_1.csv”的文件(请参阅 Profiler 设置中的“工作文件夹”,默认情况下它与可执行文件所在的位置相同)。

我也遇到过同样的错误(运行在 CUDA 之上的 OpenCL):http://www.khronos.org/message_boards/viewtopic.php?t=4324

相关问题