2013-07-02 21 views
-2

我有一个CUDA应用程序,需要把数组从主机内存复制到常量内存(标题:从主机内存复制数组到常量内存)。复制过程没有报告任何错误,但我没有得到应该被复制的值:调试程序时,常量内存中的值始终是0。我的代码如下:

这些变量现在定义在一个单独的 header.h 文件中:

#include <windows.h> 
#include <dos.h> 
#include <stdio.h> 
#include <conio.h> 

#include <cuda.h> 
#include <cuda_runtime.h> 
#include <cutil.h> 
#include <curand.h> 
#include <curand_kernel.h> 


// Side length of the square environment grid: the host code allocates
// env_end*env_end cells for the main, index and pheromone matrices.
#define env_end 48 
// Heuristic tables in constant memory, two floats per grid row
// (entries 2*row and 2*row+1). main() fills them with cudaMemcpyToSymbol
// and AGENT_POSITION_FUNC reads them.
__constant__ float dev_h_top[2*env_end]; 
__constant__ float dev_h_bot[2*env_end]; 

// 8-entry row/column tables for the top agents; main() copies its
// row_top/col_top arrays into them with cudaMemcpyToSymbol.
__constant__ int dev_row_top[8]; 
__constant__ int dev_col_top[8]; 

// Same tables for the bottom agents (filled from row_bot/col_bot).
__constant__ int dev_row_bot[8]; 
__constant__ int dev_col_bot[8]; 

// Host routine that places the agents and initialises the pheromone matrices.
void INIT_AG_PLACEMENT_FUNC(int *,int ,int ,int,double *,double *,int,int *,int *,int *, int *); 
// Kernel that records, per agent, the surrounding cells and the weighted
// pheromone values of those cells.
__global__ void AGENT_POSITION_FUNC(int *,double *,double *,int *,int *,int *,int *,int *,int *,double *, double *); 

/*
 * Prints a diagnostic when a CUDA runtime call did not return cudaSuccess.
 *
 * status: value returned by the CUDA runtime call.
 * what:   short label of the attempted operation, included in the message.
 * Returns true when status is cudaSuccess, false otherwise.
 */
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    printf("CUDA Error (%s): %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Allocates `bytes` of device memory into *dev and copies `host` into it.
 * Returns false (after printing a diagnostic) if either step fails.
 */
static bool devUpload(void **dev, const void *host, size_t bytes, const char *what)
{
    return cudaOk(cudaMalloc(dev, bytes), what)
        && cudaOk(cudaMemcpy(*dev, host, bytes, cudaMemcpyHostToDevice), what);
}

/*
 * Allocates `bytes` of device memory into *dev and clears it to zero.
 * Returns false (after printing a diagnostic) if either step fails.
 */
static bool devZeroed(void **dev, size_t bytes, const char *what)
{
    return cudaOk(cudaMalloc(dev, bytes), what)
        && cudaOk(cudaMemset(*dev, 0, bytes), what);
}

/*
 * Host driver.
 *
 * Builds the environment on the host (INIT_AG_PLACEMENT_FUNC), fills the two
 * heuristic tables, uploads everything to the device (global memory for the
 * matrices, constant memory for the heuristic and offset tables), launches
 * AGENT_POSITION_FUNC once and releases all resources.
 *
 * Every host allocation and CUDA call is checked; after the first failure
 * the remaining steps are skipped, cleanup still runs, and the process
 * returns 1. Returns 0 on success.
 */
int main(int argc,char *argv[])
{
    (void)argc;
    (void)argv;

    const int rows = 16;             // agent rows in the top band and in the bottom band
    const int del_t = 320;           // top agents removed at random during placement
    const int del_b = 320;           // bottom agents removed at random during placement
    const float init_p_val = 200.0;  // initial pheromone value

    const size_t cells = (size_t)env_end*env_end;
    const size_t n_top = (size_t)((rows*env_end)-del_t);   // remaining top agents
    const size_t n_bot = (size_t)((rows*env_end)-del_b);   // remaining bottom agents
    const size_t heur_count = 2*env_end;                   // two floats per grid row

    int row_top[8] = {1,1,1,0,0,-1,-1,-1}, col_top[8] = {-1,0,1,-1,1,-1,0,1};
    int row_bot[8] = {-1,-1,-1,0,0,1,1,1}, col_bot[8] = {-1,0,1,-1,1,-1,0,1};

    // Host buffers. calloc replaces malloc+memset; the pheromone matrices are
    // fully written by INIT_AG_PLACEMENT_FUNC and need no clearing.
    int *mat = (int *)calloc(cells,sizeof(int));
    int *mat_ind_top = (int *)calloc(cells,sizeof(int));
    int *mat_ind_bot = (int *)calloc(cells,sizeof(int));
    double *p_top = (double *)malloc(cells*sizeof(double));
    double *p_bot = (double *)malloc(cells*sizeof(double));
    int *top_ag_prop = (int *)calloc(8*n_top,sizeof(int));
    int *bot_ag_prop = (int *)calloc(8*n_bot,sizeof(int));
    double *top_tour_len = (double *)calloc(n_top,sizeof(double));
    double *bot_tour_len = (double *)calloc(n_bot,sizeof(double));
    float *h_mat_top = (float *)calloc(heur_count,sizeof(float));
    float *h_mat_bot = (float *)calloc(heur_count,sizeof(float));

    // Device buffers start as NULL so the unconditional cleanup below is safe.
    int *dev_mat = NULL, *dev_top_ag_ind_mat = NULL, *dev_bot_ag_ind_mat = NULL;
    int *dev_top_ag_prop = NULL, *dev_bot_ag_prop = NULL;
    int *dev_top_ag_srd = NULL, *dev_bot_ag_srd = NULL;
    double *dev_top_ag_prob = NULL, *dev_bot_ag_prob = NULL;
    double *dev_p_top = NULL, *dev_p_bot = NULL;
    curandState *state_t = NULL, *state_b = NULL;

    bool ok = mat && mat_ind_top && mat_ind_bot && p_top && p_bot &&
              top_ag_prop && bot_ag_prop && top_tour_len && bot_tour_len &&
              h_mat_top && h_mat_bot;
    if (!ok)
    {
        printf("Error in allocating host memory!!\n");
    }

    if (ok)
    {
        INIT_AG_PLACEMENT_FUNC(mat, rows, del_t, del_b, p_top, p_bot, init_p_val,
            top_ag_prop, bot_ag_prop, mat_ind_top, mat_ind_bot);

        for (int i=0;i<env_end;i++)
        {
            // Top table: distances from row i to the far end of the environment.
            h_mat_top[i*2] = sqrt(double(((env_end-i)*(env_end-i)) +1));
            h_mat_top[i*2+1] = (env_end-i);
            printf("%f\t%f\n",h_mat_top[i*2] ,h_mat_top[i*2+1]);

            // Bottom table: distances from row i to row 0.
            h_mat_bot[i*2] = sqrt(double((i*i)+1)) ;
            h_mat_bot[i*2+1] = double(i);
        }
    }

    // Buffers shared by host and device: allocate and upload.
    ok = ok && devUpload((void **)&dev_mat,mat,cells*sizeof(int),"dev_mat");
    ok = ok && devUpload((void **)&dev_top_ag_ind_mat,mat_ind_top,cells*sizeof(int),"dev_top_ag_ind_mat");
    ok = ok && devUpload((void **)&dev_bot_ag_ind_mat,mat_ind_bot,cells*sizeof(int),"dev_bot_ag_ind_mat");
    ok = ok && devUpload((void **)&dev_p_top,p_top,cells*sizeof(double),"dev_p_top");
    ok = ok && devUpload((void **)&dev_p_bot,p_bot,cells*sizeof(double),"dev_p_bot");
    ok = ok && devUpload((void **)&dev_top_ag_prop,top_ag_prop,8*n_top*sizeof(int),"dev_top_ag_prop");
    ok = ok && devUpload((void **)&dev_bot_ag_prop,bot_ag_prop,8*n_bot*sizeof(int),"dev_bot_ag_prop");

    // Device-only buffers: allocate and clear.
    ok = ok && devZeroed((void **)&dev_top_ag_srd,8*n_top*sizeof(int),"dev_top_ag_srd");
    ok = ok && devZeroed((void **)&dev_bot_ag_srd,8*n_bot*sizeof(int),"dev_bot_ag_srd");
    ok = ok && devZeroed((void **)&dev_top_ag_prob,8*n_top*sizeof(double),"dev_top_ag_prob");
    ok = ok && devZeroed((void **)&dev_bot_ag_prob,8*n_bot*sizeof(double),"dev_bot_ag_prob");

    // Random-number generator states for the top and bottom agents.
    ok = ok && cudaOk(cudaMalloc((void **)&state_t,8*n_top*sizeof(curandState)),"state_t");
    ok = ok && cudaOk(cudaMalloc((void **)&state_b,8*n_bot*sizeof(curandState)),"state_b");

    // Constant-memory tables. The symbols must be defined in the same
    // translation unit as this code unless relocatable device code is used.
    ok = ok && cudaOk(cudaMemcpyToSymbol(dev_h_top,h_mat_top,heur_count*sizeof(float)),"copy to dev_h_top");
    ok = ok && cudaOk(cudaMemcpyToSymbol(dev_h_bot,h_mat_bot,heur_count*sizeof(float)),"copy to dev_h_bot");
    ok = ok && cudaOk(cudaMemcpyToSymbol(dev_row_top,row_top,8*sizeof(int)),"copy to dev_row_top");
    ok = ok && cudaOk(cudaMemcpyToSymbol(dev_col_top,col_top,8*sizeof(int)),"copy to dev_col_top");
    ok = ok && cudaOk(cudaMemcpyToSymbol(dev_row_bot,row_bot,8*sizeof(int)),"copy to dev_row_bot");
    ok = ok && cudaOk(cudaMemcpyToSymbol(dev_col_bot,col_bot,8*sizeof(int)),"copy to dev_col_bot");

    if (ok)
    {
        // The kernel uses fixed 16x16 tiles and has no bounds guard, so the
        // grid must cover the env_end x env_end environment exactly.
        dim3 gridDim_1(env_end/16,env_end/16,1);
        dim3 blockDim_1(16,16,1);

        AGENT_POSITION_FUNC<<<gridDim_1,blockDim_1>>>(dev_mat, dev_p_top, dev_p_bot, dev_top_ag_prop,dev_bot_ag_prop, dev_top_ag_srd, dev_bot_ag_srd,
                                 dev_top_ag_ind_mat, dev_bot_ag_ind_mat,dev_top_ag_prob, dev_bot_ag_prob);
        // Launch-configuration errors show up here; faults raised while the
        // kernel runs are returned by the synchronisation call.
        ok = cudaOk(cudaGetLastError(),"AGENT_POSITION_FUNC launch");
        ok = ok && cudaOk(cudaDeviceSynchronize(),"AGENT_POSITION_FUNC execution");
    }

    cudaFree(dev_mat);
    cudaFree(dev_top_ag_ind_mat);
    cudaFree(dev_bot_ag_ind_mat);
    cudaFree(dev_p_top);
    cudaFree(dev_p_bot);
    cudaFree(dev_top_ag_prop);
    cudaFree(dev_bot_ag_prop);
    cudaFree(dev_top_ag_srd);
    cudaFree(dev_bot_ag_srd);
    cudaFree(dev_top_ag_prob);
    cudaFree(dev_bot_ag_prob);
    cudaFree(state_t);
    cudaFree(state_b);

    free(top_ag_prop);
    free(bot_ag_prop);
    free(h_mat_top);
    free(h_mat_bot);
    free(top_tour_len);
    free(bot_tour_len);
    free(p_top);
    free(p_bot);
    free(mat_ind_top);
    free(mat_ind_bot);
    free(mat);

    return ok ? 0 : 1;
}


/*
 * Places the agents on the host-side environment and initialises the
 * pheromone matrices.
 *
 * mat          env_end x env_end grid; the first `rows` rows are set to 1
 *              (top agents), the last `rows` rows to 2 (bottom agents).
 *              Cells outside the two bands are left as passed in.
 * rows         number of grid rows in each agent band.
 * del_t/del_b  number of top/bottom agents removed at random (cell reset to 0).
 *              Values larger than the band size are clamped so the removal
 *              loops always terminate.
 * p_top/p_bot  env_end x env_end pheromone matrices, every cell set to init_p_val.
 * top_ag_prop/bot_ag_prop
 *              8 ints per remaining agent:
 *              id | index | row | col | target col | future row | future col | empty cell
 * mat_ind_top/mat_ind_bot
 *              for every agent cell, the index of that agent in its property table.
 */
void INIT_AG_PLACEMENT_FUNC(int *mat,int rows,int del_t,int del_b,double *p_top,double *p_bot,int init_p_val, 
    int *top_ag_prop,int *bot_ag_prop, int *mat_ind_top, int *mat_ind_bot) 
{
    const int band_cells = rows*env_end;
    const int first_bot_row = env_end-rows;
    int i,j,t,k,r,c;

    // Fill the top band with 1 and the bottom band with 2.
    for (i=0;i<rows;i++)
    {
        for (j=0;j<env_end;j++)
        {
            mat[i*env_end+j]=1;
        }
    }
    for (i=first_bot_row;i<env_end;i++)
    {
        for (j=0;j<env_end;j++)
        {
            mat[i*env_end+j]=2;
        }
    }

    // More removals than cells in a band could never be satisfied.
    if (del_t > band_cells) del_t = band_cells;
    if (del_b > band_cells) del_b = band_cells;

    // Seed once: re-seeding with the same time value would replay the same
    // random sequence for the bottom band.
    srand((unsigned int)time(NULL));

    // Remove del_t agents from the top band. The row is drawn from
    // [0, rows) so only cells of the top band can ever be selected.
    for (i=0;i<del_t;i++)
    {
        do
        {
            r = rand()%rows;
            c = rand()%env_end;
        } while (mat[r*env_end+c]==0);   // already empty: draw again
        mat[r*env_end+c] = 0;
    }

    // Remove del_b agents from the bottom band (rows first_bot_row .. env_end-1).
    for (i=0;i<del_b;i++)
    {
        do
        {
            r = rand()%rows + first_bot_row;
            c = rand()%env_end;
        } while (mat[r*env_end+c]==0);
        mat[r*env_end+c] = 0;
    }

    // Number the remaining agents in row-major order and fill their records.
    t=0;
    k=0;
    for (i=0;i<env_end;i++)
    {
        for (j=0;j<env_end;j++)
        {
            const int cell = i*env_end+j;

            //id | index number | row | col | target col | future_row | future_col | empty cell
            if (mat[cell] == 1)
            {
                int *rec = top_ag_prop + t*8;
                rec[0] = 1;  rec[1] = t;  rec[2] = i;  rec[3] = j;
                rec[4] = j;  rec[5] = -1; rec[6] = -1; rec[7] = -1;
                mat_ind_top[cell] = t;
                t+=1;
            }
            else if (mat[cell] == 2)
            {
                int *rec = bot_ag_prop + k*8;
                rec[0] = 2;  rec[1] = k;  rec[2] = i;  rec[3] = j;
                rec[4] = j;  rec[5] = -1; rec[6] = -1; rec[7] = -1;
                mat_ind_bot[cell] = k;
                k+=1;
            }

            p_top[cell] = init_p_val;
            p_bot[cell] = init_p_val;
        }
    }
}

/*
 * For every agent cell, stores the states of the 8 surrounding cells and the
 * pheromone of each of those cells weighted by 1/heuristic.
 *
 * Launch layout: 2D grid of 16x16 thread blocks, one thread per grid cell.
 * The shared tiles are hard-coded to 18x18 / 16x16, the halo loads use tx as
 * a row offset (square blocks), and there is no bounds guard, so the grid
 * must cover the env_end x env_end environment exactly.
 * Shared memory: five static tiles (one int and two double 18x18 tiles with
 * a one-cell halo, two int 16x16 tiles).
 *
 * dev_mat                 cell states: 1 = top agent, 2 = bottom agent.
 * dev_p_top/dev_p_bot     pheromone matrices.
 * dev_top_ag_prop/dev_bot_ag_prop
 *                         not used by this kernel.
 * dev_top_ag_srd/dev_bot_ag_srd
 *                         out: 8 neighbour states per agent.
 * dev_top_ag_ind_mat/dev_bot_ag_ind_mat
 *                         per cell, index of the agent occupying it.
 * dev_top_ag_prob/dev_bot_ag_prob
 *                         out: 8 weighted pheromone values per agent.
 *
 * Halo cells that fall outside the environment hold the sentinel -1 in all
 * tiles. Reads dev_h_top / dev_h_bot from constant memory.
 */
__global__ void AGENT_POSITION_FUNC(int *dev_mat,double *dev_p_top,double *dev_p_bot,int *dev_top_ag_prop, int *dev_bot_ag_prop, 
                     int *dev_top_ag_srd, int *dev_bot_ag_srd,int *dev_top_ag_ind_mat, int *dev_bot_ag_ind_mat, 
                     double *dev_top_ag_prob, double *dev_bot_ag_prob) 
{
    // Signed copies of the built-ins: the halo bounds tests below need real
    // negative values instead of relying on unsigned wrap-around.
    const int tx = threadIdx.x, ty = threadIdx.y;
    const int bx = blockIdx.x,  by = blockIdx.y;
    const int bw = blockDim.x,  bh = blockDim.y;
    const int row = by*bh + ty;
    const int col = bx*bw + tx;

    // Debug output of the constant heuristic table, one line per grid row.
    if (col==0) 
    { printf("%f\t%f\n",dev_h_top[row*2],dev_h_top[row*2+1]); 
    }

    __shared__ int mat_block_local[18][18];
    __shared__ int mat_ind_local_top[16][16];
    __shared__ int mat_ind_local_bot[16][16];
    __shared__ double p_mat_local_top[18][18];
    __shared__ double p_mat_local_bot[18][18];

    // Interior of the tiles: every thread loads its own cell.
    const int cell = row*env_end + col;
    mat_block_local[ty+1][tx+1] = dev_mat[cell];
    mat_ind_local_top[ty][tx] = dev_top_ag_ind_mat[cell];
    mat_ind_local_bot[ty][tx] = dev_bot_ag_ind_mat[cell];
    p_mat_local_top[ty+1][tx+1] = dev_p_top[cell];
    p_mat_local_bot[ty+1][tx+1] = dev_p_bot[cell];

    // Halo: thread rows ty == 0 and ty == 1 load the left/top and the
    // right/bottom halo respectively.
    if (ty<=1)
    {
        const int halo_x = (bw+1)*ty;          // tile column 0 or bw+1
        const int halo_y = (bh+1)*ty;          // tile row 0 or bh+1
        const int nb_col = (bx+ty)*bw - !ty;   // global column just left/right of the block
        const int nb_row = (by+ty)*bh - !ty;   // global row just above/below the block
        const bool nb_col_ok = (nb_col >= 0) && (nb_col < env_end);
        const bool nb_row_ok = (nb_row >= 0) && (nb_row < env_end);

        // Left and right vertical halo (without corners); tx walks down the rows.
        const int side = (by*bh+tx)*env_end + nb_col;
        mat_block_local[tx+1][halo_x] = nb_col_ok ? dev_mat[side]   : -1;
        p_mat_local_top[tx+1][halo_x] = nb_col_ok ? dev_p_top[side] : -1;
        p_mat_local_bot[tx+1][halo_x] = nb_col_ok ? dev_p_bot[side] : -1;

        // Top and bottom horizontal halo (without corners).
        const int edge = nb_row*env_end + (bx*bw+tx);
        mat_block_local[halo_y][tx+1] = nb_row_ok ? dev_mat[edge]   : -1;
        p_mat_local_top[halo_y][tx+1] = nb_row_ok ? dev_p_top[edge] : -1;
        p_mat_local_bot[halo_y][tx+1] = nb_row_ok ? dev_p_bot[edge] : -1;

        // Corner halo cells. All threads of the row store the same value.
        const bool left_ok  = nb_row_ok && (bx != 0);
        const bool right_ok = nb_row_ok && ((bx+1)*bw < env_end);
        const int left  = nb_row*env_end + (bx*bw-1);
        const int right = nb_row*env_end + ((bx+1)*bw);

        mat_block_local[halo_y][0] = left_ok ? dev_mat[left]   : -1;
        p_mat_local_top[halo_y][0] = left_ok ? dev_p_top[left] : -1;
        p_mat_local_bot[halo_y][0] = left_ok ? dev_p_bot[left] : -1;

        mat_block_local[halo_y][bw+1] = right_ok ? dev_mat[right]   : -1;
        p_mat_local_top[halo_y][bw+1] = right_ok ? dev_p_top[right] : -1;
        // Was a second store to p_mat_local_top, leaving these corners unset.
        p_mat_local_bot[halo_y][bw+1] = right_ok ? dev_p_bot[right] : -1;
    }
    __syncthreads();

    const int ly = ty+1, lx = tx+1;   // this thread's cell inside the 18x18 tiles

    // Rows used to index the heuristic tables, clamped to the table bounds.
    // A clamped row only occurs for neighbours outside the environment,
    // whose pheromone is the -1 sentinel.
    const int row_prev = (row > 0) ? row-1 : 0;
    const int row_next = (row < env_end-1) ? row+1 : env_end-1;

    if (mat_block_local[ly][lx] == 1)
    {
        const int index_loc = mat_ind_local_top[ty][tx];
        int *srd = dev_top_ag_srd + index_loc*8;
        double *prob = dev_top_ag_prob + index_loc*8;

        // Entry 2*r weights the diagonal/side moves, 2*r+1 the straight move.
        const float w_next_diag = 1.0f/dev_h_top[row_next*2];
        const float w_next      = 1.0f/dev_h_top[row_next*2+1];
        const float w_side      = 1.0f/dev_h_top[row*2];
        const float w_prev_diag = 1.0f/dev_h_top[row_prev*2];
        const float w_prev      = 1.0f/dev_h_top[row_prev*2+1];

        // Neighbourhood of a top agent, starting with the row below it.
        srd[0] = mat_block_local[ly+1][lx-1];
        srd[1] = mat_block_local[ly+1][lx];
        srd[2] = mat_block_local[ly+1][lx+1];
        srd[3] = mat_block_local[ly][lx-1];
        srd[4] = mat_block_local[ly][lx+1];
        srd[5] = mat_block_local[ly-1][lx-1];
        srd[6] = mat_block_local[ly-1][lx];
        srd[7] = mat_block_local[ly-1][lx+1];

        prob[0] = p_mat_local_top[ly+1][lx-1]*w_next_diag;
        prob[1] = p_mat_local_top[ly+1][lx]*w_next;
        prob[2] = p_mat_local_top[ly+1][lx+1]*w_next_diag;
        prob[3] = p_mat_local_top[ly][lx-1]*w_side;
        prob[4] = p_mat_local_top[ly][lx+1]*w_side;
        prob[5] = p_mat_local_top[ly-1][lx-1]*w_prev_diag;
        prob[6] = p_mat_local_top[ly-1][lx]*w_prev;
        prob[7] = p_mat_local_top[ly-1][lx+1]*w_prev_diag;
    }
    else if (mat_block_local[ly][lx] == 2)
    {
        const int index_loc = mat_ind_local_bot[ty][tx];
        int *srd = dev_bot_ag_srd + index_loc*8;
        double *prob = dev_bot_ag_prob + index_loc*8;

        const float w_prev_diag = 1.0f/dev_h_bot[row_prev*2];
        const float w_prev      = 1.0f/dev_h_bot[row_prev*2+1];
        const float w_side      = 1.0f/dev_h_bot[row*2];
        const float w_next_diag = 1.0f/dev_h_bot[row_next*2];
        // Was (1/dev_h_bot[(row+1)*2]+1): the "+1" belongs inside the index.
        const float w_next      = 1.0f/dev_h_bot[row_next*2+1];

        // Neighbourhood of a bottom agent, starting with the row above it.
        srd[0] = mat_block_local[ly-1][lx-1];
        srd[1] = mat_block_local[ly-1][lx];
        srd[2] = mat_block_local[ly-1][lx+1];
        srd[3] = mat_block_local[ly][lx-1];
        srd[4] = mat_block_local[ly][lx+1];
        srd[5] = mat_block_local[ly+1][lx-1];
        srd[6] = mat_block_local[ly+1][lx];
        srd[7] = mat_block_local[ly+1][lx+1];

        prob[0] = p_mat_local_bot[ly-1][lx-1]*w_prev_diag;
        prob[1] = p_mat_local_bot[ly-1][lx]*w_prev;
        prob[2] = p_mat_local_bot[ly-1][lx+1]*w_prev_diag;
        prob[3] = p_mat_local_bot[ly][lx-1]*w_side;
        prob[4] = p_mat_local_bot[ly][lx+1]*w_side;
        prob[5] = p_mat_local_bot[ly+1][lx-1]*w_next_diag;
        prob[6] = p_mat_local_bot[ly+1][lx]*w_next;
        prob[7] = p_mat_local_bot[ly+1][lx+1]*w_next_diag;
    }
}

调试时,dev_h_top、dev_h_bot 以及其他常量变量中的值始终是0。我不知道哪里出了问题。我确认主机端变量中保存的是正确的值,复制函数也没有返回任何错误,但不知道为什么这些值没有被复制过去。由于内核中 dev_h_top 和 dev_h_bot 的值为0,dev_top_ag_prob 和 dev_bot_ag_prob 的结果总是无穷大。我还写了一个可以正确运行的测试程序(dummy program),贴在下面;既然它能正常工作,我原来的程序按理也应该可以,但不知为何却不行。测试程序如下:

/*
 * Minimal constant-memory test: uploads 32 random values and an 8-entry
 * table (into the constant symbol `test`), runs test_kernel with one block
 * of `num` threads and prints the 8 products computed for every value.
 *
 * Returns 0 on success and 1 if a host allocation or any CUDA call fails.
 */
int main() 
{
    const int num = 32;
    int test_cons_var[8]={1,1,1,0,0,-1,-1,-1};
    int *dev_test = NULL, *dev_res_var = NULL;
    int i,j;

    int *test_var = (int *)malloc(num*sizeof(int));
    int *test_res = (int *)malloc(8*num*sizeof(int));
    if (test_var == NULL || test_res == NULL)
    {
        printf("Host memory allocation failed\n");
        free(test_var);
        free(test_res);
        return 1;
    }

    for (i=0;i<num;i++)
    {
        test_var[i] = rand()%(10);
        printf("%d\n",test_var[i]);
    }

    // Each step runs only if every previous CUDA call succeeded.
    cudaError_t err = cudaMalloc((void **)&dev_test,num*sizeof(int));
    if (err == cudaSuccess)
        err = cudaMemcpy(dev_test,test_var,num*sizeof(int),cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMalloc((void **)&dev_res_var,8*num*sizeof(int));
    if (err == cudaSuccess)
        err = cudaMemcpyToSymbol(test,test_cons_var,8*sizeof(int));
    if (err == cudaSuccess)
    {
        test_kernel<<<1,num>>>(dev_test,dev_res_var);
        err = cudaGetLastError();   // launch-configuration errors
    }
    if (err == cudaSuccess)
    {
        // Blocking copy: also reports faults raised while the kernel ran.
        err = cudaMemcpy(test_res,dev_res_var,8*num*sizeof(int),cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess)
    {
        printf("\n\n");
        for (i=0;i<num;i++)
        {
            for (j=0;j<8;j++)
            {
                printf("%d\t",test_res[i*8+j]);
            }
            printf("\n");
        }
    }
    else
    {
        printf("CUDA Error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(dev_test);
    cudaFree(dev_res_var);
    free(test_var);
    free(test_res);

    return (err == cudaSuccess) ? 0 : 1;
}

/*
 * Each thread multiplies its own input value by the 8 entries of the
 * constant table `test` and stores the products in its 8-int output slot.
 *
 * Indexing uses threadIdx.x only, so the kernel is meant for a single
 * one-dimensional block (it is launched as <<<1,num>>>).
 *
 * dev_test     input values, one per thread.
 * dev_res_var  output, 8 ints per thread.
 */
__global__ void test_kernel(int *dev_test,int *dev_res_var) 
{
    const unsigned int lane = threadIdx.x;
    int *out = dev_res_var + lane*8;   // this thread's 8-entry output slot

    int k = 0;
    while (k < 8)
    {
        out[k] = dev_test[lane]*test[k];
        ++k;
    }
}

任何帮助都将不胜感激。谢谢。

+0

@Robert Crovella:我不知道发生了什么,但所有的评论和答案都不见了。我只是想让你知道我也制作了一个与你在答案中建议的完全相同的虚拟程序,但是我写的程序给了我这个问题。这就是为什么我不知道该怎么做。 – duttasankha

+0

我是请您另外提一个新问题,而不是编辑原来的问题。我的回答和所有评论已不适用于您的新代码和新问题,所以我把它们删除了。SO 是问答形式的网站,而不是聊天室,以这种方式我很难帮到您。此外,我并没有让您把一大堆代码直接贴进问题里,而是请您做一个简短的、可编译的复现程序,用来演示您遇到的问题。我不想去通读所有那些 `scanf` 行。我是请您自己先做一些工作。 –

+0

当然。对于您必须经历的不便,我深表歉意。我将用需要编译的代码发布一篇新文章,我将结束这个问题。谢谢您的合作。 – duttasankha

回答

0

正如我所说的(您之后似乎也确认了),您在这里发布的代码其实并不是您实际运行的代码;当您运行这里发布的代码时,得到的结果是正确的。

如果您的 __constant__ 声明实际上位于另一个模块中,那就会出问题。按照您在这个问题中发布的写法,它们位于同一个模块中,因为该头文件被包含在这个问题里唯一的模块(.cu 文件)中。但是,如果您的实际项目有多个单独的 .cu 文件,并且您把 __constant__ 声明放在一个文件中,却试图在另一个文件中使用它们,那就有问题了。除非使用单独编译(separate compilation)和设备链接器,否则 __constant__ 声明只具有模块作用域。由于您的常量声明放在头文件中,您很可能就是这样出的问题。

假设您没有使用单独编译,请确保__constant__声明仅在单个模块(.cu文件)中定义(包含),并确保它们的所有修改/更新/用法仅在该文件中出现。

+0

robert crovella:我又仔细检查了我的项目,发现导致错误的并不是常量声明本身。现在我仍然在一个单独的头文件中声明常量变量,但使用这些常量变量的主机函数和内核函数必须位于同一个文件中。也就是说,常量变量可以在单独的头文件中声明,但使用它们的函数需要放在同一个文件里。 – duttasankha

相关问题