2013-05-29 240 views
0

我正试图在CUDA上进行差异化演化,但问题在于负责“突变,交叉,评估,选择”的内核从未启动。CUDA DE内核不启动

任何帮助?

这里是整个代码:

#include <iostream> 
#include <curand_kernel.h> 
using namespace std; 

/**** ERROR HANDLING ****/ 
static void HandleError(cudaError_t err,const char *file, int line) 
{ 
    if (err != cudaSuccess) { 
     printf("%s in %s at line %d\n", cudaGetErrorString(err), 
       file, line); 
     system("pause"); 
     exit(EXIT_FAILURE); 
    } 
} 
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) 


/**** HOST AND DEVICE CONSTANTS****/ 
const int hNP=100, hD=31, hN=10; 
__constant__ int NP, D, N; 
__constant__ float Cr, F; 


/*** EVAL FUNCTION******/ 
__device__ float lennardJones(float a[3], float b[3]) { 

    float distance = sqrt((a[0] - b[0]) * (a[0] - b[0]) 
          + (a[1] - b[1]) * (a[1] - b[1]) 
          + (a[2] - b[2]) * (a[2] - b[2])); 
    float distance6 = distance * distance * distance 
         * distance * distance * distance; 
    float distance12 = distance6 * distance6; 
    return 1/distance12 - 2/distance6; 
} 

/**** RANDOM GENERATORS***/ 
__device__ float rndFloat(curandState* globalState, int id) 
{ 
    curandState localState = globalState[id]; 
    float RANDOM = curand_uniform(&localState); 
    globalState[id] = localState; 
    return RANDOM; 
} 
__device__ int rndInt(curandState* globalState, int id, int max) 
{ 
    curandState localState = globalState[id]; 
    float RANDOM = curand_uniform(&localState); 
    globalState[id] = localState; 
    return RANDOM*max; 
} 
__device__ float rndFloat(curandState* globalState, int id, int max) 
{ 
    curandState localState = globalState[id]; 
    float RANDOM = curand_uniform(&localState); 
    globalState[id] = localState; 
    return RANDOM*max; 
} 
__device__ float rndFloat(curandState* globalState, int id, int min,int max) 
{ 
    curandState localState = globalState[id]; 
    float RANDOM = curand_uniform(&localState); 
    globalState[id] = localState; 
    return min+RANDOM*(max-min); 
} 
/*** SEEDS ****/ 
__global__ void setup_kernel (curandState * state, unsigned long seed) 
{ 
    int id= threadIdx.x+blockIdx.x*blockDim.x; 
    if(id < NP) 
     curand_init(seed, id, 0,&state[id]); 
} 

/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/ 
__global__ void kernelE(curandState* globalState, float *population) 
{ 
    int id= threadIdx.x+blockIdx.x*blockDim.x; 
    if(id < NP) 
    { 
     //init, just populating array with some specific numbers 
     population[D*id]=0; 
     population[D*id+N]=0; 
     population[D*id +2*N]=0; 

     population[D*id+1]=rndFloat(globalState,threadIdx.x,4); 
     population[D*id+N+1]=0; 
     population[D*id +2*N+1]=0; 

     for(int i=2; i<N; i++){ 
      float min= -4 - 1/4*abs((int)((i-4)/3)); 
      float max= 4 + 1/4*abs((int)((i-4)/3)); 
      if(i==2) 
      { 
       population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359); 
       population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max); 
       population[D*id +2*N+2]=0; 
      } 
      else 
      { 
       population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max); 
       population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max); 
       population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max); 
      } 
     } 

     //eval 
     float e=0; 
     for(int i=0; i<N; i++) 
     { 
      for(int j=0; j<i; j++) 
      { 
       float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]}; 
       e += lennardJones(a,b); 
      } 
     } 
     population[D*id + D-1]=e; 
    } 
} 
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/ 
__global__ void kernelP(curandState* globalState, int *mutation) 
{ 
    int id= threadIdx.x+blockIdx.x*blockDim.x; 
    if(id<NP) 
    { 
     int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP); 
     while(a == id){a = rndInt(globalState, id, NP);} 
     while(b == a && b==id){b=rndInt(globalState, id, NP);} 
     while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);} 
     mutation[D*id+0]=a; 
     mutation[D*id+1]=b; 
     mutation[D*id+2]=c; 
    } 
} 
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/ 
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop) 
{ 
    int id= threadIdx.x+blockIdx.x*blockDim.x; 
    if(id<NP) 
    { 
     int a=mutation[D*id+0], b=mutation[D*id+1], c=mutation[D*id+2]; 

     //DE mutation and crossover 
     int j=rndInt(globalState, id, NP); 
     for(int i=0; i<D-1; i++) 
     { 
      //DE mutation 
      pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]); 
      //DE crossover 
      if(Cr > rndFloat(globalState, id) && i!= j) 
       pop[D*id+i]=population[D*id +i]; 
     } 
     // Eval 
     pop[D*id+D-1]=0; 
     for(int i=0; i<N; i++) 
     { 
      for(int j=0; j<i; j++) 
      { 
       float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]}; 
       pop[D*id+D-1] += lennardJones(a,b); 
      } 
     } 

     __syncthreads(); 
     //DE selection 
     if(pop[D*id+D-1] < population[D*id +D-1]) 
     { 
      for(int i=0; i<D; i++) 
       population[D*id +i]=pop[D*id+i]; 
     } 
    } 
} 

void getBestScore(float *hpopulation) 
{ 
    int max=0; 
    for(int i=1; i<hNP; i++) 
    { 
     if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1]) 
      max=i; 
    } 
    for(int j=0; j<hN; j++) 
     cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl; 
    cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl; 
} 

int main() 
{ 
    cudaEvent_t start,stop; 
    HANDLE_ERROR(cudaEventCreate(&start)); 
    HANDLE_ERROR(cudaEventCreate(&stop)); 
    HANDLE_ERROR(cudaEventRecord(start,0)); 

    int device, st=100; 
    float hCr=0.6f, hF=0.8f; 
    cudaDeviceProp prop; 
    HANDLE_ERROR(cudaGetDevice(&device)); 
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, device)); 
// int SN = prop.maxThreadsPerBlock; //512 threads per block 
    //int SB = (hNP+(SN-1))/SN; 


    //constants NP, D, N, Cr, F 
    HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int))); 
    HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int))); 
    HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int))); 
    HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float))); 
    HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float))); 

    //seeds 
    curandState* devStates; 
    HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState))); 
    setup_kernel <<< 1, hNP>>> (devStates, 50); 

    //population 
    float *population, *pop; 
    float hpopulation[hNP*hD]; 
    HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float))); 
    HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float))); 

    //mutation 
    int *mutation, *mutation1; 
    int *hmutation; 
    HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault)); 
    HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int))); 
    HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int))); 

    //stream 
    cudaStream_t stream_i, stream_j; 
    HANDLE_ERROR(cudaStreamCreate(&stream_i)); 
    HANDLE_ERROR(cudaStreamCreate(&stream_j)); 

    kernelE<<<1,hNP, 0,stream_i>>>(devStates,population); 
    kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation); 


    while(st != 0) 
    { 
     /*** COPYING MUTATION INDICES***/ 
     HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j)); 
     HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i)); 

     /**** CALLING KERNELS****/ 
     kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation); 
     kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop); 

     st--; 

     //HANDLE_ERROR(cudaStreamSynchronize(stream_i)); 
     //HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost)); 
     //getBestScore(hpopulation); 
     //cin.get(); 
    } 

    HANDLE_ERROR(cudaStreamSynchronize(stream_i)); 
    HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost)); 
    getBestScore(hpopulation); 

    cudaEventRecord(stop,0); 
    cudaEventSynchronize(stop); 
    float time; 
    HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop)); 
    cout<<endl<<"Tme: "<<time/1000<<"s"<<endl; 

    HANDLE_ERROR(cudaEventDestroy(start)); 
    HANDLE_ERROR(cudaEventDestroy(stop)); 
    HANDLE_ERROR(cudaStreamDestroy(stream_i)); 
    HANDLE_ERROR(cudaStreamDestroy(stream_j)); 
    HANDLE_ERROR(cudaFree(population)); 
    HANDLE_ERROR(cudaFree(pop)); 
    HANDLE_ERROR(cudaFreeHost(hmutation)); 
    HANDLE_ERROR(cudaFree(mutation1)); 
    HANDLE_ERROR(cudaFree(devStates)); 

    system("pause"); 
    return 0; 
} 

更新 - 解决方案:

#include <iostream> 
#include <curand_kernel.h> 
using namespace std; 

/**** ERROR HANDLING ****/ 
static void HandleError(cudaError_t err,const char *file, int line) 
{ 
    if (err != cudaSuccess) { 
     printf("%s in %s at line %d\n", cudaGetErrorString(err), 
       file, line); 
     system("pause"); 
     exit(EXIT_FAILURE); 
    } 
} 
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) 


/**** HOST AND DEVICE CONSTANTS****/ 
const int hNP=100, hD=31, hN=10; 
__constant__ int NP, D, N; 
__constant__ float Cr, F; 


/*** EVAL FUNCTION******/ 
__device__ float lennardJones(float a[3], float b[3]) { 

    float distance = sqrt((a[0] - b[0]) * (a[0] - b[0]) 
          + (a[1] - b[1]) * (a[1] - b[1]) 
          + (a[2] - b[2]) * (a[2] - b[2])); 
    float distance6 = distance * distance * distance 
         * distance * distance * distance; 
    float distance12 = distance6 * distance6; 
    return 1/distance12 - 2/distance6; 
} 

/**** RANDOM GENERATORS***/ 
__device__ float rndFloat(curandState* globalState, int id) 
{ 
    curandState localState = globalState[id]; 
    float RANDOM = curand_uniform(&localState); 
    globalState[id] = localState; 
    return RANDOM; 
} 
__device__ int rndInt(curandState* globalState, int id, int max) 
{ 
    curandState localState = globalState[id]; 
    float RANDOM = curand_uniform(&localState); 
    globalState[id] = localState; 
    return RANDOM*max; 
} 
__device__ float rndFloat(curandState* globalState, int id, int max) 
{ 
    curandState localState = globalState[id]; 
    float RANDOM = curand_uniform(&localState); 
    globalState[id] = localState; 
    return RANDOM*max; 
} 
__device__ float rndFloat(curandState* globalState, int id, int min,int max) 
{ 
    curandState localState = globalState[id]; 
    float RANDOM = curand_uniform(&localState); 
    globalState[id] = localState; 
    return min+RANDOM*(max-min); 
} 
/*** SEEDS ****/ 
__global__ void setup_kernel (curandState * state, unsigned long seed) 
{ 
    int id= threadIdx.x+blockIdx.x*blockDim.x; 
    if(id < NP) 
     curand_init(seed, id, 0,&state[id]); 
} 

/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/ 
__global__ void kernelE(curandState* globalState, float *population) 
{ 
    int id= threadIdx.x+blockIdx.x*blockDim.x; 
    if(id < NP) 
    { 
     //init, just populating array with some specific numbers 
     population[D*id]=0; 
     population[D*id+N]=0; 
     population[D*id +2*N]=0; 

     population[D*id+1]=rndFloat(globalState,threadIdx.x,4); 
     population[D*id+N+1]=0; 
     population[D*id +2*N+1]=0; 

     for(int i=2; i<N; i++){ 
      float min= -4 - 1/4*abs((int)((i-4)/3)); 
      float max= 4 + 1/4*abs((int)((i-4)/3)); 
      if(i==2) 
      { 
       population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359); 
       population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max); 
       population[D*id +2*N+2]=0; 
      } 
      else 
      { 
       population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max); 
       population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max); 
       population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max); 
      } 
     } 

     //eval 
     float e=0; 
     for(int i=0; i<N; i++) 
     { 
      for(int j=0; j<i; j++) 
      { 
       float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]}; 
       e += lennardJones(a,b); 
      } 
     } 
     population[D*id + D-1]=e; 
    } 
} 
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/ 
__global__ void kernelP(curandState* globalState, int *mutation) 
{ 
    int id= threadIdx.x+blockIdx.x*blockDim.x; 
    if(id<NP) 
    { 
     int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP); 
     while(a == id){a = rndInt(globalState, id, NP);} 
     while(b == a && b==id){b=rndInt(globalState, id, NP);} 
     while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);} 
     mutation[3*id+0]=a; 
     mutation[3*id+1]=b; 
     mutation[3*id+2]=c; 
    } 
} 
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/ 
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop) 
{ 
    int id= threadIdx.x+blockIdx.x*blockDim.x; 
    if(id<NP) 
    { 
     int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2]; 

     //DE mutation and crossover 
     int j=rndInt(globalState, id, NP); 
     for(int i=0; i<D-1; i++) 
     { 
      //DE mutation 
      pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]); 
      //DE crossover 
      if(Cr > rndFloat(globalState, id) && i!= j) 
       pop[D*id+i]=population[D*id +i]; 
     } 
     // Eval 
     pop[D*id+D-1]=0; 
     for(int i=0; i<N; i++) 
     { 
      for(int j=0; j<i; j++) 
      { 
       float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]}; 
       pop[D*id+D-1] += lennardJones(a,b); 
      } 
     } 

     __syncthreads(); 
     //DE selection 
     if(pop[D*id+D-1] < population[D*id +D-1]) 
     { 
      for(int i=0; i<D; i++) 
       population[D*id +i]=pop[D*id+i]; 
     } 
    } 
} 

void getBestScore(float *hpopulation) 
{ 
    int max=0; 
    for(int i=1; i<hNP; i++) 
    { 
     if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1]) 
      max=i; 
    } 
    for(int j=0; j<hN; j++) 
     cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl; 
    cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl; 
} 

int main() 
{ 
    cudaEvent_t start,stop; 
    HANDLE_ERROR(cudaEventCreate(&start)); 
    HANDLE_ERROR(cudaEventCreate(&stop)); 
    HANDLE_ERROR(cudaEventRecord(start,0)); 

    int device, st=100; 
    float hCr=0.6f, hF=0.8f; 
    cudaDeviceProp prop; 
    HANDLE_ERROR(cudaGetDevice(&device)); 
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, device)); 
// int SN = prop.maxThreadsPerBlock; //512 threads per block 
    //int SB = (hNP+(SN-1))/SN; 


    //constants NP, D, N, Cr, F 
    HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int))); 
    HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int))); 
    HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int))); 
    HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float))); 
    HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float))); 

    //seeds 
    curandState* devStates; 
    HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState))); 
    setup_kernel <<< 1, hNP>>> (devStates, 50); 

    //population 
    float *population, *pop; 
    float hpopulation[hNP*hD]; 
    HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float))); 
    HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float))); 

    //mutation 
    int *mutation, *mutation1; 
    int *hmutation; 
    HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault)); 
    HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int))); 
    HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int))); 

    //stream 
    cudaStream_t stream_i, stream_j; 
    HANDLE_ERROR(cudaStreamCreate(&stream_i)); 
    HANDLE_ERROR(cudaStreamCreate(&stream_j)); 

    kernelE<<<1,hNP, 0,stream_i>>>(devStates,population); 
    kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation); 


    while(st != 0) 
    { 
     /*** COPYING MUTATION INDICES***/ 
     HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j)); 
     HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i)); 

     /**** CALLING KERNELS****/ 
     kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation); 
     kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop); 

     st--; 

     //HANDLE_ERROR(cudaStreamSynchronize(stream_i)); 
     //HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost)); 
     //getBestScore(hpopulation); 
     //cin.get(); 
    } 

    HANDLE_ERROR(cudaStreamSynchronize(stream_i)); 
    HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost)); 
    getBestScore(hpopulation); 

    cudaEventRecord(stop,0); 
    cudaEventSynchronize(stop); 
    float time; 
    HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop)); 
    cout<<endl<<"Tme: "<<time/1000<<"s"<<endl; 

    HANDLE_ERROR(cudaEventDestroy(start)); 
    HANDLE_ERROR(cudaEventDestroy(stop)); 
    HANDLE_ERROR(cudaStreamDestroy(stream_i)); 
    HANDLE_ERROR(cudaStreamDestroy(stream_j)); 
    HANDLE_ERROR(cudaFree(population)); 
    HANDLE_ERROR(cudaFree(pop)); 
    HANDLE_ERROR(cudaFreeHost(hmutation)); 
    HANDLE_ERROR(cudaFree(mutation1)); 
    HANDLE_ERROR(cudaFree(devStates)); 

    system("pause"); 
    return 0; 
} 
+1

你可以做[错误检查内核启动(HTTP:// stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api)。你为什么不这样做,并报告你得到什么样的错误?另外,当你用'cuda-memcheck'运行你的代码时会发生什么?我的猜测是你的内核正在启动,但你在设备代码中发生了某种错误,导致内核提前终止。 –

+0

错误检查内核显示没有错误。 cuda-memcheck也显示没有错误。我还在VS 2010上安装了最新的nsight与适当的驱动程序,但是当我按下“开始CUDA调试”时,没有任何反应......我放入内核的断点被忽略。我不知道下一步该怎么做。 – user755974

+1

更新您发布的代码,以显示您如何进行内核错误检查。更好的办法是发布一个完整的,可编译的代码。除非您在主机代码中存在逻辑错误,或者完全错误地解释了代码生成的数据,否则内核无法启动,不会出现错误,并且不会显示cuda中的错误 - MEMCHECK。 –

回答

1

解决方案:

#include <iostream> 
#include <curand_kernel.h> 
using namespace std; 

/**** ERROR HANDLING ****/ 
static void HandleError(cudaError_t err,const char *file, int line) 
{ 
    if (err != cudaSuccess) { 
     printf("%s in %s at line %d\n", cudaGetErrorString(err), 
       file, line); 
     system("pause"); 
     exit(EXIT_FAILURE); 
    } 
} 
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__)) 


/**** HOST AND DEVICE CONSTANTS****/ 
const int hNP=100, hD=31, hN=10; 
__constant__ int NP, D, N; 
__constant__ float Cr, F; 


/*** EVAL FUNCTION******/ 
__device__ float lennardJones(float a[3], float b[3]) { 

    float distance = sqrt((a[0] - b[0]) * (a[0] - b[0]) 
          + (a[1] - b[1]) * (a[1] - b[1]) 
          + (a[2] - b[2]) * (a[2] - b[2])); 
    float distance6 = distance * distance * distance 
         * distance * distance * distance; 
    float distance12 = distance6 * distance6; 
    return 1/distance12 - 2/distance6; 
} 

/**** RANDOM GENERATORS***/ 
__device__ float rndFloat(curandState* globalState, int id) 
{ 
    curandState localState = globalState[id]; 
    float RANDOM = curand_uniform(&localState); 
    globalState[id] = localState; 
    return RANDOM; 
} 
__device__ int rndInt(curandState* globalState, int id, int max) 
{ 
    curandState localState = globalState[id]; 
    float RANDOM = curand_uniform(&localState); 
    globalState[id] = localState; 
    return RANDOM*max; 
} 
__device__ float rndFloat(curandState* globalState, int id, int max) 
{ 
    curandState localState = globalState[id]; 
    float RANDOM = curand_uniform(&localState); 
    globalState[id] = localState; 
    return RANDOM*max; 
} 
__device__ float rndFloat(curandState* globalState, int id, int min,int max) 
{ 
    curandState localState = globalState[id]; 
    float RANDOM = curand_uniform(&localState); 
    globalState[id] = localState; 
    return min+RANDOM*(max-min); 
} 
/*** SEEDS ****/ 
__global__ void setup_kernel (curandState * state, unsigned long seed) 
{ 
    int id= threadIdx.x+blockIdx.x*blockDim.x; 
    if(id < NP) 
     curand_init(seed, id, 0,&state[id]); 
} 

/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/ 
__global__ void kernelE(curandState* globalState, float *population) 
{ 
    int id= threadIdx.x+blockIdx.x*blockDim.x; 
    if(id < NP) 
    { 
     //init, just populating array with some specific numbers 
     population[D*id]=0; 
     population[D*id+N]=0; 
     population[D*id +2*N]=0; 

     population[D*id+1]=rndFloat(globalState,threadIdx.x,4); 
     population[D*id+N+1]=0; 
     population[D*id +2*N+1]=0; 

     for(int i=2; i<N; i++){ 
      float min= -4 - 1/4*abs((int)((i-4)/3)); 
      float max= 4 + 1/4*abs((int)((i-4)/3)); 
      if(i==2) 
      { 
       population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359); 
       population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max); 
       population[D*id +2*N+2]=0; 
      } 
      else 
      { 
       population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max); 
       population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max); 
       population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max); 
      } 
     } 

     //eval 
     float e=0; 
     for(int i=0; i<N; i++) 
     { 
      for(int j=0; j<i; j++) 
      { 
       float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]}; 
       e += lennardJones(a,b); 
      } 
     } 
     population[D*id + D-1]=e; 
    } 
} 
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/ 
__global__ void kernelP(curandState* globalState, int *mutation) 
{ 
    int id= threadIdx.x+blockIdx.x*blockDim.x; 
    if(id<NP) 
    { 
     int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP); 
     while(a == id){a = rndInt(globalState, id, NP);} 
     while(b == a && b==id){b=rndInt(globalState, id, NP);} 
     while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);} 
     mutation[3*id+0]=a; 
     mutation[3*id+1]=b; 
     mutation[3*id+2]=c; 
    } 
} 
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/ 
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop) 
{ 
    int id= threadIdx.x+blockIdx.x*blockDim.x; 
    if(id<NP) 
    { 
     int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2]; 

     //DE mutation and crossover 
     int j=rndInt(globalState, id, NP); 
     for(int i=0; i<D-1; i++) 
     { 
      //DE mutation 
      pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]); 
      //DE crossover 
      if(Cr > rndFloat(globalState, id) && i!= j) 
       pop[D*id+i]=population[D*id +i]; 
     } 
     // Eval 
     pop[D*id+D-1]=0; 
     for(int i=0; i<N; i++) 
     { 
      for(int j=0; j<i; j++) 
      { 
       float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]}; 
       pop[D*id+D-1] += lennardJones(a,b); 
      } 
     } 

     __syncthreads(); 
     //DE selection 
     if(pop[D*id+D-1] < population[D*id +D-1]) 
     { 
      for(int i=0; i<D; i++) 
       population[D*id +i]=pop[D*id+i]; 
     } 
    } 
} 

void getBestScore(float *hpopulation) 
{ 
    int max=0; 
    for(int i=1; i<hNP; i++) 
    { 
     if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1]) 
      max=i; 
    } 
    for(int j=0; j<hN; j++) 
     cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl; 
    cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl; 
} 

int main() 
{ 
    cudaEvent_t start,stop; 
    HANDLE_ERROR(cudaEventCreate(&start)); 
    HANDLE_ERROR(cudaEventCreate(&stop)); 
    HANDLE_ERROR(cudaEventRecord(start,0)); 

    int device, st=100; 
    float hCr=0.6f, hF=0.8f; 
    cudaDeviceProp prop; 
    HANDLE_ERROR(cudaGetDevice(&device)); 
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, device)); 
// int SN = prop.maxThreadsPerBlock; //512 threads per block 
    //int SB = (hNP+(SN-1))/SN; 


    //constants NP, D, N, Cr, F 
    HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int))); 
    HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int))); 
    HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int))); 
    HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float))); 
    HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float))); 

    //seeds 
    curandState* devStates; 
    HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState))); 
    setup_kernel <<< 1, hNP>>> (devStates, 50); 

    //population 
    float *population, *pop; 
    float hpopulation[hNP*hD]; 
    HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float))); 
    HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float))); 

    //mutation 
    int *mutation, *mutation1; 
    int *hmutation; 
    HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault)); 
    HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int))); 
    HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int))); 

    //stream 
    cudaStream_t stream_i, stream_j; 
    HANDLE_ERROR(cudaStreamCreate(&stream_i)); 
    HANDLE_ERROR(cudaStreamCreate(&stream_j)); 

    kernelE<<<1,hNP, 0,stream_i>>>(devStates,population); 
    kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation); 


    while(st != 0) 
    { 
     /*** COPYING MUTATION INDICES***/ 
     HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j)); 
     HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i)); 

     /**** CALLING KERNELS****/ 
     kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation); 
     kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop); 

     st--; 

     //HANDLE_ERROR(cudaStreamSynchronize(stream_i)); 
     //HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost)); 
     //getBestScore(hpopulation); 
     //cin.get(); 
    } 

    HANDLE_ERROR(cudaStreamSynchronize(stream_i)); 
    HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost)); 
    getBestScore(hpopulation); 

    cudaEventRecord(stop,0); 
    cudaEventSynchronize(stop); 
    float time; 
    HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop)); 
    cout<<endl<<"Tme: "<<time/1000<<"s"<<endl; 

    HANDLE_ERROR(cudaEventDestroy(start)); 
    HANDLE_ERROR(cudaEventDestroy(stop)); 
    HANDLE_ERROR(cudaStreamDestroy(stream_i)); 
    HANDLE_ERROR(cudaStreamDestroy(stream_j)); 
    HANDLE_ERROR(cudaFree(population)); 
    HANDLE_ERROR(cudaFree(pop)); 
    HANDLE_ERROR(cudaFreeHost(hmutation)); 
    HANDLE_ERROR(cudaFree(mutation1)); 
    HANDLE_ERROR(cudaFree(devStates)); 

    system("pause"); 
    return 0; 
}