0
我正试图在CUDA上进行差异化演化,但问题在于负责“突变,交叉,评估,选择”的内核从未启动。CUDA DE内核不启动
任何帮助?
这里是整个代码:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line)
{
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err),
file, line);
system("pause");
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[D*id+0]=a;
mutation[D*id+1]=b;
mutation[D*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[D*id+0], b=mutation[D*id+1], c=mutation[D*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}
更新 - 解决方案:
#include <iostream>
#include <curand_kernel.h>
using namespace std;
/**** ERROR HANDLING ****/
static void HandleError(cudaError_t err,const char *file, int line)
{
if (err != cudaSuccess) {
printf("%s in %s at line %d\n", cudaGetErrorString(err),
file, line);
system("pause");
exit(EXIT_FAILURE);
}
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
/**** HOST AND DEVICE CONSTANTS****/
const int hNP=100, hD=31, hN=10;
__constant__ int NP, D, N;
__constant__ float Cr, F;
/*** EVAL FUNCTION******/
__device__ float lennardJones(float a[3], float b[3]) {
float distance = sqrt((a[0] - b[0]) * (a[0] - b[0])
+ (a[1] - b[1]) * (a[1] - b[1])
+ (a[2] - b[2]) * (a[2] - b[2]));
float distance6 = distance * distance * distance
* distance * distance * distance;
float distance12 = distance6 * distance6;
return 1/distance12 - 2/distance6;
}
/**** RANDOM GENERATORS***/
__device__ float rndFloat(curandState* globalState, int id)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM;
}
__device__ int rndInt(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return RANDOM*max;
}
__device__ float rndFloat(curandState* globalState, int id, int min,int max)
{
curandState localState = globalState[id];
float RANDOM = curand_uniform(&localState);
globalState[id] = localState;
return min+RANDOM*(max-min);
}
/*** SEEDS ****/
__global__ void setup_kernel (curandState * state, unsigned long seed)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
curand_init(seed, id, 0,&state[id]);
}
/**** DIFFERENTIAL EVOLUTION: INITIALIZATION ***/
__global__ void kernelE(curandState* globalState, float *population)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id < NP)
{
//init, just populating array with some specific numbers
population[D*id]=0;
population[D*id+N]=0;
population[D*id +2*N]=0;
population[D*id+1]=rndFloat(globalState,threadIdx.x,4);
population[D*id+N+1]=0;
population[D*id +2*N+1]=0;
for(int i=2; i<N; i++){
float min= -4 - 1/4*abs((int)((i-4)/3));
float max= 4 + 1/4*abs((int)((i-4)/3));
if(i==2)
{
population[D*id+2]=rndFloat(globalState,threadIdx.x,3.14159265359);
population[D*id+N+2]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+2]=0;
}
else
{
population[D*id +i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id+N+i]=rndFloat(globalState,threadIdx.x,min,max);
population[D*id +2*N+i]=rndFloat(globalState,threadIdx.x,min,max);
}
}
//eval
float e=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={population[D*id +i], population[D*id+N+i], population[D*id +2*N+i]}, b[]={population[D*id +j],population[D*id +j+N], population[D*id +2*N+j]};
e += lennardJones(a,b);
}
}
population[D*id + D-1]=e;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION INDICES ****/
__global__ void kernelP(curandState* globalState, int *mutation)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a = rndInt(globalState, id, NP),b = rndInt(globalState, id, NP),c= rndInt(globalState, id, NP);
while(a == id){a = rndInt(globalState, id, NP);}
while(b == a && b==id){b=rndInt(globalState, id, NP);}
while(c == a && c== b && c ==id){c=rndInt(globalState, id, NP);}
mutation[3*id+0]=a;
mutation[3*id+1]=b;
mutation[3*id+2]=c;
}
}
/**** DIFFERENTIAL EVOLUTION: MUTATION, CROSSOVER, EVALUATION AND SELECTION ***/
__global__ void kernelMCER(curandState* globalState, float *population, int *mutation, float *pop)
{
int id= threadIdx.x+blockIdx.x*blockDim.x;
if(id<NP)
{
int a=mutation[3*id+0], b=mutation[3*id+1], c=mutation[3*id+2];
//DE mutation and crossover
int j=rndInt(globalState, id, NP);
for(int i=0; i<D-1; i++)
{
//DE mutation
pop[D*id+i]= population[D*a +i] + F*(population[D*b +i]-population[D*c +i]);
//DE crossover
if(Cr > rndFloat(globalState, id) && i!= j)
pop[D*id+i]=population[D*id +i];
}
// Eval
pop[D*id+D-1]=0;
for(int i=0; i<N; i++)
{
for(int j=0; j<i; j++)
{
float a[]={pop[D*id+i], pop[D*id+N+i], pop[D*id+2*N+i]}, b[]={pop[D*id+j],pop[D*id+N+j], pop[D*id+2*N+j]};
pop[D*id+D-1] += lennardJones(a,b);
}
}
__syncthreads();
//DE selection
if(pop[D*id+D-1] < population[D*id +D-1])
{
for(int i=0; i<D; i++)
population[D*id +i]=pop[D*id+i];
}
}
}
void getBestScore(float *hpopulation)
{
int max=0;
for(int i=1; i<hNP; i++)
{
if(hpopulation[hD*max+hD-1] > hpopulation[hD*i+hD-1])
max=i;
}
for(int j=0; j<hN; j++)
cout<<"Atom "<<(j+1)<<": ("<<hpopulation[hD*max+j]<<", "<<hpopulation[hD*max+hN+j]<<", "<<hpopulation[hD*max+hN*2+j]<<") "<<endl;
cout<<"Result: "<<hpopulation[hD*max+hD-1]<<endl;
}
int main()
{
cudaEvent_t start,stop;
HANDLE_ERROR(cudaEventCreate(&start));
HANDLE_ERROR(cudaEventCreate(&stop));
HANDLE_ERROR(cudaEventRecord(start,0));
int device, st=100;
float hCr=0.6f, hF=0.8f;
cudaDeviceProp prop;
HANDLE_ERROR(cudaGetDevice(&device));
HANDLE_ERROR(cudaGetDeviceProperties(&prop, device));
// int SN = prop.maxThreadsPerBlock; //512 threads per block
//int SB = (hNP+(SN-1))/SN;
//constants NP, D, N, Cr, F
HANDLE_ERROR(cudaMemcpyToSymbol(N, &hN, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(NP, &hNP, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(D, &hD, sizeof(int)));
HANDLE_ERROR(cudaMemcpyToSymbol(F, &hF, sizeof(float)));
HANDLE_ERROR(cudaMemcpyToSymbol(Cr, &hCr, sizeof(float)));
//seeds
curandState* devStates;
HANDLE_ERROR(cudaMalloc (&devStates, hNP*sizeof(curandState)));
setup_kernel <<< 1, hNP>>> (devStates, 50);
//population
float *population, *pop;
float hpopulation[hNP*hD];
HANDLE_ERROR(cudaMalloc((void**)&population, hNP*hD*sizeof(float)));
HANDLE_ERROR(cudaMalloc((void**)&pop, hNP*hD*sizeof(float)));
//mutation
int *mutation, *mutation1;
int *hmutation;
HANDLE_ERROR(cudaHostAlloc((void**)&hmutation, hNP*3*sizeof(int), cudaHostAllocDefault));
HANDLE_ERROR(cudaMalloc((void**)&mutation, hNP*3*sizeof(int)));
HANDLE_ERROR(cudaMalloc((void**)&mutation1, hNP*3*sizeof(int)));
//stream
cudaStream_t stream_i, stream_j;
HANDLE_ERROR(cudaStreamCreate(&stream_i));
HANDLE_ERROR(cudaStreamCreate(&stream_j));
kernelE<<<1,hNP, 0,stream_i>>>(devStates,population);
kernelP<<<1,hNP, 0,stream_j>>>(devStates,mutation);
while(st != 0)
{
/*** COPYING MUTATION INDICES***/
HANDLE_ERROR(cudaMemcpyAsync(hmutation, mutation,hNP*3*sizeof(int), cudaMemcpyDeviceToHost, stream_j));
HANDLE_ERROR(cudaMemcpyAsync(mutation1, hmutation,hNP*3*sizeof(int), cudaMemcpyHostToDevice, stream_i));
/**** CALLING KERNELS****/
kernelP<<<1,hNP,0,stream_j>>>(devStates,mutation);
kernelMCER<<<1,hNP,0,stream_i>>>(devStates,population,mutation1,pop);
st--;
//HANDLE_ERROR(cudaStreamSynchronize(stream_i));
//HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
//getBestScore(hpopulation);
//cin.get();
}
HANDLE_ERROR(cudaStreamSynchronize(stream_i));
HANDLE_ERROR(cudaMemcpy(hpopulation, population, hNP*hD*sizeof(float), cudaMemcpyDeviceToHost));
getBestScore(hpopulation);
cudaEventRecord(stop,0);
cudaEventSynchronize(stop);
float time;
HANDLE_ERROR(cudaEventElapsedTime(&time, start, stop));
cout<<endl<<"Tme: "<<time/1000<<"s"<<endl;
HANDLE_ERROR(cudaEventDestroy(start));
HANDLE_ERROR(cudaEventDestroy(stop));
HANDLE_ERROR(cudaStreamDestroy(stream_i));
HANDLE_ERROR(cudaStreamDestroy(stream_j));
HANDLE_ERROR(cudaFree(population));
HANDLE_ERROR(cudaFree(pop));
HANDLE_ERROR(cudaFreeHost(hmutation));
HANDLE_ERROR(cudaFree(mutation1));
HANDLE_ERROR(cudaFree(devStates));
system("pause");
return 0;
}
你可以做[错误检查内核启动(HTTP:// stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api)。你为什么不这样做,并报告你得到什么样的错误?另外,当你用'cuda-memcheck'运行你的代码时会发生什么?我的猜测是你的内核正在启动,但你在设备代码中发生了某种错误,导致内核提前终止。 –
错误检查内核显示没有错误。 cuda-memcheck也显示没有错误。我还在VS 2010上安装了最新的nsight与适当的驱动程序,但是当我按下“开始CUDA调试”时,没有任何反应......我放入内核的断点被忽略。我不知道下一步该怎么做。 – user755974
更新您发布的代码,以显示您如何进行内核错误检查。更好的办法是发布一个完整的,可编译的代码。除非您在主机代码中存在逻辑错误,或者完全错误地解释了代码生成的数据,否则内核无法启动,不会出现错误,并且不会显示cuda中的错误 - MEMCHECK。 –