2013-03-06 130 views
0

我想知道，使用 curand 或其他方法，生成一个介于 0 和 49K 之间、并且对每个线程都相同的伪随机数，最佳方式是什么。（主题：CUDA 随机数生成）

我更倾向于在内核内部生成随机数，因为我每次只需要生成一个，但总共要生成大约 10,000 次。

我也可以使用 0.0 到 1.0 之间的浮点数，但我不知道怎样才能让所有线程都用到同一个伪随机数（PRN），因为大多数帖子和示例讲的都是如何为每个线程生成不同的伪随机数。

感谢

回答

8

也许你只需要研读 cuRAND 文档，特别是设备端 API（device API）部分。让每个线程得到相同序列的关键是：为每个线程创建各自的状态（大多数示例都是这样做的），然后向每个线程的 init 函数传入相同的序列号。curand_init 的参数顺序如下：

curand_init(seed, subsequence number, offset, state) 

在每次 init 调用中使用相同的种子，每个线程就会生成相同的序列；再把子序列号和偏移量也设为相同的值，每个线程就会从该序列中的同一个起始位置开始取值。

下面是代码来演示:

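// compile with: nvcc -lcurand -o t89 t89.cu
// (the original answer used -arch=sm_20, which current CUDA toolkits no longer accept;
//  omit -arch or pass an architecture your GPU supports)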
#include <stdio.h>
#include <stdlib.h>
#include <curand.h>
#include <curand_kernel.h>

// Multiplier applied to each uniform sample to produce the scaled integer value.
#define SCALE 49000
// Number of elements in each result array and number of per-thread generator states.
#define DSIZE 5000
// Threads per block used for every kernel launch in this file.
#define nTPB 256

// Reads (and clears) the CUDA runtime's last recorded error via cudaGetLastError().
// If an error is pending, prints `msg`, the CUDA error string and the source
// location to stderr, then terminates the process with exit status 1.
// Place it after a CUDA API call, or after a kernel launch followed by a
// synchronizing call, to surface failures from that step.
//
// Note: every continuation backslash must be the last character on its line;
// whitespace after it is not a valid line splice for all compilers.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cudaCheckErrors_err_ = cudaGetLastError(); \
        if (cudaCheckErrors_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(cudaCheckErrors_err_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

// Advances the generator held in `state` by one step and returns the sample
// produced by curand_uniform (documented by cuRAND as a float in (0.0, 1.0]).
//   state: per-thread generator state; updated in place.
__device__ float getnextrand(curandState *state){
    const float sample = curand_uniform(state);
    return sample;
}

// Returns the next uniform sample multiplied by `scale`, truncated toward zero.
//   state: per-thread generator state; advanced by one sample.
//   scale: integer multiplier for the uniform sample.
// The multiplication is done in float and only the product is converted to
// int. Because curand_uniform is documented to return values in (0.0, 1.0],
// the result can equal `scale` itself when the sample is exactly 1.0.
__device__ int getnextrandscaled(curandState *state, int scale){
    const float scaled = scale * getnextrand(state);
    // Cast the whole product, not just `scale`, so the truncation is explicit.
    return static_cast<int>(scaled);
}


// Initializes one cuRAND generator state per thread.
//   state: array of at least `n` generator states (one per global thread index).
//   seed:  seed passed to curand_init; identical for every thread.
//   n:     number of valid entries in `state` (defaults to DSIZE).
// Every thread uses the same seed, subsequence (0) and offset (0), so all
// states are set up to produce the same sequence from the same starting point.
// Expects a 1D grid of 1D blocks. The grid may be rounded up to more threads
// than `n`, so threads whose index is past the end must not touch `state`.
__global__ void initCurand(curandState *state, unsigned long seed, int n = DSIZE){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= n) return;  // tail threads of the last block have no state slot
    curand_init(seed, 0, 0, &state[idx]);
}

// Draws two consecutive scaled random values per thread.
//   state: array of at least `n` initialized generator states.
//   a1:    output array (>= n ints); receives each thread's first value.
//   a2:    output array (>= n ints); receives each thread's second value.
//   n:     number of valid entries in `state`, `a1` and `a2` (defaults to DSIZE).
// Expects a 1D grid of 1D blocks. The grid may be rounded up to more threads
// than `n`, so threads whose index is past the end do nothing.
__global__ void testrand(curandState *state, int *a1, int *a2, int n = DSIZE){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= n) return;  // tail threads of the last block have no slot

    // Work on a local copy of the generator state and store it back once,
    // instead of updating the global-memory state on every draw.
    curandState local = state[idx];
    a1[idx] = getnextrandscaled(&local, SCALE);
    a2[idx] = getnextrandscaled(&local, SCALE);
    state[idx] = local;
}

// Demo driver: initializes DSIZE generator states with the same seed, draws two
// scaled values per thread, copies them to the host and verifies that every
// thread produced the same first value and the same second value.
// Returns 0 when all values match, 1 on host allocation failure or mismatch.
// CUDA failures abort the process through cudaCheckErrors.
int main() {

    const size_t bytes = DSIZE * sizeof(int);
    // Ceil-division: the grid is rounded up to whole blocks of nTPB threads.
    const int blocks = (DSIZE + nTPB - 1) / nTPB;
    int status = 0;

    int *h_a1 = NULL, *h_a2 = NULL, *d_a1 = NULL, *d_a2 = NULL;
    curandState *devState = NULL;

    h_a1 = (int *)malloc(bytes);
    if (h_a1 == 0) {printf("malloc fail\n"); return 1;}
    h_a2 = (int *)malloc(bytes);
    if (h_a2 == 0) {printf("malloc fail\n"); free(h_a1); return 1;}

    cudaMalloc((void**)&d_a1, bytes);
    cudaCheckErrors("cudamalloc");
    cudaMalloc((void**)&d_a2, bytes);
    cudaCheckErrors("cudamalloc");
    cudaMalloc((void**)&devState, DSIZE * sizeof(curandState));
    cudaCheckErrors("cudamalloc");

    // Same seed (1) for every thread so all threads generate the same sequence.
    initCurand<<<blocks, nTPB>>>(devState, 1);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernels1");

    testrand<<<blocks, nTPB>>>(devState, d_a1, d_a2);
    cudaDeviceSynchronize();
    cudaCheckErrors("kernels2");

    cudaMemcpy(h_a1, d_a1, bytes, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudamemcpy");
    cudaMemcpy(h_a2, d_a2, bytes, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudamemcpy");

    printf("1st returned random value is %d\n", h_a1[0]);
    printf("2nd returned random value is %d\n", h_a2[0]);

    // Every thread must have produced the same pair of values as thread 0.
    // Stop at the first mismatch, but still fall through to the cleanup below.
    for (int i = 1; i < DSIZE; i++) {
        if (h_a1[i] != h_a1[0]) {
            printf("mismatch on 1st value at %d, val = %d\n", i, h_a1[i]);
            status = 1;
            break;
        }
        if (h_a2[i] != h_a2[0]) {
            printf("mismatch on 2nd value at %d, val = %d\n", i, h_a2[i]);
            status = 1;
            break;
        }
    }
    if (status == 0) {
        printf("thread values match!\n");
    }

    // Release device and host buffers on both the success and mismatch paths.
    cudaFree(devState);
    cudaFree(d_a2);
    cudaFree(d_a1);
    cudaCheckErrors("cudafree");
    free(h_a2);
    free(h_a1);

    return status;
}