opencl设备中首选的矢量宽度

我是OpenCL的初学者，正在尝试运行《OpenCL in Action》一书中的示例代码。我用下面的代码来获取设备的首选向量宽度。我的电脑上检测到两个平台：一个来自英特尔（酷睿i7和高清显卡），另一个来自NVIDIA（GeForce 940M）。每次运行代码，每种类型得到的向量宽度都是“1”，只有double例外——由于设备不支持double，返回值为0。即使我更换平台去检查其中的设备，结果也是一样的。我在一台AMD计算机上运行同样的代码，它似乎工作正常，因为不同类型得到了不同的数值。但是，我不明白为什么这段代码在我电脑的各个平台上总是为每种类型返回“1”。有任何想法吗？这里是输出： opencl设备中首选的矢量宽度

下面是代码：

#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#include <CL/cl.h> 

int main(){ 

    cl_int err, i, j; 
    cl_platform_id *platforms; 
    cl_device_id *devices; 
    cl_uint num_platforms, num_devices, vector_width; 
    size_t plat_name_size, devi_name_size; 
    char *plat_name_data, *devi_name_data; 


    err = clGetPlatformIDs(1, NULL, &num_platforms); 
    if (err < 0){ 
     perror("No platform is found"); 
     exit(1); 
    } 
    platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platforms); 
    clGetPlatformIDs(num_platforms, platforms, NULL); 

    printf("Number of found platforms is %d\n ", num_platforms); 

    for (i = 0; i < num_platforms; i++){ 

     err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &plat_name_size); 
     if (err < 0){ 
      perror("Couldn't read platform name."); 
      exit(1); 
     } 
     plat_name_data = (char*)malloc(plat_name_size); 
     clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, plat_name_size, plat_name_data, NULL); 
     printf("Platform No.%d is: %s\n", i, plat_name_data); 

     err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 1, NULL, &num_devices); 
     if (err < 0){ 
      perror("No device is found in this platform"); 
      exit(1); 
     } 
     devices = (cl_device_id*)malloc(sizeof(cl_device_id)*(num_devices)); 
     clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); 
     printf("Number of devices found in this platform is: %d\n", num_devices); 
     for (j = 0; j < num_devices; j++){ 
      err = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &devi_name_size); 
      if (err < 0){ 
       perror("Couldn't read the device name."); 
       exit(1); 
      } 
      devi_name_data = (char*)malloc(devi_name_size); 
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, devi_name_size, devi_name_data, NULL); 
      printf("Device No.%d name is: %s\n", j + 1, devi_name_data); 
      if (strstr(devi_name_data, "GeForce 940M")){ 
       clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, 
        sizeof(cl_uint), &vector_width, NULL); 
       printf("Preferred vector width in chars: %u\n", vector_width); 
       clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, 
        sizeof(cl_uint), &vector_width, NULL); 
       printf("Preferred vector width in shorts: %u\n", vector_width); 
       clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, 
        sizeof(cl_uint), &vector_width, NULL); 
       printf("Preferred vector width in ints: %u\n", vector_width); 
       clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, 
        sizeof(cl_uint), &vector_width, NULL); 
       printf("Preferred vector width in longs: %u\n", vector_width); 
       clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, 
        sizeof(cl_uint), &vector_width, NULL); 
       printf("Preferred vector width in floats: %u\n", vector_width); 
       clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, 
        sizeof(cl_uint), &vector_width, NULL); 
       printf("Preferred vector width in doubles: %u\n", vector_width); 
      } 
     } 

    } 
    return 0; 
}

来源

2015-11-30 mfaieghi

简短的回答：你的查询方式是正确的，而平台的编译器最清楚什么样的向量宽度最合适。所以是的，1 就是正确的值。

长答案：对于一个CPU（任何类型的CPU），它可能更喜欢非矢量。特别是在Intel CPU + Compiler上，由于英特尔编译器将矢量化作为优化过程的一部分，所以它更喜欢用户不首先对代码进行矢量化。

的确，看起来nVIDIA也更希望用户提供非矢量化的代码。这并不意味着已经矢量化的代码会运行得更慢，它只是意味着编译器（由于它自带优化技术）更喜欢未被矢量化的代码。

对OpenCL驱动程序的更新可能会导致这些值发生变化。另外，你应该只把它们当作参考。其他因素，例如本地内存的使用情况、合并的全局内存访问（coalesced global access）、本地工作组大小（local size）等，通常更为重要。

来源

2015-11-30 01:37:10 DarkZeros

感谢您的快速答复。是的，我选择了nVIDIA，但它给了我1.我将编辑这个问题，并将选择nVIDIA平台的代码添加到您的评论中。谢谢。 – mfaieghi

NVIDIA GPU更喜欢非矢量化代码是正常的。 –

我更新了代码。所以，如果设备不喜欢矢量化代码，我们不应该这样写代码？如果我们编写矢量化的代码，是否会有任何性能成本？ – mfaieghi

下面是我做过的一个实验，用来了解向量化操作在更偏好标量操作的设备上表现如何。我用两个不同的内核实现了归约（reduction）算法。第一个内核把数据当作标量处理，而第二个内核把数据当作float4向量处理（代码如下）。下面是执行结果。很明显，虽然NVIDIA设备更喜欢非矢量化操作，但矢量化的版本更快。

首选矢量宽度：1 reduction_scalar：检查通过。总时间= 4471424 reduction_vector：检查通过。总时间= 1723776

这里是代码：

#define _CRT_SECURE_NO_WARNINGS 
#define PROGRAM_FILE "reduction.cl" 

#define ARRAY_SIZE 1048576 
#define NUM_KERNELS 2 

#include <math.h> 
#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#include <time.h> 

#ifdef MAC 
#include <OpenCL/cl.h> 
#else 
#include <CL/cl.h> 
#endif 

/*
 * Pick a device from the first available platform.
 *
 * A GPU is requested first; when the platform answers CL_DEVICE_NOT_FOUND,
 * a CPU device is requested instead.
 *
 * Returns the selected device id. On any failure the OpenCL status code is
 * printed on stderr and the process exits with status 1. OpenCL does not
 * set errno, so perror() would print an unrelated message here.
 */
cl_device_id create_device() {

    cl_platform_id platform;
    cl_device_id dev;
    int err;

    /* Identify a platform */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err < 0) {
        fprintf(stderr, "Couldn't identify a platform (OpenCL error %d)\n", err);
        exit(1);
    }

    /* Access a device, preferring a GPU over a CPU */
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
    if (err == CL_DEVICE_NOT_FOUND) {
        printf(" GPU is not first! Going on CPU :(");
        err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
    }
    if (err < 0) {
        fprintf(stderr, "Couldn't access any devices (OpenCL error %d)\n", err);
        exit(1);
    }

    return dev;
}

/*
 * Create an OpenCL program from a source file and build it.
 *
 * ctx      - context the program is created in
 * dev      - device whose build log is printed when the build fails
 * filename - path of the OpenCL C source file
 *
 * Returns the built program. On any failure (file access, allocation,
 * program creation, build) a diagnostic is printed and the process exits
 * with status 1; a failed build additionally prints the build log to
 * standard output.
 */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {

    cl_program program;
    FILE *program_handle;
    char *program_buffer, *program_log;
    size_t program_size, log_size = 0;
    long file_length;
    int err;

    /* Read program file and place content into buffer */
    program_handle = fopen(filename, "r");
    if (program_handle == NULL) {
        perror("Couldn't find the program file");
        exit(1);
    }
    if (fseek(program_handle, 0, SEEK_END) != 0 ||
        (file_length = ftell(program_handle)) < 0) {
        perror("Couldn't determine the program file size");
        exit(1);
    }
    rewind(program_handle);
    program_buffer = malloc((size_t)file_length + 1);
    if (program_buffer == NULL) {
        perror("Couldn't allocate the program buffer");
        exit(1);
    }
    /* The stream is in text mode, so fread() may deliver fewer bytes than
       ftell() reported (e.g. CR/LF translation). The count actually read
       is the real source length; using the ftell() value would hand
       uninitialised bytes to the OpenCL compiler. */
    program_size = fread(program_buffer, sizeof(char),
        (size_t)file_length, program_handle);
    if (ferror(program_handle)) {
        perror("Couldn't read the program file");
        exit(1);
    }
    program_buffer[program_size] = '\0';
    fclose(program_handle);

    /* Create program from file */
    program = clCreateProgramWithSource(ctx, 1,
        (const char**)&program_buffer, &program_size, &err);
    if (err < 0) {
        fprintf(stderr, "Couldn't create the program (OpenCL error %d)\n", err);
        exit(1);
    }
    free(program_buffer);

    /* Build program */
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err < 0) {
        fprintf(stderr, "Couldn't build the program (OpenCL error %d)\n", err);

        /* Find size of log and print to std output */
        if (clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
                0, NULL, &log_size) == CL_SUCCESS) {
            program_log = malloc(log_size + 1);
            if (program_log != NULL) {
                program_log[log_size] = '\0';
                if (clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
                        log_size + 1, program_log, NULL) == CL_SUCCESS) {
                    printf("%s\n", program_log);
                }
                free(program_log);
            }
        }
        exit(1);
    }

    return program;
}

int main() { 

    /* OpenCL structures */ 
    cl_device_id device; 
    cl_context context; 
    cl_program program; 
    cl_kernel kernel[NUM_KERNELS]; 
    cl_command_queue queue; 
    cl_event prof_event; 
    cl_int i, j, err, preferred_width; 
    size_t local_size, global_size; 
    char kernel_names[NUM_KERNELS][20] = 
    { "reduction_scalar", "reduction_vector" }; 

    /* Data and buffers */ 
    float *data = (float *)malloc(sizeof(float)* ARRAY_SIZE); 
    //float data[ARRAY_SIZE]; 
    float sum, actual_sum, *scalar_sum, *vector_sum; 
    cl_mem data_buffer, scalar_sum_buffer, vector_sum_buffer; 
    cl_int num_groups; 
    cl_ulong time_start, time_end, total_time; 

    /* Initialize data */ 
    for (i = 0; i<ARRAY_SIZE; i++) { 
     data[i] = 1.0f*i; 
    } 

    /* Create device and determine local size */ 
    device = create_device(); 
    clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, 
     sizeof(preferred_width), &preferred_width, NULL); 
    printf("Preferred vector width: %d\n", preferred_width); 
    err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, 
     sizeof(local_size), &local_size, NULL); 
    if (err < 0) { 
     perror("Couldn't obtain device information"); 
     exit(1); 
    } 

    /* Allocate and initialize output arrays */ 
    num_groups = ARRAY_SIZE/local_size; 
    scalar_sum = (float*)malloc(num_groups * sizeof(float)); 
    vector_sum = (float*)malloc(num_groups/4 * sizeof(float)); 
    for (i = 0; i<num_groups; i++) { 
     scalar_sum[i] = 0.0f; 
    } 
    for (i = 0; i<num_groups/4; i++) { 
     vector_sum[i] = 0.0f; 
    } 

    /* Create a context */ 
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); 
    if (err < 0) { 
     perror("Couldn't create a context"); 
     exit(1); 
    } 

    /* Build program */ 
    program = build_program(context, device, PROGRAM_FILE); 

    /* Create data buffer */ 
    data_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | 
     CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err); 
    scalar_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | 
     CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), scalar_sum, &err); 
    vector_sum_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | 
     CL_MEM_COPY_HOST_PTR, num_groups * sizeof(float), vector_sum, &err); 
    if (err < 0) { 
     perror("Couldn't create a buffer"); 
     exit(1); 
    }; 

    /* Create a command queue */ 
    queue = clCreateCommandQueue(context, device, 
     CL_QUEUE_PROFILING_ENABLE, &err); 
    if (err < 0) { 
     perror("Couldn't create a command queue"); 
     exit(1); 
    }; 

    for (i = 0; i<NUM_KERNELS; i++) { 

     /* Create a kernel */ 
     kernel[i] = clCreateKernel(program, kernel_names[i], &err); 
     if (err < 0) { 
      perror("Couldn't create a kernel"); 
      exit(1); 
     }; 

     /* Create kernel arguments */ 
     err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &data_buffer); 
     if (i == 0) { 
      global_size = ARRAY_SIZE; 
      err |= clSetKernelArg(kernel[i], 1, local_size * sizeof(float), NULL); 
      err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &scalar_sum_buffer); 
     } 
     else { 
      global_size = ARRAY_SIZE/4; 
      err |= clSetKernelArg(kernel[i], 1, local_size * 4 * sizeof(float), NULL); 
      err |= clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &vector_sum_buffer); 
     } 
     if (err < 0) { 
      perror("Couldn't create a kernel argument"); 
      exit(1); 
     } 

     /* Enqueue kernel */ 
     err = clEnqueueNDRangeKernel(queue, kernel[i], 1, NULL, &global_size, 
      &local_size, 0, NULL, &prof_event); 
     if (err < 0) { 
      perror("Couldn't enqueue the kernel"); 
      exit(1); 
     } 

     /* Finish processing the queue and get profiling information */ 
     clFinish(queue); 
     clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START, 
      sizeof(time_start), &time_start, NULL); 
     clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END, 
      sizeof(time_end), &time_end, NULL); 
     total_time = time_end - time_start; 

     /* Read the result */ 
     if (i == 0) { 
      err = clEnqueueReadBuffer(queue, scalar_sum_buffer, CL_TRUE, 0, 
       num_groups * sizeof(float), scalar_sum, 0, NULL, NULL); 
      if (err < 0) { 
       perror("Couldn't read the buffer"); 
       exit(1); 
      } 
      sum = 0.0f; 
      for (j = 0; j<num_groups; j++) { 
       sum += scalar_sum[j]; 
      } 
     } 
     else { 
      err = clEnqueueReadBuffer(queue, vector_sum_buffer, CL_TRUE, 0, 
       num_groups/4 * sizeof(float), vector_sum, 0, NULL, NULL); 
      if (err < 0) { 
       perror("Couldn't read the buffer"); 
       exit(1); 
      } 
      sum = 0.0f; 
      for (j = 0; j<num_groups/4; j++) { 
       sum += vector_sum[j]; 
      } 
     } 

     /* Check result */ 
     printf("%s: ", kernel_names[i]); 
     actual_sum = 1.0f * ARRAY_SIZE/2 * (ARRAY_SIZE - 1); 
     if (fabs(sum - actual_sum) > 0.01*fabs(sum)) 
      printf("Check failed.\n"); 
     else 
      printf("Check passed.\n"); 
     printf("Total time = %lu\n\n", total_time); 

     /* Deallocate event */ 
     clReleaseEvent(prof_event); 
    } 

    /* Deallocate resources */ 
    free(scalar_sum); 
    free(vector_sum); 
    for (i = 0; i<NUM_KERNELS; i++) { 
     clReleaseKernel(kernel[i]); 
    } 
    clReleaseMemObject(scalar_sum_buffer); 
    clReleaseMemObject(vector_sum_buffer); 
    clReleaseMemObject(data_buffer); 
    clReleaseCommandQueue(queue); 
    clReleaseProgram(program); 
    clReleaseContext(context); 
    return 0; 
}

内核代码（reduction.cl）如下：

/*
 * Per-work-group sum of scalar floats.
 *
 * data         - input values, one element per work-item
 * partial_sums - local scratch array, one float per work-item in the group
 * output       - receives one partial sum per work-group, indexed by group id
 *
 * NOTE(review): the halving loop only folds in every element when the
 * work-group size is a power of two - confirm the host guarantees this.
 */
__kernel void reduction_scalar(__global float* data,
      __local float* partial_sums, __global float* output) {

    int local_index = get_local_id(0);
    int work_group_size = get_local_size(0);

    /* Stage this work-item's element in local memory */
    partial_sums[local_index] = data[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Each pass adds the upper half of the active range onto the lower
       half. The barrier sits outside the `if` so every work-item in the
       group reaches it. */
    int stride = work_group_size/2;
    while(stride > 0) {
        if(local_index < stride) {
            partial_sums[local_index] =
                partial_sums[local_index] + partial_sums[local_index + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        stride >>= 1;
    }

    /* Only the first work-item of the group publishes the result */
    if(local_index != 0) {
        return;
    }
    output[get_group_id(0)] = partial_sums[0];
}

/*
 * Per-work-group sum of float4 vectors, collapsed to a single float.
 *
 * data         - input values, one float4 per work-item
 * partial_sums - local scratch array, one float4 per work-item in the group
 * output       - receives one float per work-group, indexed by group id
 *
 * NOTE(review): the halving loop only folds in every element when the
 * work-group size is a power of two - confirm the host guarantees this.
 */
__kernel void reduction_vector(__global float4* data,
      __local float4* partial_sums, __global float* output) {

    int local_index = get_local_id(0);
    int work_group_size = get_local_size(0);

    /* Stage this work-item's float4 in local memory */
    partial_sums[local_index] = data[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Each pass adds the upper half of the active range onto the lower
       half, component-wise. The barrier sits outside the `if` so every
       work-item in the group reaches it. */
    int stride = work_group_size/2;
    while(stride > 0) {
        if(local_index < stride) {
            partial_sums[local_index] =
                partial_sums[local_index] + partial_sums[local_index + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
        stride >>= 1;
    }

    /* Only the first work-item of the group publishes the result; the dot
       product with an all-ones vector adds the four components together */
    if(local_index != 0) {
        return;
    }
    output[get_group_id(0)] = dot(partial_sums[0], (float4)(1.0f));
}

来源

2015-12-04 15:23:13 mfaieghi

opencl设备中首选的矢量宽度

回答

相关问题