在多个GPU上运行OpenCL内核？

我已经编写了几个能在单个GPU上并行运行的算法，但当我尝试在多个GPU（例如3个）上执行它们时，它们都遇到同样的问题：在3个GPU上执行所花的时间与在1个GPU上完全相同（并没有变快）。我试过使用更多的数据，也试过执行不同的任务，都没有任何效果。最后，我改为运行最简单的任务（例如逐元素求和），结果仍然存在这个糟糕的问题。因此我认为这不是某个特定算法的问题，而是我的代码中存在错误（甚至可能是我在多个GPU上并行化代码的方法本身有问题）。

这里是我的Parallel.cpp类的头文件：

#ifndef PARALLEL_H 
#define PARALLEL_H 

#define __NO_STD_VECTOR // Use cl::vector and cl::string and 
#define __NO_STD_STRING // not STL versions, more on this later 
#include <CL/cl.h> 

// Holds the OpenCL objects (platform, GPU devices, context, one command queue
// per device, program, kernel and buffers) used to run a kernel on the GPUs of
// the first platform.
// NOTE(review): the class stores raw owning pointers and OpenCL handles but
// declares no copy constructor/assignment; copying an instance would
// presumably double-release them — confirm against the destructor.
class Parallel 
{ 
    public: 
     Parallel(); 
     // Runs the kernel on two input arrays and stores the output in the third
     // array; the last argument is the number of input elements (named
     // data1, data2, results and vectors_num in the definition).
     int executeAttachVectorsKernel(int*, int*, int*, int); 
     // NOTE(review): definition not visible here; presumably reports work-group
     // limits through the three output pointers — confirm.
     static void getMaxWorkGroupSize(int*, int*, int*); 
     virtual ~Parallel(); 
    protected: 
    private: 
     // Called by init() with a file path and a pointer to an int; the returned
     // char* is used as the program source and the int as its length.
     char* file_contents(const char*, int*); 
     // NOTE(review): definition not visible here; presumably prints a summary
     // of the given device — confirm.
     void getShortInfo(cl_device_id); 
     // Creates the context, queues, program and kernel; see the definition.
     int init(void); 
     cl_platform_id platform;   // first platform returned by clGetPlatformIDs
     cl_device_id* devices;     // new[]-allocated array of num_devices GPU ids
     cl_uint num_devices;       // number of GPU devices found on the platform
     cl_command_queue* queues;  // one command queue per device
     int* WGSizes;              // per device: value queried with CL_DEVICE_MAX_WORK_GROUP_SIZE
     int* WGNumbers;            // per device: value queried with CL_DEVICE_MAX_WORK_ITEM_SIZES
     cl_context context;        // single context shared by all devices
     cl_program program;        // program created from ./kernel/kernel_av.cl
     cl_kernel kernel;          // the "calculate2dim" kernel
     cl_mem input1;             // first input buffer
     cl_mem input2;             // second input buffer
     cl_mem output;             // output buffer
}; 

#endif // PARALLEL_H

下面是初始化方法init：

int Parallel::init() { 
cl_int err; 

//Connect to the first platfrom 
err = clGetPlatformIDs(1, &platform, NULL); 
if (err != CL_SUCCESS) { 
    cerr << "Error occured while executing clGetPlatformIDs" << endl; 
    return EXIT_FAILURE; 
} 

//Get devices number 
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); 
if (err != CL_SUCCESS) { 
    cerr << "Error: Failed to create a device group:" << endl; 
    return EXIT_FAILURE; 
} 

cout << "NUM DEVICES =" << num_devices << endl; 

devices = new cl_device_id[num_devices]; 
//Get all the GPU devices 
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices, NULL); 

//Create one context for all the devices 
context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &err); 
if (!context) { 
    cerr << "Error: Failed to create a compute context!" << endl; 
    return EXIT_FAILURE; 
} 

queues = new cl_command_queue[num_devices]; 
WGNumbers = new int[num_devices]; 
WGSizes = new int[num_devices]; 


for(int i = 0; i < num_devices; i++) { 
    //Create a command queue for every device 
    queues[i] = clCreateCommandQueue(context, devices[i], 0, &err); 
    if (!queues[i]) { 
     cerr << "Error: Failed to create a command commands!" << endl; 
     return EXIT_FAILURE; 
    } 

    cl_ulong temp; 
    clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(temp), &temp, NULL); 
    WGSizes[i] = (int)temp; 

    clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(temp), &temp, NULL); 
    WGNumbers[i] = (int)temp; 
} 

//Translate kernel code into chars 
int pl; 
size_t program_length; 
string path = "./kernel/kernel_av.cl"; 

char* cSourceCL = file_contents(path.c_str(), &pl); 
program_length = (size_t)pl; 

//Create a program 
program = clCreateProgramWithSource(context, 1, 
        (const char **) &cSourceCL, &program_length, &err); 

if (!program) { 
    cerr << "Error: Failed to create compute program!" << endl; 
    return EXIT_FAILURE; 
} 

//Create an executable 
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 
if (err != CL_SUCCESS) 
{ 
    size_t len; 
    char buffer[2048]; 

    cerr << "Error: Failed to build program executable!" << endl; 
    exit(1); 
} 

// Create the compute kernel in the program 
kernel = clCreateKernel(program, "calculate2dim", &err); 
if (err != CL_SUCCESS) 
{ 
    cerr << "Error: Failed to create compute kernel!" << endl; 
    exit(1); 
} 
}

执行内核的方法如下：

int Parallel::executeAttachVectorsKernel(int* data1, int* data2, int* results, int vectors_num) { 

cl_int err; 
size_t global; // global domain size for our calculation 
size_t local; // local domain size for our calculation 

int partition = vectors_num/num_devices; 
unsigned int count = partition; 
input1 = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * count, NULL, NULL); 
input2 = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * count, NULL, NULL); 
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * count, NULL, NULL); 
if (!input1 || !input2 || !output) { 
    cerr << "Error: Failed to allocate device memory!" << endl; 
    exit(1); 
} 

int** data1_apart = new int*[num_devices]; 
int** data2_apart = new int*[num_devices]; 
int** results_apart = new int*[num_devices]; 

for(int i = 0; i < num_devices; i++) { 
    cout << "Executing parallel part on GPU " << i + 1 << endl; 
    cout << "Partition size = " << partition << endl; 
    data1_apart[i] = new int[partition]; 
    data2_apart[i] = new int[partition]; 
    results_apart[i] = new int[partition]; 

    for(int j = i*partition, k = 0; k < partition; j++, k++) { 
     data1_apart[i][k] = data1[j]; 
     data2_apart[i][k] = data2[j]; 
    } 

    //Transfer the input vector into device memory 
    err = clEnqueueWriteBuffer(queues[i], input1, 
           CL_TRUE, 0, sizeof(int) * count, 
           data1_apart[i], 0, NULL, NULL); 

    err = clEnqueueWriteBuffer(queues[i], input2, 
           CL_TRUE, 0, sizeof(int) * count, 
           data2_apart[i], 0, NULL, NULL); 

    if (err != CL_SUCCESS) 
    { 
     cerr << "Error: Failed to write to source array!" << endl; 
     exit(1); 
    } 

    int parameter4 = count/WGNumbers[i]; 

    //Set the arguments to the compute kernel 
    err = 0; 
    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input1); 
    err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input2); 
    err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); 
    err |= clSetKernelArg(kernel, 3, sizeof(int), &parameter4); 
    if (err != CL_SUCCESS) 
    { 
     cerr << "Error: Failed to set kernel arguments! " << err << endl; 
     exit(1); 
    } 

    global = WGNumbers[i]; 
    local = WGSizes[i]; 

    if(local > global) { 
     local = global; 
    } 
    cout << "global = " << global << " local = " << local << endl; 

    err = clEnqueueNDRangeKernel(queues[i], kernel, 
           1, NULL, &global, &local, 
           0, NULL, NULL); 
    if (err) 
    { 
     cerr << "Error: Failed to execute kernel!" << endl; 
     return EXIT_FAILURE; 
    } 
} 

for(int i = 0; i < num_devices; i++) { 
    //Wait for all commands to complete 
    clFinish(queues[i]); 

    //Read back the results from the device to verify the output 

    err = clEnqueueReadBuffer(queues[i], output, 
           CL_TRUE, 0, sizeof(int) * count, 
           results_apart[i], 0, NULL, NULL); 
    if (err != CL_SUCCESS) 
    { 
     cerr << "Error: Failed to read output array! " << err << endl; 
     exit(1); 
    } 

    for(int j = 0; j < partition; j++) { 
     results[i*partition + j] = results_apart[i][j]; 
    } 

    delete [] data1_apart[i]; 
    delete [] data2_apart[i]; 
    delete [] results_apart[i]; 
} 

clReleaseMemObject(input1); 
clReleaseMemObject(input2); 
clReleaseMemObject(output); 
delete [] data1_apart; 
delete [] data2_apart; 
}

在把这个问题发布到Stack Overflow之前，我已经为它折腾了2–3周，现在我真的非常需要别人的帮助，所以任何想法和回答我都会非常感激！

来源

2013-05-10 Vladimir

您的所有设备都在相同的缓冲区上运行。内核执行时，数据将在设备之间移动。没有适当的同步，结果将是不确定的。

如果可能，请考虑为每个设备分配一组不同的缓冲区。

来源

2013-05-10 18:02:25

这正是我现在的做法：对于较重的工作负载（约130毫秒的计算量），各设备上的内核几乎是同时执行的（相差约1毫秒）。 – 2013-05-24 16:51:33

这是我认为正在发生的事情。您为每个参与的OpenCL设备调用一次clEnqueueNDRangeKernel。此时还没有任何内核开始执行，因为尚未调用clFlush。接下来，您对每个队列依次调用clFinish。第一次clFinish调用会使第一个队列中排队的工作开始运行，并等待它完成。第一项工作完成后，clFinish才把控制权交还给您的应用程序。然后，您的应用程序对下一个队列调用clFinish，这会触发第二项工作开始运行，并等待它完成。因此这些工作是按顺序执行的。解决方案可能很简单：在每次调用clEnqueueNDRangeKernel之后立即调用clFlush。我的AMD系统上的行为就是这样。我很快会发布一个可运行的示例。

来源

2013-05-13 03:33:37 ScottD

谢谢，我会试试这个！如果可能的话，请对这种方法提供更多评论！ – Vladimir 2013-05-13 12:12:22

这里是可运行的示例：[link] http://notabs.org/blcutil/wip/blcutil_devel-018.7z 下面演示了如何先在一台设备上运行它，再在另一台设备上运行，然后再一起运行：[link] http://notabs.org/blcutil/wip/sample-output.htm 命令行选项-opencl会让程序列出OpenCL设备。命令行选项-opencl=允许您从列出的设备中选择一个或多个来使用。 – ScottD 2013-05-13 16:51:53

是的，我认为这是正确的。OpenCL规范对此非常具体：“请注意，当回调（或其他代码）将命令排入命令队列时，在队列被刷新之前，不需要执行命令。” – doug65536 2013-05-30 01:50:53

您使用的是哪种GPU？我有一块GTX590，它显示为两个GPU设备。当我尝试在这两个设备上同时运行时，它似乎总要等一个设备完成之后才转到下一个设备（尽管本不该如此）。我不知道Nvidia是否已经解决了这个问题。

当时我在Nvidia网站上读到一些帖子，其中提到了Nvidia的建议：为每个设备创建单独的上下文，并在不同的线程中运行它们。我就是这么做的，效果很好。我为此使用了pthreads（或SDL_threads），设置起来很容易。

来源

2013-05-18 15:14:32

在多个GPU上运行OpenCL内核？

回答

相关问题