OpenCL矩阵乘法入队/缓冲区读取

我正在尝试用 OpenCL 编写一个基本的矩阵乘法程序。我认为问题出在内核入队和/或缓冲区读取上，因为结果矩阵的输出完全不正确，而且矩阵 A 和 B 的第一行也不正确。我是 OpenCL 新手，已经为此苦恼了一段时间，也许这里有人能提示一下我哪里出错了？

主机代码：

#define __NO_STD_VECTOR // Uses cl::vector instead of standard version 
#include <CL/cl.hpp> 
#include <stdlib.h> 
#include <stdio.h> 
#include <fstream> 
#include <iostream> 
#include <math.h> 
#include <string> 

/* 
 Matrix dimensions (rows/columns) for the operands A and B. 
 The product C = A * B takes its row count from A and its column 
 count from B. main() rejects the configuration when 
 numColsA != numRowsB. 
*/ 
#define numRowsA 3 
#define numColsA 3 
#define numRowsB 3 
#define numColsB 3 
#define numRowsC numRowsA 
#define numColsC numColsB 

using namespace std; 

/* 
 Forward declarations of the helpers defined after main(): 
   checkErr    - reports an OpenCL error code and exits on failure 
   initMatrix  - fills a matrix with random floats 
   printMatrix - writes a matrix to standard output row by row 
*/ 
inline void checkErr(cl_int err, string name); 
void initMatrix (float* matrix, int numIndices); 
void printMatrix (string displayName, float* matrix, int numIndices, 
     int rowSize); 

//************* 
// Main Program 
//************* 
int main(int argc, char* argv[]) { 

    /* Check for valid matrix sizes */ 
    if (numColsA != numRowsB) { 
     cout << "ERROR: Invalid matrix dimensions." << endl; 
    } else { 

    srand(2013); // Set random seed 

    /* Allocate memory for matrices A, B, and C */ 
    unsigned int sizeA = numRowsA * numColsA; 
    unsigned int sizeB = numRowsB * numColsB; 
    unsigned int sizeC = numRowsC * numColsC; 
    unsigned int memoryA = sizeof(float) * sizeA; 
    unsigned int memoryB = sizeof(float) * sizeB; 
    unsigned int memoryC = sizeof(float) * sizeC; 

    /* 
     Allocate memoryA/memoryB/memoryC size blocks of bytes 
     (cast from void*) 
    */ 
    float* blockA = (float*) malloc(memoryA); 
    float* blockB = (float*) malloc(memoryB); 
    float* blockC = (float*) malloc(memoryC); 

    /* Initialize matrices A and B */ 
    initMatrix(blockA, sizeA); 
    initMatrix(blockB, sizeB); 

    /* Display matrices A and B */ 
    printMatrix("Matrix A", blockA, sizeA, numColsA); 
    printMatrix("Matrix B", blockB, sizeB, numColsB); 

    cl_int err;   // Error code 
    string platformVendor; // Platform vendor 

    /* Create list of platforms */ 
    cl::vector <cl::Platform> platformList; 
    cl::Platform::get(&platformList); 

    /* 
     Display potential Platform list generation error. If the 
     platform list size does not equal 0, CL_SUCCESS (0) is 
     sent to the function. If the platform list size does 
     equal 0, -1 is sent to the function. 
    */ 
    checkErr(platformList.size()!=0 ? CL_SUCCESS : -1, 
      "Platform"); 

    /* 
     Replace empty value of platformVendor with device vendor 
     name 
    */ 
    platformList[0].getInfo((cl_platform_info) CL_PLATFORM_VENDOR, 
     &platformVendor); 

    /* Properties for Context constructor (Use unknown) */ 
    cl_context_properties cprops[3] = 
     { 
     CL_CONTEXT_PLATFORM, 
     (cl_context_properties) (platformList[0])(), 
     0 
     }; 

    /* Create context */ 
    cl::Context context(CL_DEVICE_TYPE_GPU, cprops, NULL, NULL, 
     &err); 

    /* Display potential Context constructor error */ 
    checkErr(err, "Context"); 

    /* Create buffer for matrix A */ 
    cl::Buffer deviceMemA(context, 
      CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeA, blockA, &err); 

    /* Create buffer for matrix B */ 
    cl::Buffer deviceMemB(context, 
      CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeB, blockB, &err); 

    /* Create buffer for matrix C */ 
    cl::Buffer deviceMemC(context, 
      CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeC, blockC, &err); 

    /* Create buffer for row (A) and col (C) */ 
    cl::Buffer rowA(context, CL_MEM_READ_ONLY, sizeof(int), 
      (void *) numRowsA, &err); 
    cl::Buffer colC(context, CL_MEM_READ_ONLY, sizeof(int), 
      (void *) numColsC, &err); 

    /* Display potential Buffer constructor error */ 
    checkErr(err, "Buffers"); 

    /* Get list of devices */ 
    cl::vector<cl::Device> devices = 
     context.getInfo<CL_CONTEXT_DEVICES>(); 

    /* Check for at least one device, if not throw error */ 
    checkErr(devices.size() > 0 ? CL_SUCCESS : -1, "No Devices"); 

    /* Read input from .cl file */ 
    ifstream file("matrixMult1_kernels.cl"); 

    /* Check for potential problem opening .cl input file */ 
    checkErr(file.is_open() ? CL_SUCCESS:-1, "File Not Open"); 

    /* Store file contents in a string */ 
    string prog(istreambuf_iterator<char>(file), 
      (istreambuf_iterator<char>())); 

    /* Create source object */ 
    cl::Program::Sources source(1, make_pair(prog.c_str(), 
     prog.length()+1)); 

    /* Create program for given context and source */ 
    cl::Program program(context, source); 

    err = program.build(devices, ""); // Check for build error 

    /* Display potential program build error */ 
    checkErr(err, "Program Build"); 

    /* Create kernel */ 
    cl::Kernel kernel(program, "matrixMul", &err); 

    /* Display potential Kernel constructor error */ 
    checkErr(err, "Kernel"); 

    /* 
     Set matrixMul arguments, error checking after each 
     argument 
    */ 
    err = kernel.setArg(0, deviceMemA); 
    checkErr(err, "Arg0"); 
    err = kernel.setArg(1, deviceMemB); 
    checkErr(err, "Arg1"); 
    err = kernel.setArg(2, deviceMemC); 
    checkErr(err, "Arg2"); 
    err = kernel.setArg(3, rowA); 
    checkErr(err, "Arg3"); 
    err = kernel.setArg(4, colC); 
    checkErr(err, "Arg4"); 

    /* Create command queue */ 
    cl::CommandQueue queue(context, devices[0], 0, &err); 

    /* Display potential CommandQueue constructor error */ 
    checkErr(err, "Command Queue"); 

    /* Create event object */ 
    cl::Event event; 

    cl::NDRange global(3, 3); 
    cl::NDRange local(1, 1); 

    /* Enqueue the kernel */ 
    err = queue.enqueueNDRangeKernel(kernel, 2, global, local, 
     NULL, &event); 

    /* Display potential enqueueing error */ 
    checkErr(err, "Enqueue"); 

    /* Wait until kernel has completed execution before continuing */ 
    event.wait(); 

    /* Read kernel result back into host memory */ 
    err = queue.enqueueReadBuffer(deviceMemC, CL_TRUE, 0, memoryC, 
     blockC, NULL, &event); 

     checkErr(err, "C"); 

    err = queue.enqueueReadBuffer(deviceMemA, CL_TRUE, 0, sizeA, 
     blockA, NULL, &event); 
    err = queue.enqueueReadBuffer(deviceMemB, CL_TRUE, 0, sizeB, 
     blockB, NULL, &event); 

    /* Display potential kernel read error */ 
    checkErr(err, "Read Buffer"); 

    /* Display matrices */ 
     cout << endl; 
     cout << "After:" << endl; 
    printMatrix("Matrix A", blockA, sizeA, numColsA); 
    printMatrix("Matrix B", blockB, sizeB, numColsB); 
    printMatrix("Matrix C", blockC, sizeC, numColsC); 

    /* Free up memory */ 
    free(blockA); 
    free(blockB); 
    free(blockC); 
    } 
} 

//--------------------------------------------------------------------
// checkErr - Guard used after each OpenCL call in the host program.
//
// PRE: err is an OpenCL status code; name labels the call site that
//  produced it.
// POST: If err equals CL_SUCCESS nothing happens. Otherwise
//  "ERROR: <name> (<err>)" is written to standard error and the
//  process exits with EXIT_FAILURE.
//--------------------------------------------------------------------
inline void checkErr(cl_int err, string name) {

    /* Nothing to report when the call succeeded */
    if (err == CL_SUCCESS) {
        return;
    }

    /* Report where the failure happened and the raw error number */
    std::cerr << "ERROR: " << name << " (" << err << ")" << std::endl;

    /* Stop the program with a failure exit status */
    exit(EXIT_FAILURE);
}

//--------------------------------------------------------------------
// initMatrix - Fills a matrix with pseudo-random floats.
//
// PRE: matrix points to storage for at least numIndices floats;
//  numIndices is the number of elements to fill.
// POST: The first numIndices elements of matrix each hold
//  rand() divided by RAND_MAX (a value from 0 to 1). Nothing is
//  written when numIndices is not positive.
//--------------------------------------------------------------------
void initMatrix (float* matrix, int numIndices) {

    /* Divisor that scales rand() into the range 0..1 */
    const float scale = static_cast<float>(RAND_MAX);

    /* Walk the storage, consuming one rand() value per element */
    float* cell = matrix;
    for (int remaining = numIndices; remaining > 0; --remaining) {
        *cell = rand() / scale;
        ++cell;
    }
}

//--------------------------------------------------------------------
// printMatrix - Outputs a readable version of the matrix.
//
// PRE: displayName is the heading to print; matrix points to at
//  least numIndices floats stored left-to-right, top-to-bottom;
//  rowSize is the number of elements in one row.
// POST: A blank line, "<displayName>:" and then the elements are
//  written to standard output, separated by " | " within a row
//  and with a line break after every rowSize-th element. If
//  rowSize is not positive, all elements are printed on a
//  single row.
//--------------------------------------------------------------------
void printMatrix (string displayName, float* matrix, int numIndices,
      int rowSize) {

    /*
     A row size of zero would make the modulo below a division by
     zero; fall back to one row holding every element.
    */
    if (rowSize <= 0) {
        rowSize = numIndices;
    }

    /* Output display name of matrix */
    cout << "\n" << displayName << ":" << endl;

    /* Loop through each element of the matrix */
    for (int i = 0; i < numIndices; i++) {
        cout << matrix[i]; // Display value at this index

        /* End the line after the last element of a row */
        if (((i + 1) % rowSize) == 0) {
            cout << endl; // Line break
        } else {
            cout << " | "; // Element separator
        }
    }
}

内核：

// matrixMult1_kernels.cl 
// Multiply two matrices A * B = C 
// Device code. 

// OpenCL Kernel 
// Computes one element of C = A * B per work-item.
//   A  - left operand, row-major, wA columns per row
//   B  - right operand, row-major, wB columns per row
//   C  - result, row-major, wB columns per row
//   wA - width (column count) of A, which is also the length of
//        the dot product
//   wB - width (column count) of B and of C
// Dimension 0 of the NDRange selects the column of C and
// dimension 1 selects the row. The kernel does no bounds checking.
__kernel void
matrixMul(__global const float* A,
          __global const float* B,
          __global float* C,
          int wA, int wB) {

    // Global 2D work-item ID. Global ids are unique across all
    // work-groups, so every work-item addresses its own element
    // whatever local size the host picks.
    const int tx = get_global_id(0); // column of C
    const int ty = get_global_id(1); // row of C

    // Dot product of row ty of A with column tx of B
    float value = 0.0f;
    for (int k = 0; k < wA; ++k)
    {
        value += A[ty * wA + k] * B[k * wB + tx];
    }

    // Each work-item writes one element. Rows of C are wB elements
    // apart because C has as many columns as B.
    C[ty * wB + tx] = value;
}

样本输出：

Matrix A:

0.398748 | 0.999793 | 0.206833

0.354238 | 0.674347 | 0.492022

0.707017 | 0.353635 | 0.430668

Matrix B:

0.91598 | 0.0260167 | 0.881732

0.810974 | 0.193091 | 0.589857

0.229151 | 0.0657822 | 0.965835

ERROR: C (-30)

我使用的是 NVIDIA GeForce 9800 GT，它只支持 OpenCL 1.1。如能提供任何帮助，我将不胜感激。

谢谢，

乔

来源

2013-03-13 J. Jackson

输入矩阵 A 和 B 的数据没有被传递到设备上。当您这样创建缓冲区时：

cl::Buffer deviceMemA(context, CL_MEM_READ_WRITE, memoryA,blockA, &err)

blockA 参数将被忽略，因为标志没有指定如何使用它。您至少需要添加 CL_MEM_COPY_HOST_PTR，才能用 blockA 的内容初始化缓冲区。

或者，您也可以在创建缓冲区之后调用 clEnqueueWriteBuffer 来发送数据。

来源

2013-03-13 16:38:42

谢谢！现在缓冲区的创建方式为 cl::Buffer deviceMemA(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, memoryA, blockA, &err);（C 除外，它是只写的）。现在我在内核代码的最后一行（即把计算结果写入矩阵 C 指定位置的那一行）得到一个无效命令队列（invalid command queue）错误。 – 2013-03-13 18:57:30

读取缓冲区时应传入以字节为单位的缓冲区大小（如 memoryC），而不是 size{A,B,C} 这些元素个数。另外，您的代码把缓冲区 C 读取了 3 次。 – 2013-03-13 19:49:41

嗯，我把缓冲区大小改成了 sizeA、sizeB 和 sizeC，也注意到了缓冲区被读取 3 次的问题。现在出现的是无效命令队列（invalid command queue）错误。我承认，我不太明白那些 NDRange 对象是怎么回事，它们的设置正确吗？再次感谢！ – 2013-03-13 20:12:28

OpenCL矩阵乘法入队/缓冲区读取

回答

相关问题