0
#include "utils.h"
__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
unsigned char* const greyImage,
int numRows, int numCols)
{
for (size_t r = 0; r < numRows; ++r) {
for (size_t c = 0; c < numCols; ++c) {
uchar4 rgba = rgbaImage[r * numCols + c];
float channelSum = 0.299f * rgba.x + 0.587f * rgba.y + 0.114f * rgba.z;
greyImage[r * numCols + c] = channelSum;
}
}
}
void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
unsigned char* const d_greyImage, size_t numRows, size_t numCols)
{
const dim3 blockSize(1, 1, 1); //TODO
const dim3 gridSize(1, 1, 1); //TODO
rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
这是用于将彩色图像转换为灰度的代码。我正在完成这门课程的作业,并在completing it
之后得到了这些结果。无法理解CUDA内核启动的行为
A.
blockSize = (1, 1, 1)
gridSize = (1, 1, 1)
Your code ran in: 34.772705 msecs.
B.
blockSize = (numCols, 1, 1)
gridSize = (numRows, 1, 1)
Your code ran in: 1821.326416 msecs.
C.
blockSize = (numRows, 1, 1)
gridSize = (numCols, 1, 1)
Your code ran in: 1695.917480 msecs.
D.
blockSize = (1024, 1, 1)
gridSize = (170, 1, 1) [the image size is : r=313, c=557, blockSize*gridSize ~= r*c]
Your code ran in: 1709.109863 msecs.
我已经尝试了几个组合,但没有得到更好的性能比A.我差的只有几纳秒亲近的小值增加块大小和gridsize。 例如:
blockSize = (10, 1, 1)
gridSize = (10, 1, 1)
Your code ran in: 34.835167 msecs.
我不明白为什么更高的数字没有得到更好的性能,反而导致更糟糕的表现。此外,似乎增加块大小比网格大小更好。