2013-02-02 54 views
7

SSE 双线性插值:我在一个紧密循环中实现了双线性插值,并尝试用 SSE 对其进行优化,但完全没有获得任何加速。

下面是代码。非 SIMD 版本使用一个简单的向量结构,可以定义为 struct Vec3f { float x, y, z; },并为其实现了乘法和加法运算符:

// Bilinear blend of the four pixels surrounding (x, y).
// With USE_SIMD defined the blend is done with SSE via MSVC inline assembly;
// otherwise it is written with Vec3f's multiply/add operators.
#ifdef USE_SIMD 
    // Fetch the four neighbouring pixels. pixelCache is indexed as
    // y * size.x + x, so cXY below is the pixel at column xX, row yY.
    const Color c11 = pixelCache[y1 * size.x + x1]; 
    const Color c12 = pixelCache[y2 * size.x + x1]; 
    const Color c22 = pixelCache[y2 * size.x + x2]; 
    const Color c21 = pixelCache[y1 * size.x + x2]; 

    // Unpack each pixel into a 16-byte aligned array of four floats (movaps
    // needs aligned memory operands). Element 0 is a constant 1.0 filler;
    // elements 1..3 hold the B, G, R channels.
    __declspec(align(16)) float mc11[4] = { 1.0, c11.GetB(), c11.GetG(), c11.GetR() }; 
    __declspec(align(16)) float mc12[4] = { 1.0, c12.GetB(), c12.GetG(), c12.GetR() }; 
    __declspec(align(16)) float mc22[4] = { 1.0, c22.GetB(), c22.GetG(), c22.GetR() }; 
    __declspec(align(16)) float mc21[4] = { 1.0, c21.GetB(), c21.GetG(), c21.GetR() }; 

    // Per-pixel blend weights: each is the product of the distances from
    // (x, y) to the opposite corner along each axis.
    // NOTE(review): the weights are not divided by (x2-x1)*(y2-y1), so they sum
    // to 1 only if the corners are one unit apart - confirm against the caller.
    const float s11 = (x2-x)*(y2-y); 
    const float s12 = (x2-x)*(y-y1); 
    const float s22 = (x-x1)*(y-y1); 
    const float s21 = (x-x1)*(y2-y); 

    // Broadcast each weight into elements 1..3 so it lines up with B, G, R.
    // Element 0 is 1.0 again, so element 0 of the final sum is 1*1 added four
    // times (4.0) and carries no colour information.
    __declspec(align(16)) float ms11[4] = {1.0, s11, s11, s11}; 
    __declspec(align(16)) float ms12[4] = {1.0, s12, s12, s12}; 
    __declspec(align(16)) float ms22[4] = {1.0, s22, s22, s22}; 
    __declspec(align(16)) float ms21[4] = {1.0, s21, s21, s21}; 

    // The assembly below: loads the four colour vectors into xmm0..xmm3 and the
    // four weight vectors into xmm4..xmm7, multiplies each colour by its weight,
    // accumulates the four products into xmm0 (c11 + c12 + c22 + c21, each add
    // depending on the previous one), and writes the sum back over mc11.
    // The result is left in mc11; this branch does not define `colour`.
    __asm { 
     movaps xmm0, mc11 
     movaps xmm1, mc12 
     movaps xmm2, mc22 
     movaps xmm3, mc21 

     movaps xmm4, ms11 
     movaps xmm5, ms12 
     movaps xmm6, ms22 
     movaps xmm7, ms21 

     mulps xmm0, xmm4 
     mulps xmm1, xmm5 
     mulps xmm2, xmm6 
     mulps xmm3, xmm7 

     addps xmm0, xmm1 
     addps xmm0, xmm2 
     addps xmm0, xmm3 

     movaps mc11, xmm0 
    } 
#else 
    // Scalar path: convert the four neighbouring pixels with toFloat().
    const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]); 
    const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]); 
    const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]); 
    const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]); 

    // Weighted sum of the four corners using the same weights as the SIMD
    // path; the terms are added in the order c11, c21, c12, c22.
    const Vec3f colour = 
      c11*(x2-x)*(y2-y) + 
      c21*(x-x1)*(y2-y) + 
      c12*(x2-x)*(y-y1) + 
      c22*(x-x1)*(y-y1); 
#endif 

重新排列汇编代码以重用寄存器(最终只用了三个 xmm 寄存器)也没有任何效果。我还尝试过使用 intrinsics(编译器内置函数):

// Bilinear interpolation of the four pixels around (x, y) using SSE
// intrinsics instead of inline assembly.
// Fetch the neighbours (indexed as y * size.x + x) and convert them with
// toFloat().
const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]); 
const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]); 
const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]); 
const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]); 

// Per-pixel blend weights, later broadcast into vectors for SSE.
// NOTE(review): not divided by (x2-x1)*(y2-y1); the weights sum to 1 only if
// the corners are one unit apart - confirm against the caller.
const float s11 = (x2-x)*(y2-y); 
const float s12 = (x2-x)*(y-y1); 
const float s22 = (x-x1)*(y-y1); 
const float s21 = (x-x1)*(y2-y); 

// Pack each colour into a vector. _mm_set_ps takes its arguments from the
// highest element down, so element 0 = r, 1 = g, 2 = b and element 3 = 1.f.
__m128 mc11 = _mm_set_ps(1.f, c11.b, c11.g, c11.r); 
__m128 mc12 = _mm_set_ps(1.f, c12.b, c12.g, c12.r); 
__m128 mc22 = _mm_set_ps(1.f, c22.b, c22.g, c22.r); 
__m128 mc21 = _mm_set_ps(1.f, c21.b, c21.g, c21.r); 

// Matching weight vectors: the weight in elements 0..2, 1.f in element 3.
__m128 ms11 = _mm_set_ps(1.f, s11, s11, s11); 
__m128 ms12 = _mm_set_ps(1.f, s12, s12, s12); 
__m128 ms22 = _mm_set_ps(1.f, s22, s22, s22); 
__m128 ms21 = _mm_set_ps(1.f, s21, s21, s21); 

// Scale each colour by its weight (four independent multiplies).
mc11 = _mm_mul_ps(mc11, ms11); 
mc12 = _mm_mul_ps(mc12, ms12); 
mc22 = _mm_mul_ps(mc22, ms22); 
mc21 = _mm_mul_ps(mc21, ms21); 

// Accumulate into mc11. Each add consumes the previous add's result, so the
// three additions form a serial chain. Element 3 ends up as 1*1 added four
// times (4.0) and carries no colour information.
mc11 = _mm_add_ps(mc11, mc12); 
mc11 = _mm_add_ps(mc11, mc22); 
mc11 = _mm_add_ps(mc11, mc21); 

// Unaligned store of all four elements into the result.
// NOTE(review): this writes four floats (16 bytes) starting at colour.array;
// confirm the array has room for a fourth element, otherwise the store runs
// past the end of colour.
Vec3f colour; 
_mm_storeu_ps(colour.array, mc11); 

但同样无济于事。是我遗漏了什么,还是说在这里根本不可能获得额外的速度提升?

+8

我不想这么说,但这绝对不是正确的做法。首先,你花了大量工作仅仅用于填充向量(这纯粹是开销)。其次,在计算的末尾有一条糟糕的依赖链。但从根本上说,主要问题在于你使用的是“结构体数组”(array-of-structs,AoS)的数据布局。如果你真想用好 SIMD,应该考虑改用“数组结构体”(struct-of-arrays,SoA)布局。 – Mysticial

+0

好的,关于填充向量的那部分我明白了,我会先尝试重新排列数据以满足对齐要求。不过能否请您详细解释一下“计算末尾的依赖链”是什么意思? – SimpleMan

+7

你有 3 个加法,而且它们彼此依赖。所以它们都无法并行执行,因为必须等前一个完成之后才能开始下一个。我看到你在做某种归约(reduction)——这类操作最好用二叉树归约来完成。目前你只合并 4 个向量,所以无论怎样重新排列都不会有多大收益。但我猜从更大的范围来看,你实际上是在对更大的一组数求和。 – Mysticial

回答

6

为什么要用浮点?假设 a、b、c、d 是打包的 ARGB 像素,xerr、yerr 的取值范围为 0-256,一个简单的例子是:

// =================================================================================================================
// xs_Bilerp
//
// Integer (fixed-point) bilinear blend of four packed 32-bit pixels. Each
// pixel is split into two channel pairs - the bits selected by 0x00ff00ff and
// the bits selected by 0xff00ff00 - so that one multiply blends two channels.
//
//   a, b   first pixel pair;  blended from a towards b by xerr
//   c, d   second pixel pair; blended from c towards d by xerr
//   xerr   weight used inside each pair (0 yields a / c)
//   yerr   weight used between the two blended pairs (0 yields the a/b blend)
//
// Weights are scaled by 1/256 through the shift-and-mask steps. Nothing in
// the function range-checks them; callers are presumably expected to pass
// values in 0..256 - confirm.
//
// Returns the blended packed pixel. If all four inputs are identical that
// value is returned directly.
// =================================================================================================================
finline uint32 xs_Bilerp (uint32 a, uint32 b, uint32 c, uint32 d, uint32 xerr, uint32 yerr)
{
    // These macros remain defined after the function body.
    #define xs_rbmask 0x00ff00ff
    #define xs_agmask 0xff00ff00

    // Nothing to blend when every input is the same pixel.
    if (a == b && c == d && a == d) {
        return a;
    }

    // ---- first pair: a -> b, weighted by xerr ----
    const uint32 a_rb = a & xs_rbmask;
    const uint32 a_ag = a & xs_agmask;
    const uint32 ab_rb_step = (b & xs_rbmask) - a_rb;
    // The high pair is shifted down by 8 before subtracting, so the product
    // with the weight lands back on the 0xff00ff00 bit positions unshifted.
    const uint32 ab_ag_step = ((b & xs_agmask) >> 8) - (a_ag >> 8);
    const uint32 ab_rb = (a_rb + ((ab_rb_step * xerr) >> 8)) & xs_rbmask;
    const uint32 ab_ag = (a_ag + (ab_ag_step * xerr)) & xs_agmask;

    // ---- second pair: c -> d, weighted by xerr ----
    const uint32 c_rb = c & xs_rbmask;
    const uint32 c_ag = c & xs_agmask;
    const uint32 cd_rb_step = (d & xs_rbmask) - c_rb;
    const uint32 cd_ag_step = ((d & xs_agmask) >> 8) - (c_ag >> 8);
    const uint32 cd_rb = (c_rb + ((cd_rb_step * xerr) >> 8)) & xs_rbmask;
    const uint32 cd_ag = (c_ag + (cd_ag_step * xerr)) & xs_agmask;

    // ---- blend the two intermediate results, weighted by yerr ----
    const uint32 rb_step = cd_rb - ab_rb;
    const uint32 ag_step = (cd_ag >> 8) - (ab_ag >> 8);
    const uint32 out_rb = (ab_rb + ((rb_step * yerr) >> 8)) & xs_rbmask;
    const uint32 out_ag = (ab_ag + (ag_step * yerr)) & xs_agmask;

    // Recombine the two channel pairs into one packed pixel.
    return out_ag | out_rb;
}