2014-12-06 24 views
1

我是 SSE 内在函数(intrinsics)的新手,希望能得到一些使用上的提示和帮助(这对我来说还很模糊)。如何用 SSE 内在函数重写这段代码?

我有这样的代码

// Copies pixels from working_buffer into frame_bitmap, four per iteration.
// For each k the source coordinate is (512 + ((idx + k*iddx)>>6),
// 512 + ((idy + k*iddy)>>6)) and the destination is frame_bitmap[base_adr + k].
// The condition k<=n-4 with step 4 means any trailing n % 4 entries are not
// written by this loop.
// NOTE(review): the >>6 looks like a fixed-point scale with 6 fractional bits
// and 512 like a centre offset -- confirm against the caller.
// NOTE(review): if idx/idy/iddx/iddy are signed int and the sum is negative,
// the result of >> is implementation-defined in C.
for(int k=0; k<=n-4; k+=4) 
{ 

    // Source x/y for element k.
    int xc0 = 512 + ((idx + k*iddx)>>6); 
    int yc0 = 512 + ((idy + k*iddy)>>6); 

    // Source x/y for element k+1.
    int xc1 = 512 + ((idx + (k+1)*iddx)>>6); 
    int yc1 = 512 + ((idy + (k+1)*iddy)>>6); 

    // Source x/y for element k+2.
    int xc2 = 512 + ((idx + (k+2)*iddx)>>6); 
    int yc2 = 512 + ((idy + (k+2)*iddy)>>6); 

    // Source x/y for element k+3.
    int xc3 = 512 + ((idx + (k+3)*iddx)>>6); 
    int yc3 = 512 + ((idy + (k+3)*iddy)>>6); 

    // Four independent reads; the flat index is yc*working_buffer_size_x + xc,
    // so the four addresses are generally not contiguous.
    unsigned color0 = working_buffer[yc0*working_buffer_size_x + xc0]; 
    unsigned color1 = working_buffer[yc1*working_buffer_size_x + xc1]; 
    unsigned color2 = working_buffer[yc2*working_buffer_size_x + xc2]; 
    unsigned color3 = working_buffer[yc3*working_buffer_size_x + xc3]; 

    // Destination index of the first of the four consecutive output entries.
    int adr = base_adr + k; 

    frame_bitmap[adr] = color0; 
    frame_bitmap[adr+1]= color1; 
    frame_bitmap[adr+2]= color2; 
    frame_bitmap[adr+3]= color3; 
} 

这里的所有变量都是 int/unsigned。这是循环的关键部分,我不确定整数 SSE 在速度上是否会有帮助,也不知道它是否可行。有人可以帮忙吗?

(我使用的是 mingw32)

+0

你能把 `working_buffer` 的实际访问模式写得更清楚一些吗?也就是只给出索引的计算。现在这样有点难以解读。我仍不确定这是一个“奇怪的聚集(gather)”,还是一种可以利用的规律模式。 – harold 2014-12-06 17:32:03

+0

看起来像一个“聚集”类型的操作,所以至少需要AVX2。 – 2014-12-06 17:53:15

+0

working_buffer 是一个由 unsigned 颜色值组成的二维纹理(数据为 1024 x 1024),可惜 working_buffer 的低维度比 1024 更大——不过如果确实需要,我可以重写部分代码,使其变成 unsigned texture_bitmap[1024][1024] – user2214913 2014-12-06 19:48:09

回答

1

我的 SSE 有点生疏,但你应该这样做:

xmm0: [k, k+1, k+2, k+3] //xc0, xc1,.... 
xmm1: [k, k+1, k+2, k+3] //yc0, yc1,.... 
//initialize before the loop 
xmm2: [512, 512, 512, 512] 
xmm3: [idx, idx, idx, idx] 
xmm4: [iddx, iddx, iddx, iddx] 
xmm5: [idy, idy, idy, idy] 
xmm6: [iddy, iddy, iddy, iddy] 
xmm7: [working_buffer_size_x, working_buffer_size_x, working_buffer_size_x, working_buffer_size_x] 

计算:

xmm0 * xmm4 
xmm0 + xmm3 
xmm0 >> 6 
xmm0 + xmm2 

xmm0: [xc0, xc1, xc2, xc3] 
/////////////////////////////// 

xmm1 * xmm6 
xmm1 + xmm5 
xmm1 >> 6 
xmm1 + xmm2 

xmm1: [yc0, yc1, yc2, yc3] 

xmm1 * xmm7 
xmm1 + xmm0 

现在xmm1是:

xmm1: [yc0*working_buffer_size_x + xc0, yc1*working_buffer_size_x + xc1, yc2*working_buffer_size_x + xc2, yc3*working_buffer_size_x + xc3] 

您在每次循环迭代中都要读写内存(working_buffer、frame_bitmap 数组),这些操作比计算本身慢得多,所以速度的提升不会像预期的那么大。

编辑

你需要让 working_buffer 和 frame_bitmap 数组对齐,并且需要 SSE4.1 支持。

#include <emmintrin.h> 
#include <smmintrin.h> //SSE4.1 

// Vectorised index computation for the scalar loop: each iteration computes
// the four flat working_buffer indices yc*working_buffer_size_x + xc in one
// SSE register, spills them to a[], and then performs the four loads and
// stores with ordinary scalar code.
// Requires SSE4.1 (_mm_mullo_epi32). Register names follow the xmm0..xmm7
// layout described above.

// Scratch slot for the four computed indices; 16-byte aligned because
// _mm_store_si128 needs an aligned destination.
int a[4] __attribute__((aligned(16)));
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

// Loop-invariant operands, broadcast to all four 32-bit lanes.
xmm2 = _mm_set1_epi32(512);
xmm3 = _mm_set1_epi32(idx);
xmm4 = _mm_set1_epi32(iddx);
xmm5 = _mm_set1_epi32(idy);
xmm6 = _mm_set1_epi32(iddy);
xmm7 = _mm_set1_epi32(working_buffer_size_x);

// k is declared here, as in the scalar loop; same bounds, four entries per pass.
for (int k = 0; k <= n - 4; k += 4) {
    // Lanes 0..3 = k, k+1, k+2, k+3 (_mm_set_epi32 takes the highest lane first).
    xmm0 = _mm_set_epi32(k + 3, k + 2, k + 1, k);
    // Both coordinates start from the same lane indices; reuse instead of rebuilding.
    xmm1 = xmm0;

    //xmm0 * xmm4
    xmm0 = _mm_mullo_epi32(xmm0, xmm4);

    //xmm0 + xmm3
    xmm0 = _mm_add_epi32(xmm0, xmm3);

    //xmm0 >> 6 (arithmetic shift, sign bit is propagated)
    xmm0 = _mm_srai_epi32(xmm0, 6);

    //xmm0 + xmm2  -> xmm0 = [xc0, xc1, xc2, xc3]
    xmm0 = _mm_add_epi32(xmm0, xmm2);

    //xmm1 * xmm6
    xmm1 = _mm_mullo_epi32(xmm1, xmm6);

    //xmm1 + xmm5
    xmm1 = _mm_add_epi32(xmm1, xmm5);

    //xmm1 >> 6 (arithmetic shift, sign bit is propagated)
    xmm1 = _mm_srai_epi32(xmm1, 6);

    //xmm1 + xmm2  -> xmm1 = [yc0, yc1, yc2, yc3]
    xmm1 = _mm_add_epi32(xmm1, xmm2);

    //xmm1 * xmm7
    xmm1 = _mm_mullo_epi32(xmm1, xmm7);
    //xmm1 + xmm0
    xmm1 = _mm_add_epi32(xmm1, xmm0);

    //a[0] = yc0*working_buffer_size_x + xc0
    //a[1] = yc1*working_buffer_size_x + xc1
    //a[2] = yc2*working_buffer_size_x + xc2
    //a[3] = yc3*working_buffer_size_x + xc3
    _mm_store_si128((__m128i *)&a[0], xmm1);

    // The four source addresses are unrelated, so the reads stay scalar.
    unsigned color0 = working_buffer[a[0]];
    unsigned color1 = working_buffer[a[1]];
    unsigned color2 = working_buffer[a[2]];
    unsigned color3 = working_buffer[a[3]];

    // Destination index of the first of the four consecutive output entries.
    int adr = base_adr + k;

    frame_bitmap[adr] = color0;
    frame_bitmap[adr + 1] = color1;
    frame_bitmap[adr + 2] = color2;
    frame_bitmap[adr + 3] = color3;
}

您还可以进一步优化:使用直接操作内存的汇编代码,从而避免 `_mm_store_si128((__m128i *)&a[0], xmm1);` 和 `int adr = base_adr + k;`。

+0

好吧,我知道这里的速度提升可能并不大(如果有的话);只要它能正常工作,我就可以测试一下 – user2214913 2014-12-06 17:07:31

+0

ps这是工作的一部分,也许有人可以继续进行下去?我知道的助记符就像__m128 a128 = _mm_load_ps(a); __m128 b128 = _mm_load_ps(b); __m128 out128 = _mm_div_ps(a128,b128); _mm_store_ps(out,out128); – user2214913 2014-12-06 17:09:48

+1

@ user2214913正如我所说,我在几个月前写过内在函数,所以我不记得它们。但你的例子非常简单。我会看看我能做什么。 – 2014-12-06 17:23:58