2016-04-30 36 views
1

amd64 提供了许多不太为人所知的硬件捷径。我手上的值是 0xff00ff00ff00ff00,我想得到的是 0xffffffff00000000,或者 0x00000000ffffffff 也可以。翻阅 AMD 的手册时我找到了 pshufb,但手册对它的操作描述得很含糊。

我的问题是:是否有能够执行上述操作的指令(而不是用软件实现)?amd64 汇编中是否有用来“打包”寄存器的指令?

+2

你是否总是以整个字节为单位移动数据?如果是的话,SSSE3 的 `pshufb` 是正确的选择。如果你想以更细的粒度进行打包,请使用 BMI2 的 `pext`(如果可用的话)。(请参阅[我刚刚使用 pext/pdep 写的另一个答案](http://stackoverflow.com/a/36951611/224132)。)顺便说一句,英特尔的指令集参考手册相当不错,对于需要图示的指令,它都配有很好的图表。请参阅 [x86 标签 wiki](http://stackoverflow.com/tags/x86/info),其中有该手册以及英特尔 intrinsics 指南的链接。 –

+1

确认一下:你是想对通用寄存器而不是向量寄存器做重排(shuffle)吗?你可能需要看看 BMI2 中新增的 `pext`/`pdep` 指令。 – EOF

+0

@EOF 由于我写的是纯汇编,必要时我可以把值在寄存器之间移动。不过还是谢谢你提供的信息! – user69874

回答

2

pshufb 可能就是您想要的指令。

它以两个 MMX/XMM 寄存器作为参数。第一个存放要重排(shuffle)的数据;第二个(“控制掩码”)决定如何重排。

每个寄存器中的数据都按字节划分。对于输出的每个字节,处理器根据控制掩码中对应的字节来决定如何设置它:

  • 如果掩码字节的最高位(0x80)被置位,则该字节的结果为零。
  • 否则,掩码字节的低位部分(低 3 位或低 4 位)将用作输入字节的索引。例如,如果掩码字节为 0x03,则输出的该字节取自输入中索引为 3 的字节(索引从 0 开始)。

作为示例,0x0f0e0d0c0b0a09080706050403020100的控制掩码将颠倒寄存器中的字节。

请注意,如果您只是使用正常的整数寄存器,将数据导入和导出相应的寄存器可能会很棘手。使用编译器内在函数可能会使这更容易。

+2

我认为 `0x0f0e...00` 是恒等掩码(每个元素都保持不变)。因为你把它写成了一个大的 128 位数,所以最高字节排在最前面。如果按小端顺序逐字节写出,0x0f, 0x0e, ..., 0x00 才是字节反转掩码。 –

+0

一个实验证实了@PeterCordes的观点。 – Ruslan

0

这个 git 仓库中有对半字节(nibble)排序的完整代码:https://github.com/regehr/nibble-sort/blob/master/scotty.c

#include "nibble.h" 

#include <stdio.h> 
#include <stdlib.h> 
#include <inttypes.h> 
#include <emmintrin.h> 
#include <smmintrin.h> 

/* 
    5.8.5 Packed Shuffle Bytes 
    PSHUFB Permutes each byte in place, according to 
    a shuffle control mask. The least significant three or 
    four bits of each shuffle control byte of the control mask form the shuffle 
    index. The shuffle 
    mask is unaffected. If the most significant bit 
    (bit 7) of a shuffle control byte is set, the 
    constant zero is written in the result byte 
*/ 

// typedef unsigned char uchar 

// extern const int BUFSIZE; 

/* compile with: 
gcc -std=gnu99 -march=native -fbranch-probabilities -ftracer 
-fprefetch-loop-arrays -O3 
*/ 

/* Forward declaration of the sorting entry point defined further down. */
void nibble_sort_scotty(unsigned long *buf); 

/*
 * Build a run of `amount` identical nibbles and shift it into position.
 *
 * amount:   number of copies of the nibble to emit; 0 yields an empty mask.
 * type:     nibble value that is repeated.
 * location: number of bits the finished run is shifted left by.
 *
 * Returns the shifted run, or 0 when amount is 0 (in which case the shift
 * is never evaluated).
 */
static inline unsigned long create_mask(char amount, char type, char location) {
    if (amount == 0) {
        return 0;
    }

    /* Start with one copy, then append one more nibble per extra copy. */
    unsigned long run = type;
    for (int copies = 1; copies < amount; copies++) {
        run = (run << 4) | type;
    }
    return run << location;
}

/*
 * Turn a per-nibble histogram into a word of sorted nibbles.
 *
 * read: 16 byte lanes; lane k (k = 1..15) is used as the number of nibbles
 *       whose value is k. Lane 0 is never read.
 *
 * Runs are emitted from value 0xF down to 1. The first run ends at bit 64
 * and every following run is placed directly below the previous one, so
 * larger nibble values land in the more significant positions. Bits not
 * covered by any run stay 0.
 *
 * Returns the assembled word.
 */
static inline unsigned long write_new(__m128i read) {
    unsigned char counts[16];
    unsigned long write = 0;
    unsigned int where = 64;

    /* Spill the lanes to memory so they can be indexed by a loop variable
       (the extract intrinsics only accept compile-time lane numbers). */
    _mm_storeu_si128((__m128i *)counts, read);

    for (int value = 0xF; value >= 1; value--) {
        unsigned int pulled = counts[value];

        /* Move the write position down by one nibble per occurrence. */
        where = where - (pulled * 4);
        /* A zero count contributes nothing: create_mask returns 0 for it. */
        write |= create_mask(pulled, value, where);
    }

    return write;
}

/*
 * Rewrite each of 1024 consecutive words so that its nibbles appear in
 * descending order from the most significant end.
 *
 * temp: pointer to the first of the 1024 words; each word is overwritten
 *       with the value produced by write_new() for its nibble histogram.
 *
 * For every word the nibbles are counted into 16 one-byte counters
 * (counter k = occurrences of nibble value k), the counters are loaded
 * into a vector and handed to write_new(). Zero nibbles are not counted:
 * write_new() ignores lane 0 and leaves uncovered bits as 0.
 */
void nibble_sort_scotty(unsigned long *temp) {
    __builtin_prefetch(temp, 1, 3);
    unsigned long *buf = temp;

    for (int j = 0; j < 1024; j++) {
        unsigned char counts[16] = {0};
        unsigned long word = *buf;

        /* Stop early once only zero nibbles remain in the word. */
        for (int i = 0; i < 16 && word != 0; i++) {
            unsigned int nibble = (unsigned int)(word & 0xf);

            if (nibble != 0) {
                /* At most 16 nibbles per word, so a byte counter cannot wrap. */
                counts[nibble]++;
            }
            word >>= 4;
        }

        /* Hint the next word into cache before storing this one. */
        __builtin_prefetch((buf + 1), 1, 3);
        *buf = write_new(_mm_loadu_si128((const __m128i *)counts));
        buf++;
    }
}

/* 
int main(void) 
{ 
     // unsigned long test = 0xc67b1f9a06cac113; 
     unsigned long test = 0x728dc8ba40781809;//0xfbbbc9741902c44c;// at 0 to 
sort to fcccbbb997444210 
     nibble_sort_scotty(&test); 
     printf("%lx\n", test); 
} 
*/ 

在 MM2/XMM2 寄存器中建立混洗控制掩码

相关问题