amd64 提供了许多隐含的硬件快捷方式。我现有的值是 0xff00ff00ff00ff00
,我想得到的是 0xffffffff00000000
,甚至是 0x00000000ffffffff
。翻阅 AMD 的手册后,我找到了 pshufb
,但手册对其操作的描述很含糊。
我的问题是,是否有执行上述操作的指令(不是软件实现)?在 amd64 汇编中,是否有用于“打包”寄存器的指令?
回答
pshufb
可能就是您想要的指令。
它需要两个 MMX/XMM 寄存器作为参数。第一个是要重排(shuffle)的数据;第二个(“控制掩码”)控制如何重排。
每个寄存器中的数据被分解为字节。对于输出的每个字节,处理器使用控制掩码中对应的字节来决定如何设置它:
- 如果掩码字节的最高位(
0x80
)被置位,则该字节的结果为零。 - 否则,掩码字节的低位部分(低 4 位)将用作输入字节的索引。例如,如果掩码字节为
0x03
,则输入中索引为 3 的字节用于输出的该字节。
作为示例,0x0f0e0d0c0b0a09080706050403020100
的控制掩码将颠倒寄存器中的字节。
请注意,如果您只是使用正常的整数寄存器,将数据导入和导出相应的寄存器可能会很棘手。使用编译器内在函数可能会使这更容易。
我认为 '0x0f0e ... 00' 是恒等掩码(每个元素都保持不变)。高位字节写在最前面,是因为您把它写成了一个大的 128 位数字。如果把每个字节分开、按小端顺序写出,0x0f, 0x0e, ..., 0x00 才是字节反转掩码。 –
一个实验证实了@PeterCordes的观点。 – Ruslan
用于半字节(nibble)排序的完整代码在这个 Git 仓库中:https://github.com/regehr/nibble-sort/blob/master/scotty.c
#include "nibble.h"
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <emmintrin.h>
#include <smmintrin.h>
/*
5.8.5 Packed Shuffle Bytes
PSHUFB Permutes each byte in place, according to
a shuffle control mask. The least significant three or
four bits of each shuffle control byte of the control mask form the shuffle
index. The shuffle
mask is unaffected. If the most significant bit
(bit 7) of a shuffle control byte is set, the
constant zero is written in the result byte
*/
// typedef unsigned char uchar
// extern const int BUFSIZE;
/* compile with:
gcc -std=gnu99 -march=native -fbranch-probabilities -ftracer
-fprefetch-loop-arrays -O3
*/
void nibble_sort_scotty(unsigned long *buf);
/*
 * Build a run of `amount` copies of the nibble `type`, packed contiguously,
 * and shift the whole run left by `location` bits.
 *
 * amount   - how many nibbles to emit; 0 yields an all-zero result.
 * type     - the nibble value that is repeated.
 * location - left shift, in bits, applied to the finished run.
 *
 * Returns the shifted run as an unsigned long.
 */
static inline unsigned long create_mask(char amount, char type, char location) {
  if (amount == 0) {
    return 0;
  }

  /* Start with one nibble, then append one more for each remaining count. */
  unsigned long run = type;
  for (int remaining = amount - 1; remaining > 0; remaining--) {
    run = (run << 4) | type;
  }

  return run << location;
}
/*
 * Turn a per-nibble histogram into a word whose nibbles are sorted.
 *
 * read - 16 byte-sized counters; byte N holds how many times nibble value N
 *        should appear in the result.
 *
 * Runs are laid down from bit 64 downwards, starting with value 0xF and
 * ending with value 0x1, so larger nibbles end up in more significant
 * positions. Byte 0 (the count of zero nibbles) is never read: the result
 * starts out as zero, so whatever is left at the bottom is already the run
 * of zeros.
 *
 * Returns the assembled word.
 * NOTE(review): the starting offset of 64 assumes unsigned long is 64 bits
 * wide and that the counters sum to at most 16 - confirm for the target ABI.
 */
static inline unsigned long write_new(__m128i read) {
  unsigned char counts[16];
  unsigned long write = 0;
  unsigned int where = 64; /* bit offset just above the next free slot */

  /* Spill the vector once so the counters can be indexed in a loop;
     byte k in memory is the same byte _mm_extract_epi8(read, k) returns. */
  _mm_storeu_si128((__m128i *)counts, read);

  for (int nibble = 15; nibble >= 1; nibble--) {
    unsigned int pulled = counts[nibble];

    where -= pulled * 4; /* each nibble occupies four bits */
    write |= create_mask(pulled, nibble, where);
  }

  return write;
}
/*
 * Sort the nibbles of each word in a buffer, in place.
 *
 * temp - pointer to the first of 1024 consecutive words; every word is
 *        overwritten with the value write_new() builds from its nibble
 *        counts (largest nibble in the most significant position).
 *
 * Each word is consumed four bits at a time while a 16-entry histogram is
 * filled in (entry N = occurrences of nibble value N). The histogram is then
 * handed to write_new() as a 128-bit vector and the result stored back.
 *
 * NOTE(review): the 16-iteration inner loop assumes unsigned long holds
 * exactly 16 nibbles (64 bits) - confirm for the target ABI.
 */
void nibble_sort_scotty(unsigned long *temp) {
  enum { word_count = 1024, nibbles_per_word = 16 };

  __builtin_prefetch(temp, 1, 3);
  unsigned long *buf = temp;

  for (int j = 0; j < word_count; j++) {
    unsigned char counts[16] = {0};

    for (int i = 0; i < nibbles_per_word; i++) {
      /* Once only zero nibbles remain there is nothing left to count. */
      if (*buf == 0)
        break;

      unsigned int nibble = *buf & 0xf;
      /* Zero nibbles are not tallied; write_new() never reads entry 0. */
      if (nibble != 0)
        counts[nibble]++;

      *buf >>= 4;
    }

    __builtin_prefetch((buf + 1), 1, 3);
    *buf = write_new(_mm_loadu_si128((const __m128i *)counts));
    buf++;
  }
}
/*
int main(void)
{
// unsigned long test = 0xc67b1f9a06cac113;
unsigned long test = 0x728dc8ba40781809;//0xfbbbc9741902c44c;// at 0 to
sort to fcccbbb997444210
nibble_sort_scotty(&test);
printf("%lx\n", test);
}
*/
在 MM2/XMM2 中建立混洗控制掩码
- 1. 指令指针是一个程序可见的寄存器吗?
- 2. x86程序集MOV指令 - 寄存器到寄存器和存储器到存储器
- 3. 是否有任何指令将所有字节添加到MMX寄存器中?
- 4. 程序集retn,jmp,进程寄存器
- 5. 为什么setjmp(3)不保存AMD64上的所有寄存器?
- 6. 加载到寄存器中的指令
- 7. sparc程序集和%y寄存器
- 8. 程序集 - .data,.code和寄存器...?
- 9. 有没有办法让一个字大小在32位寄存器指令
- 10. LPC111x系列是否支持带有高寄存器的MOV指令?
- 11. 如何检查x86_64程序集中的寄存器是否为零
- 12. 设备是否有寄存器?
- 13. JTAG指令寄存器长度,IR_LEN
- 14. 指令DB和AX寄存器
- 15. 特权指令,添加寄存器值?
- 16. 标签VS在寄存器指令序列(SICP)
- 17. 是否可以在实模式下使用32位寄存器/指令?
- 18. 在gcc内联x86_64程序集中约束r10寄存器
- 19. 在程序集中显示寄存器的值
- 20. 是否有一个CALL指令支持带偏移量的寄存器间接寻址?
- 21. Linux AMD64中使用的fs/gs寄存器如何?
- 22. 检查一个HtmlElement是否有指令
- 23. 有没有办法在一个dll中打包多个.NET程序集?
- 24. 程序中是否有程序集?
- 25. 问题存储一个字节到寄存器x86-64程序集
- 26. Sparc程序集 - 整个函数调用中的全局寄存器不一致
- 27. x86程序集从32位寄存器中获取一个字节
- 28. 将涉及CPU寄存器的指令转换为存储器指令
- 29. 程序集8086:仅汇总MOV和LEA的2个寄存器
- 30. 是否有XSL“包含”指令?
你总是要移动整个字节吗?如果是的话,SSSE3 的 'pshufb' 是正确的选择。如果你想以更细的粒度(例如按位)进行打包,请使用 BMI2 的 'pext'(如果可用)。 (请参阅[我刚刚使用pext/pdep写的另一个答案](http://stackoverflow.com/a/36951611/224132)。)顺便说一句,英特尔的指令集参考手册很不错,对于需要图示的指令,它配有很好的图表。请参阅[x86标记wiki](http://stackoverflow.com/tags/x86/info)以获取该手册和英特尔 intrinsics 指南的链接。 –
为了澄清,你想重排的是通用寄存器,而不是向量寄存器?您可能需要查看 'BMI2' 中添加的 'pext/pdep' 指令。 – EOF
@EOF 由于我使用的是纯汇编,因此如果需要的话,我可以在寄存器之间移动这些值。不过,谢谢你的信息! – user69874