0
当我编译这个简单的逐字(word-wise)XOR 示例时,Clang 3.4 不会对它进行矢量化。为什么?这似乎比 http://llvm.org/docs/Vectorizers.html#features 中的示例更简单,而且各个字之间没有数据依赖,所以从理论上讲,这段代码是可以矢量化的。(标题:为什么 Clang 没有对大整数 XOR 进行矢量化?)
#include <stdio.h>
/*
 * XOR two 5-word arrays element by element and print the five results
 * as zero-padded 8-digit hexadecimal, from index 4 down to index 0,
 * followed by a newline.
 */
void do_xor(const unsigned int num1[5], const unsigned int num2[5]) {
    unsigned int result[5];
    int idx;

    /* word-wise xor: each output word depends only on the same-index inputs */
    for (idx = 0; idx != 5; idx++) {
        result[idx] = num2[idx] ^ num1[idx];
    }

    /* print from the highest index down to 0 */
    idx = 5;
    while (idx-- > 0) {
        printf("%08x", result[idx]);
    }
    printf("\n");
}
使用 clang -O3 -fslp-vectorize-aggressive -emit-llvm .....
(试图强制矢量化)编译后,我们最终得到如下结果(已去掉与功能无关的部分):
; Emitted IR for do_xor: five separate scalar i32 load/load/xor groups,
; one per array index 0..4. No vector types or vector instructions appear.
define void @do_xor(i32* nocapture readonly %num1, i32* nocapture readonly %num2) #0 {
.preheader4:
; index 0: %2 = num2[0] ^ num1[0]
%0 = load i32* %num1, align 4, !tbaa !1
%1 = load i32* %num2, align 4, !tbaa !1
%2 = xor i32 %1, %0
; index 1: %7 = num2[1] ^ num1[1]
%3 = getelementptr inbounds i32* %num1, i64 1
%4 = load i32* %3, align 4, !tbaa !1
%5 = getelementptr inbounds i32* %num2, i64 1
%6 = load i32* %5, align 4, !tbaa !1
%7 = xor i32 %6, %4
; index 2: %12 = num2[2] ^ num1[2]
%8 = getelementptr inbounds i32* %num1, i64 2
%9 = load i32* %8, align 4, !tbaa !1
%10 = getelementptr inbounds i32* %num2, i64 2
%11 = load i32* %10, align 4, !tbaa !1
%12 = xor i32 %11, %9
; index 3: %17 = num2[3] ^ num1[3]
%13 = getelementptr inbounds i32* %num1, i64 3
%14 = load i32* %13, align 4, !tbaa !1
%15 = getelementptr inbounds i32* %num2, i64 3
%16 = load i32* %15, align 4, !tbaa !1
%17 = xor i32 %16, %14
; index 4: %22 = num2[4] ^ num1[4]
%18 = getelementptr inbounds i32* %num1, i64 4
%19 = load i32* %18, align 4, !tbaa !1
%20 = getelementptr inbounds i32* %num2, i64 4
%21 = load i32* %20, align 4, !tbaa !1
%22 = xor i32 %21, %19
; Five printf calls consume the xor results in the order index 4, 3, 2, 1, 0.
; @.str is a 5-byte constant defined outside this excerpt (presumably the
; "%08x" format string; confirm against the full module).
%23 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i64 0, i64 0), i32 %22) #2
%24 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i64 0, i64 0), i32 %17) #2
%25 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i64 0, i64 0), i32 %12) #2
%26 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i64 0, i64 0), i32 %7) #2
%27 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([5 x i8]* @.str, i64 0, i64 0), i32 %2) #2
; putchar(10) writes character code 10 ('\n' in ASCII); presumably what the
; source-level printf("\n") was turned into.
%putchar = tail call i32 @putchar(i32 10) #2
ret void
}
生成的代码并没有被矢量化。为什么?
该标志并不会“强制”任何事情;而且,算上加载/设置的开销,矢量化后的循环不可能比并行发射的 5 条简单快速的按位异或指令更快。另外,第五个元素要如何高效处理? –
我认为,IR 应该包含矢量代码,然后由机器代码生成阶段来决定矢量指令是否比逐条标量指令更高效,并生成相应的代码。第五个元素总是可以用单条指令单独处理——这应该没问题吧? –
您应该使参数对齐到16个字节,然后才能对代码进行矢量化 –