2017-08-06 45 views
3

我正尝试在ARMv8上使用内部函数(intrinsics)接入一个AES实现。我有一个C++实现,也有一个英特尔内部函数实现。ARM和Intel内部函数对AES使用相同的子密钥计划吗?

两种实现应该是等效的,所以我试图以英特尔的实现作为ARMv8的蓝图。两者有一些差异,但这些差异都已考虑在内。问题是,我得到了不同的结果。

// Encrypts the 16-byte block at `in` into `out` using caller-supplied round
// keys. The ARMv8 Crypto path is used when __ARM_FEATURE_CRYPTO is defined,
// otherwise the Intel AES-NI path when __AES__ is defined; if neither macro
// is defined the body is empty and `out` is left untouched.
//   in     - input block (16 bytes are loaded)
//   out    - output block (16 bytes are stored)
//   rdkeys - round keys; the Intel path reads rdkeys[0..rounds]
//   rounds - round count (the caller passes 10)
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds) 
{ 
#if defined(__ARM_FEATURE_CRYPTO) 

    uint8x16_t data = vld1q_u8(in); 

    // AES encryption with ARM intrinsics: 
    // rnds-1 (9 for AES128) cycles of AES: 
    // (Add, Shift, Sub) plus Mix Columns 
    // NOTE(review): the loop bound below is `rounds`, not `rounds-1`, so the
    // body runs `rounds` times (i = 0..rounds-1), contradicting the comment
    // above. Counting the vaeseq_u8 after the loop, rounds+1 key-mixing steps
    // are applied, and the final XOR indexes rdkeys[rounds+1], one element
    // past an array of rounds+1 keys.
    unsigned int i; 
    for (i=0; i<rounds; ++i) 
    { 
     // AES single round encryption 
     data = vaeseq_u8(data, rdkeys[i]); 
     // AES mix columns 
     data = vaesmcq_u8(data); 
    } 
    // One round of encryption: AES, no Mix Columns 
    // Here i == rounds; the post-increment leaves i == rounds+1.
    data = vaeseq_u8(data, rdkeys[i++]); 
    // Final Add (bitwise Xor) 
    data = veorq_u8(data, rdkeys[i]); 
    vst1q_u8(out, data); 

#elif defined(__AES__) 

    // Intel path: XOR with rdkeys[0], _mm_aesenc_si128 with rdkeys[1..rounds-1],
    // then _mm_aesenclast_si128 with rdkeys[rounds].
    __m128i data = _mm_loadu_si128((const __m128i*)in); 
    data = _mm_xor_si128(data, rdkeys[0]); 
    for (unsigned int i=1; i<rounds-1; ++i) 
    { 
     data = _mm_aesenc_si128(data, rdkeys[i]); 
    } 
    data = _mm_aesenc_si128(data, rdkeys[rounds-1]); 
    data = _mm_aesenclast_si128(data, rdkeys[rounds]); 
    _mm_storeu_si128((__m128i*)out, data); 

#endif 
} 

目前我想先绕过子密钥计算。我在这两种实现中使用同一组轮密钥:

// RoundKey is the platform vector type used to hold one round key;
// Byte is the element type of the plaintext/ciphertext buffers.
#if defined(__ARM_FEATURE_CRYPTO) 
typedef uint8x16_t RoundKey; 
typedef uint8_t Byte; 
#elif defined(__AES__) 
typedef __m128i RoundKey; 
typedef uint8_t Byte; 
#endif 

// Avoid subkey scheduling at this point 
// Instead, every byte of round key i is filled with (i<<4)|i, giving the
// repeating patterns 0x00, 0x11, 0x22, ... for i = 0, 1, 2, ...
RoundKey rdkeys[ROUNDS+1]; 
for (size_t i=0; i<COUNTOF(rdkeys); ++i) 
    memset(&rdkeys[i], (i<<4)|i, sizeof(RoundKey)); 

但是,我得到了不同的结果。以下是两种实现的转储输出:

英特尔AES-NI

In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF 
... 
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 
Data: 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07 
... 
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 
Data: 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA 
Data: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 
... 

Out: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 

ARMv8 AES

In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF 
... 
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 
Data: C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 
... 
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 
Data: C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA 
Data: F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 
... 
Out: F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9 

上面的结果让我百思不得其解。添加更多的printf也无助于找出问题。我开始怀疑英特尔和ARM内部函数使用了不同的子密钥计划。

ARM和Intel内部函数对AES使用相同的子密钥计划吗?


下面的图片来自Cynthia Crutchfield的论文。该论文研究了英特尔内部函数与ARM内部函数之间的映射。

enter image description here


下面是完整的程序。列出了它们的命令行。

英特尔

g++ -Wall -maes aes-test.cxx -o aes-test.exe 

ARMv8

g++ -Wall -march=armv8-a+crc+crypto -mtune=cortex-a53 aes-test.cxx -o aes-test.exe 

程序

#include <stdio.h> 
#include <stdlib.h> 
#include <stdint.h> 
#include <string.h> 

#if defined(__ARM_FEATURE_CRYPTO) 
# include <arm_neon.h> 
# include <arm_acle.h> 
#elif defined(__AES__) 
# include <wmmintrin.h> 
# include <emmintrin.h> 
#endif 

// RoundKey is the platform vector type used to hold one round key;
// Byte is the element type of the plaintext/ciphertext buffers.
#if defined(__ARM_FEATURE_CRYPTO) 
typedef uint8x16_t RoundKey; 
typedef uint8_t Byte; 
#elif defined(__AES__) 
typedef __m128i RoundKey; 
typedef uint8_t Byte; 
#endif 

// Element count of a true array (not valid for pointers).
#define COUNTOF(x) (sizeof(x)/(sizeof(x)[0])) 

// Round count passed to AES_encrypt; main allocates ROUNDS+1 round keys.
static const unsigned int ROUNDS=10; 
// Single-block encrypt; defined later in this file.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds); 
// NOTE(review): declared only; no definition is visible in this listing.
void AES_decrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds); 

void Print(const char* label, const Byte *in, size_t len, bool lf=false) 
{ 
    if (label) 
     printf("%s: ", label); 

    for (size_t i=0; in && i<len; ++i) 
     printf("%02X ", in[i]);  
    printf("\n"); 

    if (lf) 
     printf("\n"); 
} 

// Test driver: encrypts one all-0xFF block with a fixed, synthetic set of
// round keys so that the Intel and ARM code paths can be compared without
// involving a real key schedule. AES_encrypt prints the intermediate state.
int main(int argc, char* argv[])
{
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    const Byte plain[16] = {
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
    };
    // Only the ciphertext buffer is needed: nothing in this program decrypts,
    // so the former unused `recover` buffer (a -Wall warning) is dropped.
    Byte cipher[16];

    // Avoid subkey scheduling at this point: every byte of round key i is
    // set to (i<<4)|i, i.e. 0x00, 0x11, 0x22, ... 0xAA for i = 0..ROUNDS.
    RoundKey rdkeys[ROUNDS+1];
    for (size_t i=0; i<COUNTOF(rdkeys); ++i)
    {
        // memset takes an int fill value; make the narrowing explicit.
        const int fill = (int)((i<<4)|i);
        memset(&rdkeys[i], fill, sizeof(rdkeys[i]));
    }

    AES_encrypt(plain, cipher, rdkeys, ROUNDS);

    return 0;
}

// Encrypts the 16-byte block at `in` into `out` using caller-supplied round
// keys, printing the input, each round key, the intermediate state and the
// output via Print(). The ARMv8 Crypto path is used when
// __ARM_FEATURE_CRYPTO is defined, otherwise the Intel AES-NI path when
// __AES__ is defined.
//   in     - input block (16 bytes are loaded)
//   out    - output block (16 bytes are stored, then printed)
//   rdkeys - round keys; the Intel path reads rdkeys[0..rounds]
//   rounds - round count (main passes ROUNDS)
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds) 
{ 
    Print("In", in, 16); 

#if defined(__ARM_FEATURE_CRYPTO) 

    // Load the block 
    uint8x16_t data = vld1q_u8(in); 

    Print("Data (in)", (Byte*)&data, 16, true); 

    // AES encryption with ARM intrinsics: 
    // rnds-1 (9 for AES128) cycles of AES: 
    // (Add, Shift, Sub) plus Mix Columns 
    // NOTE(review): the loop bound below is `rounds`, not `rounds-1`, so the
    // body runs `rounds` times (i = 0..rounds-1), contradicting the comment
    // above. Counting the vaeseq_u8 after the loop, rounds+1 key-mixing steps
    // are applied, and the final XOR indexes rdkeys[rounds+1], one element
    // past the ROUNDS+1 keys that main allocates.
    unsigned int i; 
    for (i=0; i<rounds; ++i) 
    { 
     // AES single round encryption 
     data = vaeseq_u8(data, rdkeys[i]); 
     // AES mix columns 
     data = vaesmcq_u8(data); 

     Print("Key", (Byte*)&rdkeys[i], 16); 
     Print("Data", (Byte*)&data, 16, true); 
    } 

    // Here i == rounds.
    Print("Key", (Byte*)&rdkeys[i], 16); 

    // One round of encryption: AES, no Mix Columns 
    // The post-increment leaves i == rounds+1 for the XOR below.
    data = vaeseq_u8(data, rdkeys[i++]); 

    Print("Data", (Byte*)&data, 16, true); 

    // Final Add (bitwise Xor) 
    data = veorq_u8(data, rdkeys[i]); 

    Print("Data (xor)", (Byte*)&data, 16); 

    // Store the output data 
    vst1q_u8(out, data); 

#elif defined(__AES__) 

    // Intel path: XOR with rdkeys[0], _mm_aesenc_si128 with rdkeys[1..rounds-1],
    // then _mm_aesenclast_si128 with rdkeys[rounds]; state is printed after
    // every step.
    __m128i data = _mm_loadu_si128((const __m128i*)in); 

    Print("Data (in)", (Byte*)&data, 16); 

    data = _mm_xor_si128(data, rdkeys[0]); 

    Print("Key", (Byte*)&rdkeys[0], 16); 
    Print("Data (xor)", (Byte*)&data, 16, true); 

    for (unsigned int i=1; i<rounds-1; ++i) 
    { 
     data = _mm_aesenc_si128(data, rdkeys[i]); 

     Print("Key", (Byte*)&rdkeys[i], 16); 
     Print("Data", (Byte*)&data, 16, true); 
    } 
    data = _mm_aesenc_si128(data, rdkeys[rounds-1]); 

    Print("Key", (Byte*)&rdkeys[rounds-1], 16); 
    Print("Data", (Byte*)&data, 16, true); 

    data = _mm_aesenclast_si128(data, rdkeys[rounds]); 

    Print("Key", (Byte*)&rdkeys[rounds], 16); 
    Print("Data", (Byte*)&data, 16, true); 

    _mm_storeu_si128((__m128i*)out, data); 

#endif 

    Print("Out", out, 16); 
} 
+1

我注意到两边Data值的差似乎总是0x90(144)或0xBE(190)……(例如0xF9 - 0x69、0xC3 - 0x33等等),也许这有助于缩小问题的范围……ARM的输出看起来有点奇怪。 –

回答

2

ARM和Intel内部函数对AES使用相同的子密钥计划吗?

看来答案是肯定的。我仍然需要对真正的密钥调度进行测试,但是我能够使用相同的密钥计划对英特尔和ARMv8内在函数产生相同的结果。

看起来Crutchfield的参考实现中有一个差一错误(off-by-one)。循环控制应该使用rounds-1,而不是rounds。这意味着我在ARMv8上测试的是11轮,而不是10轮。当ARMv8代码生成F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9而不是F9 F9 ... F9 F9时,我就应该有所怀疑。

下面是更新后的代码:

// Encrypts the 16-byte block at `in` into `out` with caller-supplied round
// keys, using ARMv8 Crypto intrinsics when __ARM_FEATURE_CRYPTO is defined
// and Intel AES-NI when __AES__ is defined. Both paths consume
// rdkeys[0..rounds], so the caller must supply rounds+1 keys.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
#if defined(__ARM_FEATURE_CRYPTO)

    const unsigned int lastRound = rounds - 1;
    uint8x16_t state = vld1q_u8(in);

    // Full rounds: AESE (key add, sub, shift) followed by AESMC (mix columns).
    unsigned int r = 0;
    while (r < lastRound)
    {
        state = vaesmcq_u8(vaeseq_u8(state, rdkeys[r]));
        ++r;
    }

    // Last round has no mix columns; the trailing key is applied by plain XOR.
    state = vaeseq_u8(state, rdkeys[r]);
    state = veorq_u8(state, rdkeys[r + 1]);

    vst1q_u8(out, state);

#elif defined(__AES__)

    const unsigned int lastRound = rounds - 1;

    // Initial key addition.
    __m128i state = _mm_xor_si128(_mm_loadu_si128((const __m128i*)in), rdkeys[0]);

    unsigned int r = 1;
    while (r < lastRound)
    {
        state = _mm_aesenc_si128(state, rdkeys[r]);
        ++r;
    }

    // One more full round, then the final round.
    state = _mm_aesenc_si128(state, rdkeys[r]);
    state = _mm_aesenclast_si128(state, rdkeys[r + 1]);
    _mm_storeu_si128((__m128i*)out, state);

#endif
}
+0

因此,ARM代码现在正在运行? –

+0

@MaartenBodewes - 是的,代码现在可以正常运行了。我已在测试分支中提交了[修复ARMv8 AES加密](https://github.com/noloader/cryptopp/commit/701ec3aa1f45a754)和[修复ARMv8 AES解密](https://github.com/noloader/cryptopp/commit/14590423249d)。一旦通过测试,我会将其推送到master。它的运行速度约为2.8至3.0 cpb,比其他表现最好的实现慢约1.0 cpb,但比C++实现的30-45 cpb要好得多。 – jww

相关问题