ARM和Intel的内在函数（intrinsics）对AES使用相同的子密钥调度（key schedule）吗？

我试图在内核中嵌入ARMv8上的AES实现。我有一个C++实现，并且我有一个英特尔内核实现。ARM和Intel的内在函数（intrinsics）对AES使用相同的子密钥调度（key schedule）吗？

两种实现应该是等效的，所以我试图把英特尔的实现用作ARMv8的蓝本。两者有一些差异，但我已经把这些差异考虑在内了。问题是，我得到了不同的结果。

// Encrypts one 16-byte block from `in` into `out` using the caller-supplied
// round keys `rdkeys`; `rounds` is the AES round count (the caller in this
// post passes 10). Uses ARMv8 Crypto intrinsics when __ARM_FEATURE_CRYPTO is
// defined, otherwise AES-NI when __AES__ is defined. If neither macro is
// defined the body is empty and `out` is not written.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
#if defined(__ARM_FEATURE_CRYPTO)

    // Load the 16-byte input block
    uint8x16_t data = vld1q_u8(in);

    // AES encryption with ARM intrinsics:
    // rnds-1 (9 for AES128) cycles of AES:
    // (Add, Shift, Sub) plus Mix Columns
    //
    // NOTE(review): the loop below iterates `rounds` times, not `rounds-1`
    // as the comment above describes. After the loop i == rounds, so the two
    // statements that follow read rdkeys[rounds] and rdkeys[rounds+1]. With a
    // key array of rounds+1 entries (as the caller in this post declares),
    // the last read is one element past the end of the array.
    unsigned int i;
    for (i=0; i<rounds; ++i)
    {
     // AES single round encryption
     data = vaeseq_u8(data, rdkeys[i]);
     // AES mix columns
     data = vaesmcq_u8(data);
    }
    // One round of encryption: AES, no Mix Columns
    // (uses rdkeys[i], then advances i to the key for the final xor)
    data = vaeseq_u8(data, rdkeys[i++]);
    // Final Add (bitwise Xor)
    data = veorq_u8(data, rdkeys[i]);
    // Store the 16-byte result
    vst1q_u8(out, data);

#elif defined(__AES__)

    // Load the block and xor in round key 0 before the first round
    __m128i data = _mm_loadu_si128((const __m128i*)in);
    data = _mm_xor_si128(data, rdkeys[0]);
    // Rounds with keys 1 .. rounds-2
    for (unsigned int i=1; i<rounds-1; ++i)
    {
     data = _mm_aesenc_si128(data, rdkeys[i]);
    }
    // Round with key rounds-1, then the last round with key `rounds`
    data = _mm_aesenc_si128(data, rdkeys[rounds-1]);
    data = _mm_aesenclast_si128(data, rdkeys[rounds]);
    _mm_storeu_si128((__m128i*)out, data);

#endif
}

目前，我想先绕开子密钥的计算。我在这两种实现中使用同一组轮密钥（round keys）：

// RoundKey is the platform's 128-bit vector type; Byte is a plain octet.
// Neither type is defined when neither feature macro is set.
#if defined(__ARM_FEATURE_CRYPTO)
typedef uint8x16_t RoundKey;
typedef uint8_t Byte;
#elif defined(__AES__)
typedef __m128i RoundKey;
typedef uint8_t Byte;
#endif

// Avoid subkey scheduling at this point
// Each round key i is filled with a single repeated byte whose high and low
// nibbles are both i (0x00, 0x11, 0x22, ...), so both platforms see the same
// fixed keys. The pattern only stays distinct per key while i < 16.
RoundKey rdkeys[ROUNDS+1];
for (size_t i=0; i<COUNTOF(rdkeys); ++i)
    memset(&rdkeys[i], (i<<4)|i, sizeof(RoundKey));

但是，我得到了不同的结果。下面是转储（dump）输出的内容：

英特尔AES-NI：

In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF 
... 
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 
Data: 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07 
... 
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 
Data: 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA 
Data: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 
... 

Out: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69

ARMv8 AES：

In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF 
... 
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 
Data: C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 
... 
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 
Data: C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA 
Data: F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 
... 
Out: F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9

上面的结果让我百思不得其解。添加更多的printf也无助于找出问题。我开始怀疑英特尔和ARM的内在函数使用了不同的子密钥调度。

ARM和Intel的内在函数（intrinsics）对AES使用相同的子密钥调度（key schedule）吗？

下面的图片来自Cynthia Crutchfield的论文。它考察了英特尔内在函数与ARM内在函数之间的映射关系。

下面是完整的程序，以及各平台对应的编译命令行。

英特尔：

g++ -Wall -maes aes-test.cxx -o aes-test.exe

ARMv8：

g++ -Wall -march=armv8-a+crc+crypto -mtune=cortex-a53 aes-test.cxx -o aes-test.exe

程序：

#include <stdio.h> 
#include <stdlib.h> 
#include <stdint.h> 
#include <string.h> 

#if defined(__ARM_FEATURE_CRYPTO) 
# include <arm_neon.h> 
# include <arm_acle.h> 
#elif defined(__AES__) 
# include <wmmintrin.h> 
# include <emmintrin.h> 
#endif 

// Platform-specific types. RoundKey is the 128-bit vector type taken by the
// AES intrinsics on each platform; Byte is a plain octet.
// NOTE(review): if neither feature macro is defined, RoundKey and Byte are
// left undefined and the declarations below that use them will not compile.
#if defined(__ARM_FEATURE_CRYPTO)
typedef uint8x16_t RoundKey;
typedef uint8_t Byte;
#elif defined(__AES__)
typedef __m128i RoundKey;
typedef uint8_t Byte;
#endif

// Number of elements in a true array (not a pointer): total size divided by
// the size of its first element.
#define COUNTOF(x) (sizeof(x) / sizeof((x)[0]))

// Round count passed to AES_encrypt by main(); the key array there holds
// ROUNDS+1 round keys.
static const unsigned int ROUNDS=10;
// Encrypts one 16-byte block `in` into `out` with the supplied round keys.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds);
// Declared for symmetry; no definition appears in this listing.
void AES_decrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds);

void Print(const char* label, const Byte *in, size_t len, bool lf=false) 
{ 
    if (label) 
     printf("%s: ", label); 

    for (size_t i=0; in && i<len; ++i) 
     printf("%02X ", in[i]);  
    printf("\n"); 

    if (lf) 
     printf("\n"); 
} 

// Test driver: encrypts a single all-0xFF block with fixed, hand-made round
// keys so the ARM and Intel code paths can be compared without involving a
// key schedule. AES_encrypt prints the intermediate state itself.
int main(int argc, char* argv[])
{
    // Output buffer for the ciphertext block. (The unused `recover` buffer
    // was dropped; it triggered -Wunused-variable under -Wall.)
    Byte cipher[16];
    const Byte plain[16] = {
     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
    };

    // Avoid subkey scheduling at this point:
    // round key i is 16 copies of the byte whose two nibbles are both i
    // (0x00, 0x11, ..., 0xAA for ROUNDS == 10).
    RoundKey rdkeys[ROUNDS+1];
    for (size_t i=0; i<COUNTOF(rdkeys); ++i)
     memset(&rdkeys[i], (int)((i<<4)|i), sizeof(rdkeys[i]));

    AES_encrypt(plain, cipher, rdkeys, ROUNDS);

    return 0;
}

// Instrumented single-block AES encryption: encrypts the 16 bytes at `in`
// into `out` with the caller-supplied round keys, printing the key and the
// state after every step. Uses ARMv8 Crypto intrinsics when
// __ARM_FEATURE_CRYPTO is defined, otherwise AES-NI when __AES__ is defined.
// If neither macro is defined, `out` is never written but is still printed.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
    Print("In", in, 16);

#if defined(__ARM_FEATURE_CRYPTO)

    // Load the block
    uint8x16_t data = vld1q_u8(in);

    Print("Data (in)", (Byte*)&data, 16, true);

    // AES encryption with ARM intrinsics:
    // rnds-1 (9 for AES128) cycles of AES:
    // (Add, Shift, Sub) plus Mix Columns
    //
    // NOTE(review): the loop below iterates `rounds` times, not `rounds-1`
    // as the comment above describes. After the loop i == rounds, so the
    // code that follows reads rdkeys[rounds] and then rdkeys[rounds+1].
    // main() passes an array of ROUNDS+1 keys with rounds == ROUNDS, so the
    // final xor reads one element past the end of that array.
    unsigned int i;
    for (i=0; i<rounds; ++i)
    {
     // AES single round encryption
     data = vaeseq_u8(data, rdkeys[i]);
     // AES mix columns
     data = vaesmcq_u8(data);

     Print("Key", (Byte*)&rdkeys[i], 16);
     Print("Data", (Byte*)&data, 16, true);
    }

    Print("Key", (Byte*)&rdkeys[i], 16);

    // One round of encryption: AES, no Mix Columns
    // (uses rdkeys[i], then advances i to the key for the final xor)
    data = vaeseq_u8(data, rdkeys[i++]);

    Print("Data", (Byte*)&data, 16, true);

    // Final Add (bitwise Xor)
    data = veorq_u8(data, rdkeys[i]);

    Print("Data (xor)", (Byte*)&data, 16);

    // Store the output data
    vst1q_u8(out, data);

#elif defined(__AES__)

    // Load the block
    __m128i data = _mm_loadu_si128((const __m128i*)in);

    Print("Data (in)", (Byte*)&data, 16);

    // Xor in round key 0 before the first round
    data = _mm_xor_si128(data, rdkeys[0]);

    Print("Key", (Byte*)&rdkeys[0], 16);
    Print("Data (xor)", (Byte*)&data, 16, true);

    // Rounds with keys 1 .. rounds-2
    for (unsigned int i=1; i<rounds-1; ++i)
    {
     data = _mm_aesenc_si128(data, rdkeys[i]);

     Print("Key", (Byte*)&rdkeys[i], 16);
     Print("Data", (Byte*)&data, 16, true);
    }
    // Round with key rounds-1
    data = _mm_aesenc_si128(data, rdkeys[rounds-1]);

    Print("Key", (Byte*)&rdkeys[rounds-1], 16);
    Print("Data", (Byte*)&data, 16, true);

    // Last round, with key `rounds`
    data = _mm_aesenclast_si128(data, rdkeys[rounds]);

    Print("Key", (Byte*)&rdkeys[rounds], 16);
    Print("Data", (Byte*)&data, 16, true);

    // Store the output data
    _mm_storeu_si128((__m128i*)out, data);

#endif

    Print("Out", out, 16);
}

来源

2017-08-06 jww

我注意到数据值之间的差异似乎要么是0x90（144），要么是0xBE（190）……（例如0xF9 - 0x69、0xC3 - 0x33等），也许这有助于缩小问题的范围……ARM的输出看起来有点奇怪。 –

ARM和Intel内部函数对AES使用相同的子密钥计划吗？

看来答案是肯定的。我仍然需要对真正的密钥调度进行测试，但是我能够使用相同的密钥计划对英特尔和ARMv8内在函数产生相同的结果。

看起来Crutchfield的参考实现中存在一个差一（off-by-one）错误。循环控制条件应该使用rounds-1，而不是rounds。这意味着我在ARMv8上测试的是11轮，而不是10轮。当ARMv8代码生成F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9而不是F9 F9 ... F9 F9时，我就本该有所怀疑。

下面是更新后的代码：

// Encrypts the 16-byte block at `in` into `out` using the caller-supplied
// round keys rdkeys[0..rounds]. The ARMv8 path is taken when
// __ARM_FEATURE_CRYPTO is defined, the AES-NI path when __AES__ is defined;
// with neither, the function does nothing.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
#if defined(__ARM_FEATURE_CRYPTO)

    // Number of full rounds (vaeseq + vaesmcq) before the final round
    const unsigned int fullRounds = rounds - 1;

    uint8x16_t block = vld1q_u8(in);

    unsigned int r = 0;
    while (r < fullRounds)
    {
        // Key xor / SubBytes / ShiftRows, then MixColumns
        block = vaesmcq_u8(vaeseq_u8(block, rdkeys[r]));
        ++r;
    }

    // Final round has no MixColumns; the last key is applied with a plain xor
    block = vaeseq_u8(block, rdkeys[r]);
    block = veorq_u8(block, rdkeys[r + 1]);

    vst1q_u8(out, block);

#elif defined(__AES__)

    // Loop bound for the rounds handled inside the loop (keys 1 .. rounds-2)
    const unsigned int loopEnd = rounds - 1;

    // Load the block and xor in round key 0
    __m128i block = _mm_xor_si128(_mm_loadu_si128((const __m128i*)in), rdkeys[0]);

    unsigned int r = 1;
    while (r < loopEnd)
    {
        block = _mm_aesenc_si128(block, rdkeys[r]);
        ++r;
    }

    // One more full round, then the last round with the following key
    block = _mm_aesenc_si128(block, rdkeys[r]);
    block = _mm_aesenclast_si128(block, rdkeys[r + 1]);
    _mm_storeu_si128((__m128i*)out, block);

#endif
}

来源

2017-08-06 09:10:49 jww

因此，ARM代码现在正在运行？ –

@MaartenBodewes - 是的，代码现在可以正常运行了。我已在测试分支中检入了[修复ARMv8 AES加密](https://github.com/noloader/cryptopp/commit/701ec3aa1f45a754)和[修复ARMv8 AES解密](https://github.com/noloader/cryptopp/commit/14590423249d)这两个提交。一旦通过测试，我会将其推送到master。它的运行速度约为2.8至3.0 cpb，比其他表现最佳的实现慢约1.0 cpb，但相比C++实现的30-45 cpb要好得多。 – jww

ARM和Intel内部函数对AES使用相同的子密钥计划吗？

回答

相关问题