2013-08-31 129 views
9

我需要散列一个大数据库的值经常。因此,需要快速实施SHA-2散列器。我目前正在使用SHA256。SHA256性能优化C

我现在使用的sha256_transform算法是这样的: http://bradconte.com/sha256_c (下面的代码)

我已经对我的代码做了性能剖析(profiling),这个代码片段占了每次哈希计算时间的约96%,因此优化这个函数是实现我的目标的关键。

它使用名为data[]的64字节长二进制字符串进行操作,并将结果输出到ctx->state

我问这个函数的更快版本。请记住,即使稍作修改也会对速度造成负面影响。

// Short aliases used throughout this implementation.
#define uchar unsigned char 
#define uint unsigned int 

// Rotate the 32-bit value `a` left/right by `b` bits.
// NOTE(review): b must be in 1..31 — b == 0 or b == 32 would shift by
// the full type width, which is undefined behavior. Every use below
// passes a fixed constant in the valid range.
#define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b)))) 
#define ROTRIGHT(a,b) (((a) >> (b)) | ((a) << (32-(b)))) 

// FIPS 180-4 SHA-256 logical functions:
// CH  = "choose":   each bit of x selects between y and z.
// MAJ = "majority": each result bit is the majority of x, y, z.
#define CH(x,y,z) (((x) & (y))^(~(x) & (z))) 
#define MAJ(x,y,z) (((x) & (y))^((x) & (z))^((y) & (z))) 
// Big sigma functions used in the compression rounds.
#define EP0(x) (ROTRIGHT(x,2)^ROTRIGHT(x,13)^ROTRIGHT(x,22)) 
#define EP1(x) (ROTRIGHT(x,6)^ROTRIGHT(x,11)^ROTRIGHT(x,25)) 
// Small sigma functions used in the message-schedule expansion.
#define SIG0(x) (ROTRIGHT(x,7)^ROTRIGHT(x,18)^((x) >> 3)) 
#define SIG1(x) (ROTRIGHT(x,17)^ROTRIGHT(x,19)^((x) >> 10)) 

// Core SHA-256 compression function (FIPS 180-4, section 6.2.2).
// Absorbs one complete 64-byte message block `data` (big-endian word
// order) into the running hash chained in ctx->state[0..7].
//
// `k` is the table of 64 SHA-256 round constants; it is declared
// elsewhere in this file.
//
// Correctness fix vs. the original: `data[j]` is an unsigned char that
// is promoted to (signed) int, so `data[j] << 24` shifts a byte >= 0x80
// into the sign bit — undefined behavior in C (C11 6.5.7). Casting each
// byte to uint first makes the shift well defined; the resulting value
// is unchanged.
void sha256_transform(SHA256_CTX *ctx, uchar data[]) { 
    uint a,b,c,d,e,f,g,h,i,j,t1,t2,m[64]; 

    // Load the current chaining value into the working variables.
    a = ctx->state[0]; 
    b = ctx->state[1]; 
    c = ctx->state[2]; 
    d = ctx->state[3]; 
    e = ctx->state[4]; 
    f = ctx->state[5]; 
    g = ctx->state[6]; 
    h = ctx->state[7]; 

    // Message schedule, words 0-15: big-endian load of the 64-byte block.
    for (i=0,j=0; i < 16; i++, j += 4) 
     m[i] = ((uint)data[j] << 24) | ((uint)data[j+1] << 16) | ((uint)data[j+2] << 8) | ((uint)data[j+3]); 

    // Expand to the remaining 48 schedule words (sigma recurrence).
    for (; i < 64; i++) 
     m[i] = SIG1(m[i-2]) + m[i-7] + SIG0(m[i-15]) + m[i-16]; 

    // The 64 compression rounds: compute t1/t2, then rotate the eight
    // working variables down one position.
    for (i = 0; i < 64; ++i) { 
     t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i]; 
     t2 = EP0(a) + MAJ(a,b,c); 
     h = g; 
     g = f; 
     f = e; 
     e = d + t1; 
     d = c; 
     c = b; 
     b = a; 
     a = t1 + t2; 
    } 

    // Feed-forward: add this block's result into the chaining value
    // (Davies-Meyer construction).
    ctx->state[0] += a; 
    ctx->state[1] += b; 
    ctx->state[2] += c; 
    ctx->state[3] += d; 
    ctx->state[4] += e; 
    ctx->state[5] += f; 
    ctx->state[6] += g; 
    ctx->state[7] += h; 
} 
+0

如果您很高兴将您的代码限制为x86,那么看起来可能有使用SSE/AVX2进行SIMD优化的机会。 –

+2

需要96%的时间不是因为写得不好,而是因为它本身就很复杂。这已经得到了很好的优化,所以如果你需要花更少的时间来计算它,请寻找减少调用次数的方法。 – dasblinkenlight

+0

您现在的代码现在无法执行,因为这会让您的CPU达到新的热量高度? – WhozCraig

回答

6

您可能想要查看并剖析(checkout/profile)这个SHA256实现:implementation of SHA256

它被cgminer(一种流行的比特币挖矿软件)所使用,该实现是专门为性能而编写的。它包括使用SSE2的4路SIMD实现(4-way SIMD implementations using SSE2),并且遵循与问题中提到的bradconte sha256_transform算法相同的方法。代码太长,无法在此重现。

此外,许可证相当宽容,只要原始作者被认可就允许重新使用/分发。

+0

嗯,只是好奇。从你链接到的C源代码,你提到的'使用SSE2的4路SIMD实现'在哪里? – c00000fd

+1

@ c00000fd更新了答案,并直接链接到[** sha256_4way.c **](https://github.com/pshep/cgminer/blob/master/sha256_4way.c#L102)。 – TheCodeArtist

0

查看Brian Gladman博士的实现 - http://www.gladman.me.uk/。它比cgminer中的那个快大约15%。我认为不使用SSE的话很难做得更好。

3

SHA256性能优化...

现在,Goldmont微架构已经发布,其中包括英特尔的SHA扩展指令。使用这些CPU指令可以使压缩函数获得5到6倍的加速。例如,proposed code for a crypto library witnessed the following(测试在Celeron J3455上进行,基础频率1.5 GHz,睿频可达2.3 GHz):

  • C++实现
$ ./botan speed --msec=3000 SHA-1 SHA-224 SHA-256 
    SHA-160 [base] hash 274.826 MiB/sec (824.480 MiB in 3000.009 ms) 
    SHA-224 [base] hash 92.349 MiB/sec (277.051 MiB in 3000.027 ms) 
    SHA-256 [base] hash 92.364 MiB/sec (277.094 MiB in 3000.027 ms) 
  • 英特尔SHA扩展
$ ./botan speed --msec=3000 SHA-1 SHA-224 SHA-256 
    SHA-160 [base] hash 1195.907 MiB/sec (3587.723 MiB in 3000.000 ms) 
    SHA-224 [base] hash 535.740 MiB/sec (1607.219 MiB in 3000.000 ms) 
    SHA-256 [base] hash 535.970 MiB/sec (1607.914 MiB in 3000.005 ms) 

这里使用英特尔SHA扩展的SHA256压缩功能的代码与内在。它基于Sean Gulley的博客Intel® SHA Extensions和他的示例代码mitls | hacl-star | experimental

下面的compress函数只处理完整的64字节块。您需要自行设置初始状态,并对最后一个块进行填充;看起来你的示例代码已经涵盖了这些步骤。

#include <immintrin.h> 
... 

// SHA-256 compression using the x86 SHA extensions (SHA-NI intrinsics).
//
// Processes `blocks` complete 64-byte message blocks from `input`,
// updating the eight 32-bit words of `state` in place. The caller is
// responsible for initializing `state` and for padding the final block.
//
// Register layout: the sha256rnds2 instruction operates on the state
// split as (A,B,E,F) and (C,D,G,H) across two XMM registers, so the
// prologue shuffles the linear state[0..7] into that form and the
// epilogue shuffles it back before storing.
//
// The 64-bit constants packed via _mm_set_epi64x in each round group
// are the SHA-256 round constants K[t] (four per group).
//
// NOTE(review): requires a CPU with SHA-NI (and SSE4.1 for
// _mm_blend_epi16); presumably built with -msha -msse4.1 or
// equivalent — confirm against the project's build flags.
void compress(uint32_t state[8], const uint8_t input[], size_t blocks) 
{ 
    __m128i STATE0, STATE1; 
    __m128i MSG, TMP, MASK; 
    __m128i TMSG0, TMSG1, TMSG2, TMSG3; 
    __m128i ABEF_SAVE, CDGH_SAVE; 

    // Load initial values 
    TMP = _mm_loadu_si128((__m128i*) &state[0]); 
    STATE1 = _mm_loadu_si128((__m128i*) &state[4]); 
    // Byte-shuffle mask: converts each 32-bit message word from
    // big-endian (wire order) to the lane order the instructions expect.
    MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); 

    TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB 
    STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH 
    STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF 
    STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH 

    while (blocks) 
    { 
     // Save current hash 
     ABEF_SAVE = STATE0; 
     CDGH_SAVE = STATE1; 

     // Rounds 0-3 
     // Each group: add round constants to 4 message words, run two
     // sha256rnds2 (two rounds each); the 0x0E shuffle moves the upper
     // two words down for the second pair of rounds.
     MSG = _mm_loadu_si128((const __m128i*) (input+0)); 
     TMSG0 = _mm_shuffle_epi8(MSG, MASK); 
     MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 

     // Rounds 4-7 
     TMSG1 = _mm_loadu_si128((const __m128i*) (input+16)); 
     TMSG1 = _mm_shuffle_epi8(TMSG1, MASK); 
     MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
     // sha256msg1/msg2 pairs below compute the message-schedule
     // expansion (W[16..63]) incrementally alongside the rounds.
     TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); 

     // Rounds 8-11 
     TMSG2 = _mm_loadu_si128((const __m128i*) (input+32)); 
     TMSG2 = _mm_shuffle_epi8(TMSG2, MASK); 
     MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
     TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); 

     // Rounds 12-15 
     TMSG3 = _mm_loadu_si128((const __m128i*) (input+48)); 
     TMSG3 = _mm_shuffle_epi8(TMSG3, MASK); 
     MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     // alignr supplies the W[t-7] term that sha256msg1/msg2 don't cover.
     TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); 
     TMSG0 = _mm_add_epi32(TMSG0, TMP); 
     TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
     TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); 

     // Rounds 16-19 
     MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); 
     TMSG1 = _mm_add_epi32(TMSG1, TMP); 
     TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
     TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); 

     // Rounds 20-23 
     MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); 
     TMSG2 = _mm_add_epi32(TMSG2, TMP); 
     TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
     TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); 

     // Rounds 24-27 
     MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); 
     TMSG3 = _mm_add_epi32(TMSG3, TMP); 
     TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
     TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); 

     // Rounds 28-31 
     MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); 
     TMSG0 = _mm_add_epi32(TMSG0, TMP); 
     TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
     TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); 

     // Rounds 32-35 
     MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); 
     TMSG1 = _mm_add_epi32(TMSG1, TMP); 
     TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
     TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); 

     // Rounds 36-39 
     MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); 
     TMSG2 = _mm_add_epi32(TMSG2, TMP); 
     TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
     TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1); 

     // Rounds 40-43 
     MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); 
     TMSG3 = _mm_add_epi32(TMSG3, TMP); 
     TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
     TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2); 

     // Rounds 44-47 
     MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4); 
     TMSG0 = _mm_add_epi32(TMSG0, TMP); 
     TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
     TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3); 

     // Rounds 48-51 
     MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4); 
     TMSG1 = _mm_add_epi32(TMSG1, TMP); 
     TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 
     TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0); 

     // Rounds 52-55 
     // (No more msg1 calls from here on: the schedule only needs
     // words through W[63].)
     MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4); 
     TMSG2 = _mm_add_epi32(TMSG2, TMP); 
     TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 

     // Rounds 56-59 
     MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4); 
     TMSG3 = _mm_add_epi32(TMSG3, TMP); 
     TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 

     // Rounds 60-63 
     MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); 
     STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); 
     MSG = _mm_shuffle_epi32(MSG, 0x0E); 
     STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); 

     // Add values back to state (Davies-Meyer feed-forward)
     STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); 
     STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); 

     input += 64; 
     blocks--; 
    } 

    // Undo the ABEF/CDGH register layout back to linear word order.
    TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA 
    STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG 
    STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA 
    STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // ABEF 

    // Save state 
    _mm_storeu_si128((__m128i*) &state[0], STATE0); 
    _mm_storeu_si128((__m128i*) &state[4], STATE1); 
} 

你可以在Noloader GitHub | SHA-Intrinsics找到英特尔SHA内在函数和ARMv8 SHA内在函数的源码。它们是C源文件,为SHA-1、SHA-224和SHA-256提供压缩函数。基于内在函数的实现使SHA-1的吞吐量提高约3到4倍,使SHA-224和SHA-256的吞吐量提高约6到12倍。