From 6173112e9b2f26dc18302064ce1f36717b958aaf Mon Sep 17 00:00:00 2001
From: Dakoda Greaves
Date: Fri, 3 May 2024 18:39:43 -0700
Subject: [PATCH] crypto: add multi-block support to sha1

---
 src/crypto/sha1.cpp | 529 ++++++++++++++++++++++----------------------
 1 file changed, 268 insertions(+), 261 deletions(-)

diff --git a/src/crypto/sha1.cpp b/src/crypto/sha1.cpp
index a6c353264c7..b8ad75a90f2 100644
--- a/src/crypto/sha1.cpp
+++ b/src/crypto/sha1.cpp
@@ -81,7 +81,7 @@ const uint32_t k3 = 0x8F1BBCDCul;
 const uint32_t k4 = 0xCA62C1D6ul;
 
 /** Perform a SHA-1 transformation, processing a 64-byte chunk. (ARMv8) */
-void Transform_ARMV8(uint32_t* s, const unsigned char* chunk)
+void Transform_ARMV8(uint32_t* s, const unsigned char* chunk, size_t blocks)
 {
 #if defined(USE_ARMV8) || defined(USE_ARMV82)
     // this entire block is experimental
@@ -96,160 +96,163 @@ void Transform_ARMV8(uint32_t* s, const unsigned char* chunk)
     ABCD = vld1q_u32(&s[0]);
     E0 = s[4];
 
-    /** Save state */
-    ABCD_SAVED = ABCD;
-    E0_SAVED = E0;
-
-    /** Load message */
-    MSG0 = vld1q_u32((const uint32_t*)(chunk));
-    MSG1 = vld1q_u32((const uint32_t*)(chunk + 16));
-    MSG2 = vld1q_u32((const uint32_t*)(chunk + 32));
-    MSG3 = vld1q_u32((const uint32_t*)(chunk + 48));
-
-    /** Reverse for little endian */
-    MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
-    MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
-    MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
-    MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
-
-    TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x5A827999));
-    TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x5A827999));
-
-    /** Rounds 0-3 */
-    E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1cq_u32(ABCD, E0, TMP0);
-    TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x5A827999));
-    MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
-
-    /** Rounds 4-7 */
-    E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1cq_u32(ABCD, E1, TMP1);
-    TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x5A827999));
-    MSG0 = vsha1su1q_u32(MSG0, MSG3);
-    MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
-
-    /** Rounds 8-11 */
-    E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1cq_u32(ABCD, E0, TMP0);
-    TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x5A827999));
-    MSG1 = vsha1su1q_u32(MSG1, MSG0);
-    MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
-
-    /** Rounds 12-15 */
-    E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1cq_u32(ABCD, E1, TMP1);
-    TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x6ED9EBA1));
-    MSG2 = vsha1su1q_u32(MSG2, MSG1);
-    MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
-
-    /** Rounds 16-19 */
-    E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1cq_u32(ABCD, E0, TMP0);
-    TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x6ED9EBA1));
-    MSG3 = vsha1su1q_u32(MSG3, MSG2);
-    MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
-
-    /** Rounds 20-23 */
-    E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1pq_u32(ABCD, E1, TMP1);
-    TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x6ED9EBA1));
-    MSG0 = vsha1su1q_u32(MSG0, MSG3);
-    MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
-
-    /** Rounds 24-27 */
-    E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1pq_u32(ABCD, E0, TMP0);
-    TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x6ED9EBA1));
-    MSG1 = vsha1su1q_u32(MSG1, MSG0);
-    MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
-
-    /** Rounds 28-31 */
-    E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1pq_u32(ABCD, E1, TMP1);
-    TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x6ED9EBA1));
-    MSG2 = vsha1su1q_u32(MSG2, MSG1);
-    MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
-
-    /** Rounds 32-35 */
-    E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1pq_u32(ABCD, E0, TMP0);
-    TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x8F1BBCDC));
-    MSG3 = vsha1su1q_u32(MSG3, MSG2);
-    MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
-
-    /** Rounds 36-39 */
-    E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1pq_u32(ABCD, E1, TMP1);
-    TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x8F1BBCDC));
-    MSG0 = vsha1su1q_u32(MSG0, MSG3);
-    MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
-
-    /** Rounds 40-43 */
-    E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1mq_u32(ABCD, E0, TMP0);
-    TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x8F1BBCDC));
-    MSG1 = vsha1su1q_u32(MSG1, MSG0);
-    MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
-
-    /** Rounds 44-47 */
-    E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1mq_u32(ABCD, E1, TMP1);
-    TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x8F1BBCDC));
-    MSG2 = vsha1su1q_u32(MSG2, MSG1);
-    MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
-
-    /** Rounds 48-51 */
-    E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1mq_u32(ABCD, E0, TMP0);
-    TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x8F1BBCDC));
-    MSG3 = vsha1su1q_u32(MSG3, MSG2);
-    MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
-
-    /** Rounds 52-55 */
-    E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1mq_u32(ABCD, E1, TMP1);
-    TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0xCA62C1D6));
-    MSG0 = vsha1su1q_u32(MSG0, MSG3);
-    MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
-
-    /** Rounds 56-59 */
-    E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1mq_u32(ABCD, E0, TMP0);
-    TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0xCA62C1D6));
-    MSG1 = vsha1su1q_u32(MSG1, MSG0);
-    MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
-
-    /** Rounds 60-63 */
-    E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1pq_u32(ABCD, E1, TMP1);
-    TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0xCA62C1D6));
-    MSG2 = vsha1su1q_u32(MSG2, MSG1);
-    MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
-
-    /** Rounds 64-67 */
-    E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1pq_u32(ABCD, E0, TMP0);
-    TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0xCA62C1D6));
-    MSG3 = vsha1su1q_u32(MSG3, MSG2);
-    MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
-
-    /** Rounds 68-71 */
-    E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1pq_u32(ABCD, E1, TMP1);
-    TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0xCA62C1D6));
-    MSG0 = vsha1su1q_u32(MSG0, MSG3);
-
-    /** Rounds 72-75 */
-    E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1pq_u32(ABCD, E0, TMP0);
-
-    /** Rounds 76-79 */
-    E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
-    ABCD = vsha1pq_u32(ABCD, E1, TMP1);
-
-    /** Combine state */
-    E0 += E0_SAVED;
-    ABCD = vaddq_u32(ABCD_SAVED, ABCD);
+    while (blocks--) {
+        /** Save state */
+        ABCD_SAVED = ABCD;
+        E0_SAVED = E0;
+
+        /** Load message */
+        MSG0 = vld1q_u32((const uint32_t*)(chunk));
+        MSG1 = vld1q_u32((const uint32_t*)(chunk + 16));
+        MSG2 = vld1q_u32((const uint32_t*)(chunk + 32));
+        MSG3 = vld1q_u32((const uint32_t*)(chunk + 48));
+
+        /** Reverse for little endian */
+        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
+        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
+        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
+        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
+
+        TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x5A827999));
+        TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x5A827999));
+
+        /** Rounds 0-3 */
+        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+        TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x5A827999));
+        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+        /** Rounds 4-7 */
+        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1cq_u32(ABCD, E1, TMP1);
+        TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x5A827999));
+        MSG0 = vsha1su1q_u32(MSG0, MSG3);
+        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+        /** Rounds 8-11 */
+        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+        TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x5A827999));
+        MSG1 = vsha1su1q_u32(MSG1, MSG0);
+        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+        /** Rounds 12-15 */
+        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1cq_u32(ABCD, E1, TMP1);
+        TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x6ED9EBA1));
+        MSG2 = vsha1su1q_u32(MSG2, MSG1);
+        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+        /** Rounds 16-19 */
+        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1cq_u32(ABCD, E0, TMP0);
+        TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x6ED9EBA1));
+        MSG3 = vsha1su1q_u32(MSG3, MSG2);
+        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+        /** Rounds 20-23 */
+        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+        TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x6ED9EBA1));
+        MSG0 = vsha1su1q_u32(MSG0, MSG3);
+        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+        /** Rounds 24-27 */
+        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+        TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x6ED9EBA1));
+        MSG1 = vsha1su1q_u32(MSG1, MSG0);
+        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+        /** Rounds 28-31 */
+        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+        TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x6ED9EBA1));
+        MSG2 = vsha1su1q_u32(MSG2, MSG1);
+        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+        /** Rounds 32-35 */
+        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+        TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x8F1BBCDC));
+        MSG3 = vsha1su1q_u32(MSG3, MSG2);
+        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+        /** Rounds 36-39 */
+        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+        TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x8F1BBCDC));
+        MSG0 = vsha1su1q_u32(MSG0, MSG3);
+        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+        /** Rounds 40-43 */
+        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+        TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x8F1BBCDC));
+        MSG1 = vsha1su1q_u32(MSG1, MSG0);
+        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+        /** Rounds 44-47 */
+        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1mq_u32(ABCD, E1, TMP1);
+        TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x8F1BBCDC));
+        MSG2 = vsha1su1q_u32(MSG2, MSG1);
+        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+        /** Rounds 48-51 */
+        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+        TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x8F1BBCDC));
+        MSG3 = vsha1su1q_u32(MSG3, MSG2);
+        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+        /** Rounds 52-55 */
+        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1mq_u32(ABCD, E1, TMP1);
+        TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0xCA62C1D6));
+        MSG0 = vsha1su1q_u32(MSG0, MSG3);
+        MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
+
+        /** Rounds 56-59 */
+        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1mq_u32(ABCD, E0, TMP0);
+        TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0xCA62C1D6));
+        MSG1 = vsha1su1q_u32(MSG1, MSG0);
+        MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
+
+        /** Rounds 60-63 */
+        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+        TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0xCA62C1D6));
+        MSG2 = vsha1su1q_u32(MSG2, MSG1);
+        MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
+
+        /** Rounds 64-67 */
+        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+        TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0xCA62C1D6));
+        MSG3 = vsha1su1q_u32(MSG3, MSG2);
+        MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
+
+        /** Rounds 68-71 */
+        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+        TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0xCA62C1D6));
+        MSG0 = vsha1su1q_u32(MSG0, MSG3);
+
+        /** Rounds 72-75 */
+        E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1pq_u32(ABCD, E0, TMP0);
+
+        /** Rounds 76-79 */
+        E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
+        ABCD = vsha1pq_u32(ABCD, E1, TMP1);
+
+        /** Combine state */
+        E0 += E0_SAVED;
+        ABCD = vaddq_u32(ABCD_SAVED, ABCD);
+        chunk += 64;
+    }
 
     /** Save state */
     vst1q_u32(&s[0], ABCD);
@@ -257,115 +260,119 @@ void Transform_ARMV8(uint32_t* s, const unsigned char* chunk)
 #endif
 }
 
-/** Perform a SHA-1 transformation, processing a 64-byte chunk. (AVX2) */
-void Transform_AVX2(uint32_t* s, const unsigned char* chunk)
+/** Perform a number of SHA-1 transformations, processing 64-byte chunks. (AVX2) */
+void Transform_AVX2(uint32_t* s, const unsigned char* chunk, size_t blocks)
 {
 #if USE_AVX2
     // Perform SHA1 one block (Intel AVX2) EXPERIMENTAL_FEATURE
-
-    sha1_one_block_avx2(chunk, s);
+    while (blocks--) {
+        sha1_one_block_avx2(chunk, s);
+        chunk += 64;
+    }
 #endif
 }
 
 /** Perform a SHA-1 transformation, processing a 64-byte chunk. */
-void Transform(uint32_t* s, const unsigned char* chunk)
+void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
 {
 
     // Perform SHA one block (legacy)
-
-    uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4];
-    uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
-
-    Round(a, b, c, d, e, f1(b, c, d), k1, w0 = ReadBE32(chunk + 0));
-    Round(e, a, b, c, d, f1(a, b, c), k1, w1 = ReadBE32(chunk + 4));
-    Round(d, e, a, b, c, f1(e, a, b), k1, w2 = ReadBE32(chunk + 8));
-    Round(c, d, e, a, b, f1(d, e, a), k1, w3 = ReadBE32(chunk + 12));
-    Round(b, c, d, e, a, f1(c, d, e), k1, w4 = ReadBE32(chunk + 16));
-    Round(a, b, c, d, e, f1(b, c, d), k1, w5 = ReadBE32(chunk + 20));
-    Round(e, a, b, c, d, f1(a, b, c), k1, w6 = ReadBE32(chunk + 24));
-    Round(d, e, a, b, c, f1(e, a, b), k1, w7 = ReadBE32(chunk + 28));
-    Round(c, d, e, a, b, f1(d, e, a), k1, w8 = ReadBE32(chunk + 32));
-    Round(b, c, d, e, a, f1(c, d, e), k1, w9 = ReadBE32(chunk + 36));
-    Round(a, b, c, d, e, f1(b, c, d), k1, w10 = ReadBE32(chunk + 40));
-    Round(e, a, b, c, d, f1(a, b, c), k1, w11 = ReadBE32(chunk + 44));
-    Round(d, e, a, b, c, f1(e, a, b), k1, w12 = ReadBE32(chunk + 48));
-    Round(c, d, e, a, b, f1(d, e, a), k1, w13 = ReadBE32(chunk + 52));
-    Round(b, c, d, e, a, f1(c, d, e), k1, w14 = ReadBE32(chunk + 56));
-    Round(a, b, c, d, e, f1(b, c, d), k1, w15 = ReadBE32(chunk + 60));
-
-    Round(e, a, b, c, d, f1(a, b, c), k1, w0 = left(w0 ^ w13 ^ w8 ^ w2));
-    Round(d, e, a, b, c, f1(e, a, b), k1, w1 = left(w1 ^ w14 ^ w9 ^ w3));
-    Round(c, d, e, a, b, f1(d, e, a), k1, w2 = left(w2 ^ w15 ^ w10 ^ w4));
-    Round(b, c, d, e, a, f1(c, d, e), k1, w3 = left(w3 ^ w0 ^ w11 ^ w5));
-    Round(a, b, c, d, e, f2(b, c, d), k2, w4 = left(w4 ^ w1 ^ w12 ^ w6));
-    Round(e, a, b, c, d, f2(a, b, c), k2, w5 = left(w5 ^ w2 ^ w13 ^ w7));
-    Round(d, e, a, b, c, f2(e, a, b), k2, w6 = left(w6 ^ w3 ^ w14 ^ w8));
-    Round(c, d, e, a, b, f2(d, e, a), k2, w7 = left(w7 ^ w4 ^ w15 ^ w9));
-    Round(b, c, d, e, a, f2(c, d, e), k2, w8 = left(w8 ^ w5 ^ w0 ^ w10));
-    Round(a, b, c, d, e, f2(b, c, d), k2, w9 = left(w9 ^ w6 ^ w1 ^ w11));
-    Round(e, a, b, c, d, f2(a, b, c), k2, w10 = left(w10 ^ w7 ^ w2 ^ w12));
-    Round(d, e, a, b, c, f2(e, a, b), k2, w11 = left(w11 ^ w8 ^ w3 ^ w13));
-    Round(c, d, e, a, b, f2(d, e, a), k2, w12 = left(w12 ^ w9 ^ w4 ^ w14));
-    Round(b, c, d, e, a, f2(c, d, e), k2, w13 = left(w13 ^ w10 ^ w5 ^ w15));
-    Round(a, b, c, d, e, f2(b, c, d), k2, w14 = left(w14 ^ w11 ^ w6 ^ w0));
-    Round(e, a, b, c, d, f2(a, b, c), k2, w15 = left(w15 ^ w12 ^ w7 ^ w1));
-
-    Round(d, e, a, b, c, f2(e, a, b), k2, w0 = left(w0 ^ w13 ^ w8 ^ w2));
-    Round(c, d, e, a, b, f2(d, e, a), k2, w1 = left(w1 ^ w14 ^ w9 ^ w3));
-    Round(b, c, d, e, a, f2(c, d, e), k2, w2 = left(w2 ^ w15 ^ w10 ^ w4));
-    Round(a, b, c, d, e, f2(b, c, d), k2, w3 = left(w3 ^ w0 ^ w11 ^ w5));
-    Round(e, a, b, c, d, f2(a, b, c), k2, w4 = left(w4 ^ w1 ^ w12 ^ w6));
-    Round(d, e, a, b, c, f2(e, a, b), k2, w5 = left(w5 ^ w2 ^ w13 ^ w7));
-    Round(c, d, e, a, b, f2(d, e, a), k2, w6 = left(w6 ^ w3 ^ w14 ^ w8));
-    Round(b, c, d, e, a, f2(c, d, e), k2, w7 = left(w7 ^ w4 ^ w15 ^ w9));
-    Round(a, b, c, d, e, f3(b, c, d), k3, w8 = left(w8 ^ w5 ^ w0 ^ w10));
-    Round(e, a, b, c, d, f3(a, b, c), k3, w9 = left(w9 ^ w6 ^ w1 ^ w11));
-    Round(d, e, a, b, c, f3(e, a, b), k3, w10 = left(w10 ^ w7 ^ w2 ^ w12));
-    Round(c, d, e, a, b, f3(d, e, a), k3, w11 = left(w11 ^ w8 ^ w3 ^ w13));
-    Round(b, c, d, e, a, f3(c, d, e), k3, w12 = left(w12 ^ w9 ^ w4 ^ w14));
-    Round(a, b, c, d, e, f3(b, c, d), k3, w13 = left(w13 ^ w10 ^ w5 ^ w15));
-    Round(e, a, b, c, d, f3(a, b, c), k3, w14 = left(w14 ^ w11 ^ w6 ^ w0));
-    Round(d, e, a, b, c, f3(e, a, b), k3, w15 = left(w15 ^ w12 ^ w7 ^ w1));
-
-    Round(c, d, e, a, b, f3(d, e, a), k3, w0 = left(w0 ^ w13 ^ w8 ^ w2));
-    Round(b, c, d, e, a, f3(c, d, e), k3, w1 = left(w1 ^ w14 ^ w9 ^ w3));
-    Round(a, b, c, d, e, f3(b, c, d), k3, w2 = left(w2 ^ w15 ^ w10 ^ w4));
-    Round(e, a, b, c, d, f3(a, b, c), k3, w3 = left(w3 ^ w0 ^ w11 ^ w5));
-    Round(d, e, a, b, c, f3(e, a, b), k3, w4 = left(w4 ^ w1 ^ w12 ^ w6));
-    Round(c, d, e, a, b, f3(d, e, a), k3, w5 = left(w5 ^ w2 ^ w13 ^ w7));
-    Round(b, c, d, e, a, f3(c, d, e), k3, w6 = left(w6 ^ w3 ^ w14 ^ w8));
-    Round(a, b, c, d, e, f3(b, c, d), k3, w7 = left(w7 ^ w4 ^ w15 ^ w9));
-    Round(e, a, b, c, d, f3(a, b, c), k3, w8 = left(w8 ^ w5 ^ w0 ^ w10));
-    Round(d, e, a, b, c, f3(e, a, b), k3, w9 = left(w9 ^ w6 ^ w1 ^ w11));
-    Round(c, d, e, a, b, f3(d, e, a), k3, w10 = left(w10 ^ w7 ^ w2 ^ w12));
-    Round(b, c, d, e, a, f3(c, d, e), k3, w11 = left(w11 ^ w8 ^ w3 ^ w13));
-    Round(a, b, c, d, e, f2(b, c, d), k4, w12 = left(w12 ^ w9 ^ w4 ^ w14));
-    Round(e, a, b, c, d, f2(a, b, c), k4, w13 = left(w13 ^ w10 ^ w5 ^ w15));
-    Round(d, e, a, b, c, f2(e, a, b), k4, w14 = left(w14 ^ w11 ^ w6 ^ w0));
-    Round(c, d, e, a, b, f2(d, e, a), k4, w15 = left(w15 ^ w12 ^ w7 ^ w1));
-
-    Round(b, c, d, e, a, f2(c, d, e), k4, w0 = left(w0 ^ w13 ^ w8 ^ w2));
-    Round(a, b, c, d, e, f2(b, c, d), k4, w1 = left(w1 ^ w14 ^ w9 ^ w3));
-    Round(e, a, b, c, d, f2(a, b, c), k4, w2 = left(w2 ^ w15 ^ w10 ^ w4));
-    Round(d, e, a, b, c, f2(e, a, b), k4, w3 = left(w3 ^ w0 ^ w11 ^ w5));
-    Round(c, d, e, a, b, f2(d, e, a), k4, w4 = left(w4 ^ w1 ^ w12 ^ w6));
-    Round(b, c, d, e, a, f2(c, d, e), k4, w5 = left(w5 ^ w2 ^ w13 ^ w7));
-    Round(a, b, c, d, e, f2(b, c, d), k4, w6 = left(w6 ^ w3 ^ w14 ^ w8));
-    Round(e, a, b, c, d, f2(a, b, c), k4, w7 = left(w7 ^ w4 ^ w15 ^ w9));
-    Round(d, e, a, b, c, f2(e, a, b), k4, w8 = left(w8 ^ w5 ^ w0 ^ w10));
-    Round(c, d, e, a, b, f2(d, e, a), k4, w9 = left(w9 ^ w6 ^ w1 ^ w11));
-    Round(b, c, d, e, a, f2(c, d, e), k4, w10 = left(w10 ^ w7 ^ w2 ^ w12));
-    Round(a, b, c, d, e, f2(b, c, d), k4, w11 = left(w11 ^ w8 ^ w3 ^ w13));
-    Round(e, a, b, c, d, f2(a, b, c), k4, w12 = left(w12 ^ w9 ^ w4 ^ w14));
-    Round(d, e, a, b, c, f2(e, a, b), k4, left(w13 ^ w10 ^ w5 ^ w15));
-    Round(c, d, e, a, b, f2(d, e, a), k4, left(w14 ^ w11 ^ w6 ^ w0));
-    Round(b, c, d, e, a, f2(c, d, e), k4, left(w15 ^ w12 ^ w7 ^ w1));
-
-    s[0] += a;
-    s[1] += b;
-    s[2] += c;
-    s[3] += d;
-    s[4] += e;
+    while (blocks--) {
+        uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4];
+        uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+
+        Round(a, b, c, d, e, f1(b, c, d), k1, w0 = ReadBE32(chunk + 0));
+        Round(e, a, b, c, d, f1(a, b, c), k1, w1 = ReadBE32(chunk + 4));
+        Round(d, e, a, b, c, f1(e, a, b), k1, w2 = ReadBE32(chunk + 8));
+        Round(c, d, e, a, b, f1(d, e, a), k1, w3 = ReadBE32(chunk + 12));
+        Round(b, c, d, e, a, f1(c, d, e), k1, w4 = ReadBE32(chunk + 16));
+        Round(a, b, c, d, e, f1(b, c, d), k1, w5 = ReadBE32(chunk + 20));
+        Round(e, a, b, c, d, f1(a, b, c), k1, w6 = ReadBE32(chunk + 24));
+        Round(d, e, a, b, c, f1(e, a, b), k1, w7 = ReadBE32(chunk + 28));
+        Round(c, d, e, a, b, f1(d, e, a), k1, w8 = ReadBE32(chunk + 32));
+        Round(b, c, d, e, a, f1(c, d, e), k1, w9 = ReadBE32(chunk + 36));
+        Round(a, b, c, d, e, f1(b, c, d), k1, w10 = ReadBE32(chunk + 40));
+        Round(e, a, b, c, d, f1(a, b, c), k1, w11 = ReadBE32(chunk + 44));
+        Round(d, e, a, b, c, f1(e, a, b), k1, w12 = ReadBE32(chunk + 48));
+        Round(c, d, e, a, b, f1(d, e, a), k1, w13 = ReadBE32(chunk + 52));
+        Round(b, c, d, e, a, f1(c, d, e), k1, w14 = ReadBE32(chunk + 56));
+        Round(a, b, c, d, e, f1(b, c, d), k1, w15 = ReadBE32(chunk + 60));
+
+        Round(e, a, b, c, d, f1(a, b, c), k1, w0 = left(w0 ^ w13 ^ w8 ^ w2));
+        Round(d, e, a, b, c, f1(e, a, b), k1, w1 = left(w1 ^ w14 ^ w9 ^ w3));
+        Round(c, d, e, a, b, f1(d, e, a), k1, w2 = left(w2 ^ w15 ^ w10 ^ w4));
+        Round(b, c, d, e, a, f1(c, d, e), k1, w3 = left(w3 ^ w0 ^ w11 ^ w5));
+        Round(a, b, c, d, e, f2(b, c, d), k2, w4 = left(w4 ^ w1 ^ w12 ^ w6));
+        Round(e, a, b, c, d, f2(a, b, c), k2, w5 = left(w5 ^ w2 ^ w13 ^ w7));
+        Round(d, e, a, b, c, f2(e, a, b), k2, w6 = left(w6 ^ w3 ^ w14 ^ w8));
+        Round(c, d, e, a, b, f2(d, e, a), k2, w7 = left(w7 ^ w4 ^ w15 ^ w9));
+        Round(b, c, d, e, a, f2(c, d, e), k2, w8 = left(w8 ^ w5 ^ w0 ^ w10));
+        Round(a, b, c, d, e, f2(b, c, d), k2, w9 = left(w9 ^ w6 ^ w1 ^ w11));
+        Round(e, a, b, c, d, f2(a, b, c), k2, w10 = left(w10 ^ w7 ^ w2 ^ w12));
+        Round(d, e, a, b, c, f2(e, a, b), k2, w11 = left(w11 ^ w8 ^ w3 ^ w13));
+        Round(c, d, e, a, b, f2(d, e, a), k2, w12 = left(w12 ^ w9 ^ w4 ^ w14));
+        Round(b, c, d, e, a, f2(c, d, e), k2, w13 = left(w13 ^ w10 ^ w5 ^ w15));
+        Round(a, b, c, d, e, f2(b, c, d), k2, w14 = left(w14 ^ w11 ^ w6 ^ w0));
+        Round(e, a, b, c, d, f2(a, b, c), k2, w15 = left(w15 ^ w12 ^ w7 ^ w1));
+
+        Round(d, e, a, b, c, f2(e, a, b), k2, w0 = left(w0 ^ w13 ^ w8 ^ w2));
+        Round(c, d, e, a, b, f2(d, e, a), k2, w1 = left(w1 ^ w14 ^ w9 ^ w3));
+        Round(b, c, d, e, a, f2(c, d, e), k2, w2 = left(w2 ^ w15 ^ w10 ^ w4));
+        Round(a, b, c, d, e, f2(b, c, d), k2, w3 = left(w3 ^ w0 ^ w11 ^ w5));
+        Round(e, a, b, c, d, f2(a, b, c), k2, w4 = left(w4 ^ w1 ^ w12 ^ w6));
+        Round(d, e, a, b, c, f2(e, a, b), k2, w5 = left(w5 ^ w2 ^ w13 ^ w7));
+        Round(c, d, e, a, b, f2(d, e, a), k2, w6 = left(w6 ^ w3 ^ w14 ^ w8));
+        Round(b, c, d, e, a, f2(c, d, e), k2, w7 = left(w7 ^ w4 ^ w15 ^ w9));
+        Round(a, b, c, d, e, f3(b, c, d), k3, w8 = left(w8 ^ w5 ^ w0 ^ w10));
+        Round(e, a, b, c, d, f3(a, b, c), k3, w9 = left(w9 ^ w6 ^ w1 ^ w11));
+        Round(d, e, a, b, c, f3(e, a, b), k3, w10 = left(w10 ^ w7 ^ w2 ^ w12));
+        Round(c, d, e, a, b, f3(d, e, a), k3, w11 = left(w11 ^ w8 ^ w3 ^ w13));
+        Round(b, c, d, e, a, f3(c, d, e), k3, w12 = left(w12 ^ w9 ^ w4 ^ w14));
+        Round(a, b, c, d, e, f3(b, c, d), k3, w13 = left(w13 ^ w10 ^ w5 ^ w15));
+        Round(e, a, b, c, d, f3(a, b, c), k3, w14 = left(w14 ^ w11 ^ w6 ^ w0));
+        Round(d, e, a, b, c, f3(e, a, b), k3, w15 = left(w15 ^ w12 ^ w7 ^ w1));
+
+        Round(c, d, e, a, b, f3(d, e, a), k3, w0 = left(w0 ^ w13 ^ w8 ^ w2));
+        Round(b, c, d, e, a, f3(c, d, e), k3, w1 = left(w1 ^ w14 ^ w9 ^ w3));
+        Round(a, b, c, d, e, f3(b, c, d), k3, w2 = left(w2 ^ w15 ^ w10 ^ w4));
+        Round(e, a, b, c, d, f3(a, b, c), k3, w3 = left(w3 ^ w0 ^ w11 ^ w5));
+        Round(d, e, a, b, c, f3(e, a, b), k3, w4 = left(w4 ^ w1 ^ w12 ^ w6));
+        Round(c, d, e, a, b, f3(d, e, a), k3, w5 = left(w5 ^ w2 ^ w13 ^ w7));
+        Round(b, c, d, e, a, f3(c, d, e), k3, w6 = left(w6 ^ w3 ^ w14 ^ w8));
+        Round(a, b, c, d, e, f3(b, c, d), k3, w7 = left(w7 ^ w4 ^ w15 ^ w9));
+        Round(e, a, b, c, d, f3(a, b, c), k3, w8 = left(w8 ^ w5 ^ w0 ^ w10));
+        Round(d, e, a, b, c, f3(e, a, b), k3, w9 = left(w9 ^ w6 ^ w1 ^ w11));
+        Round(c, d, e, a, b, f3(d, e, a), k3, w10 = left(w10 ^ w7 ^ w2 ^ w12));
+        Round(b, c, d, e, a, f3(c, d, e), k3, w11 = left(w11 ^ w8 ^ w3 ^ w13));
+        Round(a, b, c, d, e, f2(b, c, d), k4, w12 = left(w12 ^ w9 ^ w4 ^ w14));
+        Round(e, a, b, c, d, f2(a, b, c), k4, w13 = left(w13 ^ w10 ^ w5 ^ w15));
+        Round(d, e, a, b, c, f2(e, a, b), k4, w14 = left(w14 ^ w11 ^ w6 ^ w0));
+        Round(c, d, e, a, b, f2(d, e, a), k4, w15 = left(w15 ^ w12 ^ w7 ^ w1));
+
+        Round(b, c, d, e, a, f2(c, d, e), k4, w0 = left(w0 ^ w13 ^ w8 ^ w2));
+        Round(a, b, c, d, e, f2(b, c, d), k4, w1 = left(w1 ^ w14 ^ w9 ^ w3));
+        Round(e, a, b, c, d, f2(a, b, c), k4, w2 = left(w2 ^ w15 ^ w10 ^ w4));
+        Round(d, e, a, b, c, f2(e, a, b), k4, w3 = left(w3 ^ w0 ^ w11 ^ w5));
+        Round(c, d, e, a, b, f2(d, e, a), k4, w4 = left(w4 ^ w1 ^ w12 ^ w6));
+        Round(b, c, d, e, a, f2(c, d, e), k4, w5 = left(w5 ^ w2 ^ w13 ^ w7));
+        Round(a, b, c, d, e, f2(b, c, d), k4, w6 = left(w6 ^ w3 ^ w14 ^ w8));
+        Round(e, a, b, c, d, f2(a, b, c), k4, w7 = left(w7 ^ w4 ^ w15 ^ w9));
+        Round(d, e, a, b, c, f2(e, a, b), k4, w8 = left(w8 ^ w5 ^ w0 ^ w10));
+        Round(c, d, e, a, b, f2(d, e, a), k4, w9 = left(w9 ^ w6 ^ w1 ^ w11));
+        Round(b, c, d, e, a, f2(c, d, e), k4, w10 = left(w10 ^ w7 ^ w2 ^ w12));
+        Round(a, b, c, d, e, f2(b, c, d), k4, w11 = left(w11 ^ w8 ^ w3 ^ w13));
+        Round(e, a, b, c, d, f2(a, b, c), k4, w12 = left(w12 ^ w9 ^ w4 ^ w14));
+        Round(d, e, a, b, c, f2(e, a, b), k4, left(w13 ^ w10 ^ w5 ^ w15));
+        Round(c, d, e, a, b, f2(d, e, a), k4, left(w14 ^ w11 ^ w6 ^ w0));
+        Round(b, c, d, e, a, f2(c, d, e), k4, left(w15 ^ w12 ^ w7 ^ w1));
+
+        s[0] += a;
+        s[1] += b;
+        s[2] += c;
+        s[3] += d;
+        s[4] += e;
+        chunk += 64;
+    }
 }
 
 #if defined(USE_ASM) && (defined(__x86_64__) || defined(__amd64__) || defined(__i386__))
@@ -379,7 +386,7 @@ bool AVXEnabled()
 #endif
 
 /** Define a function pointer for Transform */
-void (*transform_ptr) (uint32_t*, const unsigned char*) = &Transform;
+void (*transform_ptr) (uint32_t*, const unsigned char*, size_t) = &Transform;
 
 /** Initialize the function pointer */
 void inline Initialize_transform_ptr(void)
@@ -419,14 +426,14 @@ CSHA1& CSHA1::Write(const unsigned char* data, size_t len)
         memcpy(buf + bufsize, data, 64 - bufsize);
         bytes += 64 - bufsize;
         data += 64 - bufsize;
-        sha1::transform_ptr(s, buf);
+        sha1::transform_ptr(s, buf, 1);
         bufsize = 0;
     }
-    while (end - data >= 64) {
-        // Process full chunks directly from the source.
-        sha1::transform_ptr(s, data);
-        bytes += 64;
-        data += 64;
+    if (end - data >= 64) {
+        size_t blocks = (end - data) / 64;
+        sha1::transform_ptr(s, data, blocks);
+        data += 64 * blocks;
+        bytes += 64 * blocks;
     }
     if (end > data) {
         // Fill the buffer with what remains.
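
Reviewer note (not part of the diff above): a minimal standalone sketch of how the new multi-block path can be exercised. It assumes the upstream CSHA1 interface declared in src/crypto/sha1.h (Write, Finalize, OUTPUT_SIZE); any input longer than 64 bytes now reaches transform_ptr(s, data, blocks) with blocks > 1, so batched and byte-at-a-time hashing should produce identical digests.

    // sha1_multiblock_check.cpp -- illustrative only, not part of the patch
    #include <crypto/sha1.h>

    #include <cassert>
    #include <cstring>
    #include <string>

    int main()
    {
        // 200 bytes of input: Write() hashes three full 64-byte blocks in a
        // single transform_ptr(s, data, 3) call and buffers the remaining 8 bytes.
        const std::string msg(200, 'a');

        unsigned char batched[CSHA1::OUTPUT_SIZE];
        CSHA1().Write(reinterpret_cast<const unsigned char*>(msg.data()), msg.size()).Finalize(batched);

        // Feeding the same bytes one at a time must yield the same digest.
        unsigned char streamed[CSHA1::OUTPUT_SIZE];
        CSHA1 ctx;
        for (const char& c : msg) {
            ctx.Write(reinterpret_cast<const unsigned char*>(&c), 1);
        }
        ctx.Finalize(streamed);

        assert(std::memcmp(batched, streamed, CSHA1::OUTPUT_SIZE) == 0);
        return 0;
    }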