diff --git a/sha1_mb/sha1_mb_x16_avx512.asm b/sha1_mb/sha1_mb_x16_avx512.asm index 84eb0de..fa3a019 100644 --- a/sha1_mb/sha1_mb_x16_avx512.asm +++ b/sha1_mb/sha1_mb_x16_avx512.asm @@ -490,16 +490,11 @@ lastLoop: vpaddd E,E,EE ;; update into data pointers -%assign I 0 -%rep 8 - mov inp0, [IN + (2*I)*8] - mov inp1, [IN + (2*I +1)*8] - add inp0, IDX - add inp1, IDX - mov [IN + (2*I)*8], inp0 - mov [IN + (2*I+1)*8], inp1 -%assign I (I+1) -%endrep + vpbroadcastq TMP1, IDX + vpaddq TMP0, TMP1, [IN] + vpaddq TMP1, TMP1, [IN+64] + vmovdqu64 [IN], TMP0 + vmovdqu64 [IN+64], TMP1 ; Write out digest ; Do we need to untranspose digests??? diff --git a/sha1_mb/sha1_mb_x8_avx2.asm b/sha1_mb/sha1_mb_x8_avx2.asm index edcba6d..6a9a685 100644 --- a/sha1_mb/sha1_mb_x8_avx2.asm +++ b/sha1_mb/sha1_mb_x8_avx2.asm @@ -475,22 +475,13 @@ lloop: vmovdqu [arg1 + 4*32], E ;; update input pointers - add inp0, IDX - add inp1, IDX - add inp2, IDX - add inp3, IDX - add inp4, IDX - add inp5, IDX - add inp6, IDX - add inp7, IDX - mov [arg1+_data_ptr+0*8], inp0 - mov [arg1+_data_ptr+1*8], inp1 - mov [arg1+_data_ptr+2*8], inp2 - mov [arg1+_data_ptr+3*8], inp3 - mov [arg1+_data_ptr+4*8], inp4 - mov [arg1+_data_ptr+5*8], inp5 - mov [arg1+_data_ptr+6*8], inp6 - mov [arg1+_data_ptr+7*8], inp7 + vmovq xmm1, IDX + vpbroadcastq ymm1, xmm1 + lea IDX, [arg1+_data_ptr] + vpaddq ymm0, ymm1, [IDX] + vpaddq ymm1, ymm1, [IDX+32] + vmovdqu [IDX], ymm0 + vmovdqu [IDX+32], ymm1 ;;;;;;;;;;;;;;;; ;; Postamble diff --git a/sha256_mb/sha256_mb_x16_avx512.asm b/sha256_mb/sha256_mb_x16_avx512.asm index 66beff0..c069150 100644 --- a/sha256_mb/sha256_mb_x16_avx512.asm +++ b/sha256_mb/sha256_mb_x16_avx512.asm @@ -607,16 +607,11 @@ lastLoop: vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7] ;; update into data pointers -%assign I 0 -%rep 8 - mov inp0, [IN + (2*I)*8] - mov inp1, [IN + (2*I +1)*8] - add inp0, IDX - add inp1, IDX - mov [IN + (2*I)*8], inp0 - mov [IN + (2*I+1)*8], inp1 -%assign I (I+1) -%endrep + vpbroadcastq TMP1, IDX + vpaddq TMP0, TMP1, [IN] + vpaddq TMP1, TMP1, [IN+64] + vmovdqu64 [IN], TMP0 + vmovdqu64 [IN+64], TMP1 ; Write out digest ; Do we need to untranspose digests??? diff --git a/sha256_mb/sha256_mb_x8_avx2.asm b/sha256_mb/sha256_mb_x8_avx2.asm index dbd9db1..3d86396 100644 --- a/sha256_mb/sha256_mb_x8_avx2.asm +++ b/sha256_mb/sha256_mb_x8_avx2.asm @@ -463,22 +463,13 @@ Lrounds_16_xx: vmovdqu [STATE + 7*SHA256_DIGEST_ROW_SIZE],h ; update input pointers - add inp0, IDX - mov [STATE + _args_data_ptr + 0*8], inp0 - add inp1, IDX - mov [STATE + _args_data_ptr + 1*8], inp1 - add inp2, IDX - mov [STATE + _args_data_ptr + 2*8], inp2 - add inp3, IDX - mov [STATE + _args_data_ptr + 3*8], inp3 - add inp4, IDX - mov [STATE + _args_data_ptr + 4*8], inp4 - add inp5, IDX - mov [STATE + _args_data_ptr + 5*8], inp5 - add inp6, IDX - mov [STATE + _args_data_ptr + 6*8], inp6 - add inp7, IDX - mov [STATE + _args_data_ptr + 7*8], inp7 + vmovq XWORD(TMP0), IDX + vpbroadcastq TMP1, XWORD(TMP0) + lea IDX, [STATE + _args_data_ptr] + vpaddq TMP0, TMP1, [IDX] + vpaddq TMP1, TMP1, [IDX + 32] + vmovdqu [IDX], TMP0 + vmovdqu [IDX+32], TMP1 ;;;;;;;;;;;;;;;; ;; Postamble diff --git a/sha512_mb/sha512_mb_x4_avx2.asm b/sha512_mb/sha512_mb_x4_avx2.asm index 0058f33..7e6c3b5 100644 --- a/sha512_mb/sha512_mb_x4_avx2.asm +++ b/sha512_mb/sha512_mb_x4_avx2.asm @@ -379,14 +379,11 @@ Lrounds_16_xx: vmovdqu [STATE+ 7*SHA512_DIGEST_ROW_SIZE ],h ;; update input data pointers - add inp0, IDX - mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0 - add inp1, IDX - mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1 - add inp2, IDX - mov [STATE + _data_ptr_sha512 + 2*PTR_SZ], inp2 - add inp3, IDX - mov [STATE + _data_ptr_sha512 + 3*PTR_SZ], inp3 + vmovq xmm0, IDX + lea IDX, [STATE + _data_ptr_sha512] + vpbroadcastq ymm0, xmm0 + vpaddq ymm0, ymm0, [IDX] + vmovdqu [IDX], ymm0 ;;;;;;;;;;;;;;;; ;; Postamble diff --git a/sha512_mb/sha512_mb_x8_avx512.asm b/sha512_mb/sha512_mb_x8_avx512.asm index e273510..7864c14 100644 --- a/sha512_mb/sha512_mb_x8_avx512.asm +++ b/sha512_mb/sha512_mb_x8_avx512.asm @@ -494,16 +494,9 @@ lastLoop: vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7] ;; update into data pointers -%assign I 0 -%rep 4 - mov inp0, [IN + (2*I)*8] - mov inp1, [IN + (2*I +1)*8] - add inp0, IDX - add inp1, IDX - mov [IN + (2*I)*8], inp0 - mov [IN + (2*I+1)*8], inp1 -%assign I (I+1) -%endrep + vpbroadcastq TMP0, IDX + vpaddq TMP0, TMP0, [IN] + vmovdqu64 [IN], TMP0 VMOVDQ32 [DIGEST + 0*8*8], A VMOVDQ32 [DIGEST + 1*8*8], B