Skip to content

Commit

Permalink
avx512 mt irate
Browse files Browse the repository at this point in the history
  • Loading branch information
clamchowder committed Jan 23, 2024
1 parent 748e7fe commit 4ae495c
Show file tree
Hide file tree
Showing 2 changed files with 299 additions and 48 deletions.
306 changes: 258 additions & 48 deletions mt_instructionrate/InstructionRateFunctions.asm
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@ global avx2_int64_mul_test
global sse_fp32_add_test
global sse_fp32_mul_test
global sse_fp32_muladd_test
global sse_fp32_rsqrt_test
global avx_fp32_add_test
global avx_fp32_mul_test
global avx_fp32_muladd_test
global avx_fp32_rsqrt_test
global fp32_fma_test
global fp64_fma_test

Expand All @@ -26,13 +28,23 @@ global avx_fp64_add_test
global avx_fp64_mul_test
global avx_fp64_muladd_test

global avx512_int32_add_test
global avx512_int32_mul_test
global avx512_int64_add_test
global avx512_int64_mul_test
global avx512_fp32_rsqrt_test
global avx512_fp32_add_test
global avx512_fp32_fma_test
global avx512_fp64_add_test
global avx512_fp64_fma_test

sse_int32_add_test:
movups xmm0, [rdx]
movups xmm1, [rdx + 16]
movups xmm2, [rdx + 32]
movups xmm3, [rdx + 48]
movups xmm4, [rdx + 64]
movups xmm5, [rdx + 72]
movdqu xmm0, [rdx]
movdqu xmm1, [rdx + 16]
movdqu xmm2, [rdx + 32]
movdqu xmm3, [rdx + 48]
movdqu xmm4, [rdx + 64]
movdqu xmm5, [rdx + 72]
sse_int32_add_test_loop:
paddd xmm0, xmm0
paddd xmm1, xmm1
Expand All @@ -46,12 +58,12 @@ sse_int32_add_test_loop:
ret

sse_int64_add_test:
movups xmm0, [rdx]
movups xmm1, [rdx + 16]
movups xmm2, [rdx + 32]
movups xmm3, [rdx + 48]
movups xmm4, [rdx + 64]
movups xmm5, [rdx + 72]
movdqu xmm0, [rdx]
movdqu xmm1, [rdx + 16]
movdqu xmm2, [rdx + 32]
movdqu xmm3, [rdx + 48]
movdqu xmm4, [rdx + 64]
movdqu xmm5, [rdx + 72]
sse_int64_add_test_loop:
paddq xmm0, xmm0
paddq xmm1, xmm1
Expand All @@ -64,12 +76,12 @@ sse_int64_add_test_loop:
ret

sse_int32_mul_test:
movups xmm0, [rdx]
movups xmm1, [rdx + 16]
movups xmm2, [rdx + 32]
movups xmm3, [rdx + 48]
movups xmm4, [rdx + 64]
movups xmm5, [rdx + 72]
movdqu xmm0, [rdx]
movdqu xmm1, [rdx + 16]
movdqu xmm2, [rdx + 32]
movdqu xmm3, [rdx + 48]
movdqu xmm4, [rdx + 64]
movdqu xmm5, [rdx + 72]
sse_int32_mul_test_loop:
pmulld xmm0, xmm0
pmulld xmm1, xmm1
Expand All @@ -82,12 +94,12 @@ sse_int32_mul_test_loop:
ret

sse_int64_mul_test:
movups xmm0, [rdx]
movups xmm1, [rdx + 16]
movups xmm2, [rdx + 32]
movups xmm3, [rdx + 48]
movups xmm4, [rdx + 64]
movups xmm5, [rdx + 72]
movdqu xmm0, [rdx]
movdqu xmm1, [rdx + 16]
movdqu xmm2, [rdx + 32]
movdqu xmm3, [rdx + 48]
movdqu xmm4, [rdx + 64]
movdqu xmm5, [rdx + 72]
sse_int64_mul_test_loop:
pmuludq xmm0, xmm0
pmuludq xmm1, xmm1
Expand All @@ -100,12 +112,12 @@ sse_int64_mul_test_loop:
ret

avx2_int32_add_test:
vmovups ymm0, [rdx]
vmovups ymm1, [rdx + 32]
vmovups ymm2, [rdx + 64]
vmovups ymm3, [rdx + 96]
vmovups ymm4, [rdx + 128]
vmovups ymm5, [rdx + 160]
vmovdqu ymm0, [rdx]
vmovdqu ymm1, [rdx + 32]
vmovdqu ymm2, [rdx + 64]
vmovdqu ymm3, [rdx + 96]
vmovdqu ymm4, [rdx + 128]
vmovdqu ymm5, [rdx + 160]
avx2_int32_add_test_loop:
vpaddd ymm0, ymm0, ymm0
vpaddd ymm1, ymm1, ymm1
Expand All @@ -119,12 +131,12 @@ avx2_int32_add_test_loop:
ret

avx2_int32_mul_test:
vmovups ymm0, [rdx]
vmovups ymm1, [rdx + 32]
vmovups ymm2, [rdx + 64]
vmovups ymm3, [rdx + 96]
vmovups ymm4, [rdx + 128]
vmovups ymm5, [rdx + 160]
vmovdqu ymm0, [rdx]
vmovdqu ymm1, [rdx + 32]
vmovdqu ymm2, [rdx + 64]
vmovdqu ymm3, [rdx + 96]
vmovdqu ymm4, [rdx + 128]
vmovdqu ymm5, [rdx + 160]
avx2_int32_mul_test_loop:
vpmulld ymm0, ymm0, ymm0
vpmulld ymm1, ymm1, ymm1
Expand All @@ -137,12 +149,12 @@ avx2_int32_mul_test_loop:
ret

avx2_int64_add_test:
vmovups ymm0, [rdx]
vmovups ymm1, [rdx + 32]
vmovups ymm2, [rdx + 64]
vmovups ymm3, [rdx + 96]
vmovups ymm4, [rdx + 128]
vmovups ymm5, [rdx + 160]
vmovdqu ymm0, [rdx]
vmovdqu ymm1, [rdx + 32]
vmovdqu ymm2, [rdx + 64]
vmovdqu ymm3, [rdx + 96]
vmovdqu ymm4, [rdx + 128]
vmovdqu ymm5, [rdx + 160]
avx2_int64_add_test_loop:
vpaddq ymm0, ymm0, ymm0
vpaddq ymm1, ymm1, ymm1
Expand All @@ -155,12 +167,12 @@ avx2_int64_add_test_loop:
ret

avx2_int64_mul_test:
vmovups ymm0, [rdx]
vmovups ymm1, [rdx + 32]
vmovups ymm2, [rdx + 64]
vmovups ymm3, [rdx + 96]
vmovups ymm4, [rdx + 128]
vmovups ymm5, [rdx + 160]
vmovdqu ymm0, [rdx]
vmovdqu ymm1, [rdx + 32]
vmovdqu ymm2, [rdx + 64]
vmovdqu ymm3, [rdx + 96]
vmovdqu ymm4, [rdx + 128]
vmovdqu ymm5, [rdx + 160]
avx2_int64_mul_test_loop:
vpmuldq ymm0, ymm0, ymm0
vpmuldq ymm1, ymm1, ymm1
Expand Down Expand Up @@ -268,6 +280,42 @@ sse_fp32_muladd_test_loop:
jg sse_fp32_muladd_test_loop
ret

;-----------------------------------------------------------------------
; sse_fp32_rsqrt_test
; Throughput kernel: six independent RSQRTPS dependency chains per pass.
; In:    rcx = work counter; 24 is subtracted per pass (6 instructions
;              x 4 fp32 lanes). The body always runs at least once and
;              repeats while the counter stays > 0 (signed).
;        rdx = pointer to input data; 96 bytes are read, no alignment
;              required (unaligned loads).
;        NOTE(review): rcx/rdx as first/second argument looks like the
;        Microsoft x64 convention - confirm against the C caller.
; Out:   none (rax is not written)
; Clobb: rcx, xmm0-xmm5, flags
;-----------------------------------------------------------------------
sse_fp32_rsqrt_test:
        movups  xmm0, [rdx]
        movups  xmm1, [rdx + 16]
        movups  xmm2, [rdx + 32]
        movups  xmm3, [rdx + 48]
        movups  xmm4, [rdx + 64]
        movups  xmm5, [rdx + 80]        ; was +72, which overlapped xmm4's 16 bytes
.loop:
        rsqrtps xmm0, xmm0
        rsqrtps xmm1, xmm1
        rsqrtps xmm2, xmm2
        rsqrtps xmm3, xmm3
        rsqrtps xmm4, xmm4
        rsqrtps xmm5, xmm5
        sub     rcx, 24                 ; 6 x 4 lanes consumed this pass
        jg      .loop
        ret

;-----------------------------------------------------------------------
; avx_fp32_rsqrt_test
; Throughput kernel: six independent 256-bit VRSQRTPS chains per pass.
; In:    rcx = work counter; 48 is subtracted per pass (6 instructions
;              x 8 fp32 lanes). The body always runs at least once and
;              repeats while the counter stays > 0 (signed).
;        rdx = pointer to input data; 192 bytes are read, no alignment
;              required (unaligned loads).
; Out:   none (rax is not written)
; Clobb: rcx, ymm0-ymm5, flags; vzeroupper also clears bits 128 and up
;        of every ymm/zmm0-15 register.
;-----------------------------------------------------------------------
avx_fp32_rsqrt_test:
        vmovups ymm0, [rdx]
        vmovups ymm1, [rdx + 32]
        vmovups ymm2, [rdx + 64]
        vmovups ymm3, [rdx + 96]
        vmovups ymm4, [rdx + 128]
        vmovups ymm5, [rdx + 160]
.loop:
        vrsqrtps ymm0, ymm0
        vrsqrtps ymm1, ymm1
        vrsqrtps ymm2, ymm2
        vrsqrtps ymm3, ymm3
        vrsqrtps ymm4, ymm4
        vrsqrtps ymm5, ymm5
        sub     rcx, 48                 ; 6 x 8 lanes consumed this pass
        jg      .loop
        vzeroupper                      ; leave clean upper state so later legacy-SSE
                                        ; code is not hit by AVX/SSE transition costs
        ret

sse_fp64_muladd_test:
movups xmm0, [rdx]
movups xmm1, [rdx + 16]
Expand Down Expand Up @@ -450,4 +498,166 @@ fp64_fma_test_loop:
vfmadd132pd ymm5, ymm5, ymm6
sub rcx, 24
jg fp64_fma_test_loop
ret

;-----------------------------------------------------------------------
; avx512_int32_add_test
; Throughput kernel: six independent 512-bit VPADDD chains per pass.
; In:    rcx = work counter; 96 is subtracted per pass (6 instructions
;              x 16 int32 lanes). The body always runs at least once
;              and repeats while the counter stays > 0 (signed).
;        rdx = pointer to input data; 384 bytes are read, no alignment
;              required (unaligned loads).
; Out:   none (rax is not written)
; Clobb: rcx, zmm0-zmm5, flags; vzeroupper also clears bits 128 and up
;        of zmm0-zmm15.
; Needs: AVX512F only (loads use vmovdqu32 rather than the AVX512BW-only
;        vmovdqu16; unmasked, both move the same 64 bytes).
;-----------------------------------------------------------------------
avx512_int32_add_test:
        vmovdqu32 zmm0, [rdx]
        vmovdqu32 zmm1, [rdx + 64]
        vmovdqu32 zmm2, [rdx + 128]
        vmovdqu32 zmm3, [rdx + 192]
        vmovdqu32 zmm4, [rdx + 256]
        vmovdqu32 zmm5, [rdx + 320]     ; was +384, skipping a 64-byte slot
.loop:
        vpaddd  zmm0, zmm0, zmm0
        vpaddd  zmm1, zmm1, zmm1
        vpaddd  zmm2, zmm2, zmm2
        vpaddd  zmm3, zmm3, zmm3
        vpaddd  zmm4, zmm4, zmm4
        vpaddd  zmm5, zmm5, zmm5
        sub     rcx, 96                 ; 6 x 16 lanes consumed this pass
        jg      .loop
        vzeroupper                      ; avoid dirty-upper penalties in later SSE code
        ret

;-----------------------------------------------------------------------
; avx512_int32_mul_test
; Throughput kernel: six independent 512-bit VPMULLD chains per pass.
; In:    rcx = work counter; 96 is subtracted per pass (6 instructions
;              x 16 int32 lanes). The body always runs at least once
;              and repeats while the counter stays > 0 (signed).
;        rdx = pointer to input data; 384 bytes are read, no alignment
;              required (unaligned loads).
; Out:   none (rax is not written)
; Clobb: rcx, zmm0-zmm5, flags; vzeroupper also clears bits 128 and up
;        of zmm0-zmm15.
; Needs: AVX512F only (loads use vmovdqu32 rather than the AVX512BW-only
;        vmovdqu16; unmasked, both move the same 64 bytes).
;-----------------------------------------------------------------------
avx512_int32_mul_test:
        vmovdqu32 zmm0, [rdx]
        vmovdqu32 zmm1, [rdx + 64]
        vmovdqu32 zmm2, [rdx + 128]
        vmovdqu32 zmm3, [rdx + 192]
        vmovdqu32 zmm4, [rdx + 256]
        vmovdqu32 zmm5, [rdx + 320]     ; was +384, skipping a 64-byte slot
.loop:
        vpmulld zmm0, zmm0, zmm0
        vpmulld zmm1, zmm1, zmm1
        vpmulld zmm2, zmm2, zmm2
        vpmulld zmm3, zmm3, zmm3
        vpmulld zmm4, zmm4, zmm4
        vpmulld zmm5, zmm5, zmm5
        sub     rcx, 96                 ; 6 x 16 lanes consumed this pass
        jg      .loop
        vzeroupper                      ; avoid dirty-upper penalties in later SSE code
        ret

;-----------------------------------------------------------------------
; avx512_int64_add_test
; Throughput kernel: six independent 512-bit VPADDQ chains per pass.
; In:    rcx = work counter; 48 is subtracted per pass (6 instructions
;              x 8 int64 lanes). The body always runs at least once
;              and repeats while the counter stays > 0 (signed).
;        rdx = pointer to input data; 384 bytes are read, no alignment
;              required (unaligned loads).
; Out:   none (rax is not written)
; Clobb: rcx, zmm0-zmm5, flags; vzeroupper also clears bits 128 and up
;        of zmm0-zmm15.
; Needs: AVX512F only (loads use vmovdqu64 rather than the AVX512BW-only
;        vmovdqu16; unmasked, both move the same 64 bytes).
;-----------------------------------------------------------------------
avx512_int64_add_test:
        vmovdqu64 zmm0, [rdx]
        vmovdqu64 zmm1, [rdx + 64]
        vmovdqu64 zmm2, [rdx + 128]
        vmovdqu64 zmm3, [rdx + 192]
        vmovdqu64 zmm4, [rdx + 256]
        vmovdqu64 zmm5, [rdx + 320]     ; was +384, skipping a 64-byte slot
.loop:
        vpaddq  zmm0, zmm0, zmm0
        vpaddq  zmm1, zmm1, zmm1
        vpaddq  zmm2, zmm2, zmm2
        vpaddq  zmm3, zmm3, zmm3
        vpaddq  zmm4, zmm4, zmm4
        vpaddq  zmm5, zmm5, zmm5
        sub     rcx, 48                 ; 6 x 8 lanes consumed this pass
        jg      .loop
        vzeroupper                      ; avoid dirty-upper penalties in later SSE code
        ret

;-----------------------------------------------------------------------
; avx512_int64_mul_test
; Throughput kernel: six independent 512-bit VPMULDQ chains per pass.
; VPMULDQ multiplies the signed low 32 bits of each qword lane and
; writes a 64-bit product; it is not a full 64x64 multiply (that would
; be VPMULLQ, which needs AVX512DQ). Kept as-is to match the AVX2
; variant of this test, which uses the same instruction.
; In:    rcx = work counter; 48 is subtracted per pass (6 instructions
;              x 8 qword lanes). The body always runs at least once
;              and repeats while the counter stays > 0 (signed).
;        rdx = pointer to input data; 384 bytes are read, no alignment
;              required (unaligned loads).
; Out:   none (rax is not written)
; Clobb: rcx, zmm0-zmm5, flags; vzeroupper also clears bits 128 and up
;        of zmm0-zmm15.
; Needs: AVX512F only (loads use vmovdqu64 rather than the AVX512BW-only
;        vmovdqu16; unmasked, both move the same 64 bytes).
;-----------------------------------------------------------------------
avx512_int64_mul_test:
        vmovdqu64 zmm0, [rdx]
        vmovdqu64 zmm1, [rdx + 64]
        vmovdqu64 zmm2, [rdx + 128]
        vmovdqu64 zmm3, [rdx + 192]
        vmovdqu64 zmm4, [rdx + 256]
        vmovdqu64 zmm5, [rdx + 320]     ; was +384, skipping a 64-byte slot
.loop:
        vpmuldq zmm0, zmm0, zmm0
        vpmuldq zmm1, zmm1, zmm1
        vpmuldq zmm2, zmm2, zmm2
        vpmuldq zmm3, zmm3, zmm3
        vpmuldq zmm4, zmm4, zmm4
        vpmuldq zmm5, zmm5, zmm5
        sub     rcx, 48                 ; 6 x 8 lanes consumed this pass
        jg      .loop
        vzeroupper                      ; avoid dirty-upper penalties in later SSE code
        ret

;-----------------------------------------------------------------------
; avx512_fp32_rsqrt_test
; Throughput kernel: six independent 512-bit VRSQRT14PS chains per pass.
; In:    rcx = work counter; 96 is subtracted per pass (6 instructions
;              x 16 fp32 lanes). The body always runs at least once
;              and repeats while the counter stays > 0 (signed).
;        rdx = pointer to input data; 384 bytes are read, no alignment
;              required (unaligned loads).
; Out:   none (rax is not written)
; Clobb: rcx, zmm0-zmm5, flags; vzeroupper also clears bits 128 and up
;        of zmm0-zmm15.
;-----------------------------------------------------------------------
avx512_fp32_rsqrt_test:
        vmovups zmm0, [rdx]
        vmovups zmm1, [rdx + 64]
        vmovups zmm2, [rdx + 128]
        vmovups zmm3, [rdx + 192]
        vmovups zmm4, [rdx + 256]
        vmovups zmm5, [rdx + 320]       ; was +384, skipping a 64-byte slot
.loop:
        vrsqrt14ps zmm0, zmm0
        vrsqrt14ps zmm1, zmm1
        vrsqrt14ps zmm2, zmm2
        vrsqrt14ps zmm3, zmm3
        vrsqrt14ps zmm4, zmm4
        vrsqrt14ps zmm5, zmm5
        sub     rcx, 96                 ; 6 x 16 lanes consumed this pass
        jg      .loop
        vzeroupper                      ; avoid dirty-upper penalties in later SSE code
        ret

;-----------------------------------------------------------------------
; avx512_fp32_add_test
; Throughput kernel: six independent 512-bit VADDPS chains per pass.
; In:    rcx = work counter; 96 is subtracted per pass (6 instructions
;              x 16 fp32 lanes). The body always runs at least once
;              and repeats while the counter stays > 0 (signed).
;        rdx = pointer to input data; 384 bytes are read, no alignment
;              required (unaligned loads).
; Out:   none (rax is not written)
; Clobb: rcx, zmm0-zmm5, flags; vzeroupper also clears bits 128 and up
;        of zmm0-zmm15.
;-----------------------------------------------------------------------
avx512_fp32_add_test:
        vmovups zmm0, [rdx]
        vmovups zmm1, [rdx + 64]
        vmovups zmm2, [rdx + 128]
        vmovups zmm3, [rdx + 192]
        vmovups zmm4, [rdx + 256]
        vmovups zmm5, [rdx + 320]       ; was +384, skipping a 64-byte slot
.loop:
        vaddps  zmm0, zmm0, zmm0
        vaddps  zmm1, zmm1, zmm1
        vaddps  zmm2, zmm2, zmm2
        vaddps  zmm3, zmm3, zmm3
        vaddps  zmm4, zmm4, zmm4
        vaddps  zmm5, zmm5, zmm5
        sub     rcx, 96                 ; 6 x 16 lanes consumed this pass
        jg      .loop
        vzeroupper                      ; avoid dirty-upper penalties in later SSE code
        ret

;-----------------------------------------------------------------------
; avx512_fp32_fma_test
; Throughput kernel: six independent 512-bit VFMADD132PS chains per
; pass; each computes zmmN = zmmN * zmmN + zmmN.
; In:    rcx = work counter; 96 is subtracted per pass (6 instructions
;              x 16 fp32 lanes). The body always runs at least once
;              and repeats while the counter stays > 0 (signed).
;        rdx = pointer to input data; 384 bytes are read, no alignment
;              required (unaligned loads).
; Out:   none (rax is not written)
; Clobb: rcx, zmm0-zmm5, flags; vzeroupper also clears bits 128 and up
;        of zmm0-zmm15.
;-----------------------------------------------------------------------
avx512_fp32_fma_test:
        vmovups zmm0, [rdx]
        vmovups zmm1, [rdx + 64]
        vmovups zmm2, [rdx + 128]
        vmovups zmm3, [rdx + 192]
        vmovups zmm4, [rdx + 256]
        vmovups zmm5, [rdx + 320]       ; was +384, skipping a 64-byte slot
.loop:
        vfmadd132ps zmm0, zmm0, zmm0
        vfmadd132ps zmm1, zmm1, zmm1
        vfmadd132ps zmm2, zmm2, zmm2
        vfmadd132ps zmm3, zmm3, zmm3
        vfmadd132ps zmm4, zmm4, zmm4
        vfmadd132ps zmm5, zmm5, zmm5
        sub     rcx, 96                 ; 6 x 16 lanes consumed this pass
        jg      .loop
        vzeroupper                      ; avoid dirty-upper penalties in later SSE code
        ret

;-----------------------------------------------------------------------
; avx512_fp64_add_test
; Throughput kernel: six independent 512-bit VADDPD chains per pass.
; (The loop previously issued VFMADD132PD, so the "add" test was really
; measuring FMA throughput.)
; In:    rcx = work counter; 48 is subtracted per pass (6 instructions
;              x 8 fp64 lanes). The body always runs at least once
;              and repeats while the counter stays > 0 (signed).
;        rdx = pointer to input data; 384 bytes are read, no alignment
;              required (unaligned loads).
; Out:   none (rax is not written)
; Clobb: rcx, zmm0-zmm5, flags; vzeroupper also clears bits 128 and up
;        of zmm0-zmm15.
;-----------------------------------------------------------------------
avx512_fp64_add_test:
        vmovups zmm0, [rdx]
        vmovups zmm1, [rdx + 64]
        vmovups zmm2, [rdx + 128]
        vmovups zmm3, [rdx + 192]
        vmovups zmm4, [rdx + 256]
        vmovups zmm5, [rdx + 320]       ; was +384, skipping a 64-byte slot
.loop:
        vaddpd  zmm0, zmm0, zmm0
        vaddpd  zmm1, zmm1, zmm1
        vaddpd  zmm2, zmm2, zmm2
        vaddpd  zmm3, zmm3, zmm3
        vaddpd  zmm4, zmm4, zmm4
        vaddpd  zmm5, zmm5, zmm5
        sub     rcx, 48                 ; 6 x 8 lanes consumed this pass
        jg      .loop
        vzeroupper                      ; avoid dirty-upper penalties in later SSE code
        ret

;-----------------------------------------------------------------------
; avx512_fp64_fma_test
; Throughput kernel: six independent 512-bit VFMADD132PD chains per
; pass; each computes zmmN = zmmN * zmmN + zmmN on fp64 lanes.
; (The loop previously issued the fp32 form VFMADD132PS, so the fp64
; test was measuring single-precision FMA.)
; In:    rcx = work counter; 48 is subtracted per pass (6 instructions
;              x 8 fp64 lanes). The body always runs at least once
;              and repeats while the counter stays > 0 (signed).
;        rdx = pointer to input data; 384 bytes are read, no alignment
;              required (unaligned loads).
; Out:   none (rax is not written)
; Clobb: rcx, zmm0-zmm5, flags; vzeroupper also clears bits 128 and up
;        of zmm0-zmm15.
;-----------------------------------------------------------------------
avx512_fp64_fma_test:
        vmovups zmm0, [rdx]
        vmovups zmm1, [rdx + 64]
        vmovups zmm2, [rdx + 128]
        vmovups zmm3, [rdx + 192]
        vmovups zmm4, [rdx + 256]
        vmovups zmm5, [rdx + 320]       ; was +384, skipping a 64-byte slot
.loop:
        vfmadd132pd zmm0, zmm0, zmm0
        vfmadd132pd zmm1, zmm1, zmm1
        vfmadd132pd zmm2, zmm2, zmm2
        vfmadd132pd zmm3, zmm3, zmm3
        vfmadd132pd zmm4, zmm4, zmm4
        vfmadd132pd zmm5, zmm5, zmm5
        sub     rcx, 48                 ; 6 x 8 lanes consumed this pass
        jg      .loop
        vzeroupper                      ; avoid dirty-upper penalties in later SSE code
        ret
Loading

0 comments on commit 4ae495c

Please sign in to comment.