From 81e295c21a8e417e3ac6e87ada7ebb471cda537b Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 18:17:39 +0000 Subject: [PATCH] fix(deps): update golang.org/x/exp digest to 701f63a --- go.mod | 20 +- go.sum | 32 +- pkg/push/go.mod | 6 +- pkg/push/go.sum | 7 +- .../testdata/client_intermediate_cert.der | Bin 998 -> 0 bytes .../testdata/client_leaf_cert.der | Bin 1147 -> 0 bytes .../testdata/client_root_cert.der | Bin 1013 -> 0 bytes .../testdata/server_intermediate_cert.der | Bin 998 -> 0 bytes .../testdata/server_leaf_cert.der | Bin 1147 -> 0 bytes .../testdata/server_root_cert.der | Bin 1013 -> 0 bytes .../v2/remotesigner/testdata/client_cert.der | Bin 1013 -> 0 bytes .../v2/remotesigner/testdata/client_cert.pem | 24 - .../v2/remotesigner/testdata/client_key.pem | 27 - .../v2/remotesigner/testdata/server_cert.der | Bin 1013 -> 0 bytes .../v2/remotesigner/testdata/server_cert.pem | 24 - .../v2/remotesigner/testdata/server_key.pem | 27 - .../internal/v2/testdata/client_cert.pem | 24 - .../internal/v2/testdata/client_key.pem | 27 - .../internal/v2/testdata/server_cert.pem | 24 - .../internal/v2/testdata/server_key.pem | 27 - .../tlsconfigstore/testdata/client_cert.pem | 24 - .../v2/tlsconfigstore/testdata/client_key.pem | 27 - .../tlsconfigstore/testdata/server_cert.pem | 24 - .../v2/tlsconfigstore/testdata/server_key.pem | 27 - .../google/s2a-go/testdata/client_cert.pem | 24 - .../google/s2a-go/testdata/client_key.pem | 27 - .../s2a-go/testdata/mds_client_cert.pem | 19 - .../google/s2a-go/testdata/mds_client_key.pem | 28 - .../google/s2a-go/testdata/mds_root_cert.pem | 21 - .../s2a-go/testdata/mds_server_cert.pem | 21 - .../google/s2a-go/testdata/mds_server_key.pem | 28 - .../s2a-go/testdata/self_signed_cert.pem | 19 - .../s2a-go/testdata/self_signed_key.pem | 28 - .../google/s2a-go/testdata/server_cert.pem | 24 - .../google/s2a-go/testdata/server_key.pem | 27 - .../golang.org/x/crypto/argon2/blamka_amd64.s | 2972 +++- .../x/crypto/blake2b/blake2bAVX2_amd64.s | 5167 ++++++- .../x/crypto/blake2b/blake2b_amd64.s | 1681 ++- .../chacha20poly1305/chacha20poly1305_amd64.s | 11503 +++++++++++++--- .../x/crypto/internal/poly1305/sum_amd64.s | 133 +- vendor/golang.org/x/exp/LICENSE | 4 +- vendor/golang.org/x/exp/slices/sort.go | 4 +- vendor/golang.org/x/mod/LICENSE | 4 +- vendor/golang.org/x/sys/cpu/cpu.go | 19 + .../golang.org/x/sys/cpu/cpu_linux_noinit.go | 2 +- .../golang.org/x/sys/cpu/cpu_linux_riscv64.go | 137 + vendor/golang.org/x/sys/cpu/cpu_riscv64.go | 11 +- vendor/golang.org/x/sys/unix/mkerrors.sh | 1 + .../golang.org/x/sys/unix/syscall_darwin.go | 37 + vendor/golang.org/x/sys/unix/syscall_hurd.go | 1 + .../x/sys/unix/zerrors_darwin_amd64.go | 7 + .../x/sys/unix/zerrors_darwin_arm64.go | 7 + .../x/sys/unix/zerrors_zos_s390x.go | 2 + .../x/sys/unix/zsyscall_darwin_amd64.go | 20 + .../x/sys/unix/zsyscall_darwin_amd64.s | 5 + .../x/sys/unix/zsyscall_darwin_arm64.go | 20 + .../x/sys/unix/zsyscall_darwin_arm64.s | 5 + .../x/sys/unix/ztypes_darwin_amd64.go | 13 + .../x/sys/unix/ztypes_darwin_arm64.go | 13 + .../x/sys/unix/ztypes_freebsd_386.go | 1 + .../x/sys/unix/ztypes_freebsd_amd64.go | 1 + .../x/sys/unix/ztypes_freebsd_arm.go | 1 + .../x/sys/unix/ztypes_freebsd_arm64.go | 1 + .../x/sys/unix/ztypes_freebsd_riscv64.go | 1 + vendor/golang.org/x/sys/unix/ztypes_linux.go | 2 +- .../x/sys/unix/ztypes_linux_riscv64.go | 33 + .../x/sys/windows/syscall_windows.go | 4 + .../golang.org/x/sys/windows/types_windows.go | 1 + .../x/sys/windows/zsyscall_windows.go | 38 + vendor/golang.org/x/term/term_windows.go | 1 + vendor/golang.org/x/tools/LICENSE | 4 +- .../x/tools/go/packages/external.go | 2 +- .../x/tools/go/packages/packages.go | 11 +- .../golang.org/x/tools/go/packages/visit.go | 9 + .../x/tools/go/types/objectpath/objectpath.go | 40 +- .../x/tools/internal/aliases/aliases.go | 10 +- .../x/tools/internal/aliases/aliases_go121.go | 16 +- .../x/tools/internal/aliases/aliases_go122.go | 41 +- .../x/tools/internal/gcimporter/iexport.go | 249 +- .../x/tools/internal/gcimporter/iimport.go | 18 +- .../tools/internal/gcimporter/ureader_yes.go | 28 +- .../x/tools/internal/pkgbits/decoder.go | 38 +- .../x/tools/internal/pkgbits/encoder.go | 43 +- .../x/tools/internal/pkgbits/frames_go1.go | 21 - .../x/tools/internal/pkgbits/frames_go17.go | 28 - .../x/tools/internal/pkgbits/support.go | 2 +- .../x/tools/internal/pkgbits/sync.go | 23 + .../internal/pkgbits/syncmarker_string.go | 7 +- .../x/tools/internal/pkgbits/version.go | 85 + .../x/tools/internal/stdlib/manifest.go | 2 +- .../tools/internal/typesinternal/errorcode.go | 8 +- .../x/tools/internal/versions/constraint.go | 13 + .../internal/versions/constraint_go121.go | 14 + vendor/modules.txt | 24 +- 94 files changed, 19009 insertions(+), 4216 deletions(-) delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/client_intermediate_cert.der delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/client_leaf_cert.der delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/client_root_cert.der delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/server_intermediate_cert.der delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/server_leaf_cert.der delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/server_root_cert.der delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/client_cert.der delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/client_cert.pem delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/client_key.pem delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/server_cert.der delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/server_cert.pem delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/server_key.pem delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/testdata/client_cert.pem delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/testdata/client_key.pem delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/testdata/server_cert.pem delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/testdata/server_key.pem delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/tlsconfigstore/testdata/client_cert.pem delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/tlsconfigstore/testdata/client_key.pem delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/tlsconfigstore/testdata/server_cert.pem delete mode 100644 vendor/github.com/google/s2a-go/internal/v2/tlsconfigstore/testdata/server_key.pem delete mode 100644 vendor/github.com/google/s2a-go/testdata/client_cert.pem delete mode 100644 vendor/github.com/google/s2a-go/testdata/client_key.pem delete mode 100644 vendor/github.com/google/s2a-go/testdata/mds_client_cert.pem delete mode 100644 vendor/github.com/google/s2a-go/testdata/mds_client_key.pem delete mode 100644 vendor/github.com/google/s2a-go/testdata/mds_root_cert.pem delete mode 100644 vendor/github.com/google/s2a-go/testdata/mds_server_cert.pem delete mode 100644 vendor/github.com/google/s2a-go/testdata/mds_server_key.pem delete mode 100644 vendor/github.com/google/s2a-go/testdata/self_signed_cert.pem delete mode 100644 vendor/github.com/google/s2a-go/testdata/self_signed_key.pem delete mode 100644 vendor/github.com/google/s2a-go/testdata/server_cert.pem delete mode 100644 vendor/github.com/google/s2a-go/testdata/server_key.pem create mode 100644 vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go delete mode 100644 vendor/golang.org/x/tools/internal/pkgbits/frames_go1.go delete mode 100644 vendor/golang.org/x/tools/internal/pkgbits/frames_go17.go create mode 100644 vendor/golang.org/x/tools/internal/pkgbits/version.go create mode 100644 vendor/golang.org/x/tools/internal/versions/constraint.go create mode 100644 vendor/golang.org/x/tools/internal/versions/constraint_go121.go diff --git a/go.mod b/go.mod index 8579f77e0e4b2..6444e6eec5fd3 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,8 @@ module github.com/grafana/loki/v3 -go 1.21.8 +go 1.22.0 + +toolchain go1.23.1 require ( cloud.google.com/go/bigtable v1.29.0 @@ -96,10 +98,10 @@ require ( go.etcd.io/bbolt v1.3.10 go.uber.org/atomic v1.11.0 go.uber.org/goleak v1.3.0 - golang.org/x/crypto v0.26.0 - golang.org/x/net v0.28.0 + golang.org/x/crypto v0.27.0 + golang.org/x/net v0.29.0 golang.org/x/sync v0.8.0 - golang.org/x/sys v0.24.0 + golang.org/x/sys v0.25.0 golang.org/x/time v0.6.0 google.golang.org/api v0.193.0 google.golang.org/grpc v1.65.0 @@ -143,9 +145,9 @@ require ( github.com/willf/bloom v2.0.3+incompatible go.opentelemetry.io/collector/pdata v1.12.0 go4.org/netipx v0.0.0-20230125063823-8449b0a6169f - golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 + golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 golang.org/x/oauth2 v0.22.0 - golang.org/x/text v0.17.0 + golang.org/x/text v0.18.0 google.golang.org/protobuf v1.34.2 gotest.tools v2.2.0+incompatible k8s.io/apimachinery v0.29.3 @@ -352,9 +354,9 @@ require ( go.opentelemetry.io/otel/trace v1.28.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.21.0 // indirect - golang.org/x/mod v0.19.0 // indirect - golang.org/x/term v0.23.0 // indirect - golang.org/x/tools v0.23.0 // indirect + golang.org/x/mod v0.21.0 // indirect + golang.org/x/term v0.24.0 // indirect + golang.org/x/tools v0.25.0 // indirect google.golang.org/genproto v0.0.0-20240820151423-278611b39280 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20240820151423-278611b39280 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20240820151423-278611b39280 // indirect diff --git a/go.sum b/go.sum index cc26b57251f09..e19097a0704b0 100644 --- a/go.sum +++ b/go.sum @@ -2006,8 +2006,8 @@ golang.org/x/crypto v0.0.0-20220829220503-c86fa9a7ed90/go.mod h1:IxCIyHEi3zRg3s0 golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.6.0/go.mod h1:OFC/31mSvZgRz0V1QTNCzfAI1aIRzbiufJtkMIlEp58= golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= -golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= -golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/crypto v0.27.0 h1:GXm2NjJrPaiv/h1tb2UH8QfgC/hOf/+z0p6PT8o1w7A= +golang.org/x/crypto v0.27.0/go.mod h1:1Xngt8kV6Dvbssa53Ziq6Eqn0HqbZi5Z6R0ZpwQzt70= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -2021,8 +2021,8 @@ golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u0 golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= -golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 h1:aAcj0Da7eBAtrTp03QXWvm88pSyOt+UgdZw2BFZ+lEw= -golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8/go.mod h1:CQ1k9gNrJ50XIzaKCRR2hssIjF07kZFEiieALBM/ARQ= +golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk= +golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY= golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= @@ -2051,8 +2051,8 @@ golang.org/x/mod v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.19.0 h1:fEdghXQSo20giMthA7cd28ZC+jts4amQ3YMXiP5oMQ8= -golang.org/x/mod v0.19.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0= +golang.org/x/mod v0.21.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/net v0.0.0-20170114055629-f2499483f923/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180611182652-db08ff08e862/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -2135,8 +2135,8 @@ golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.4.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= -golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE= -golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg= +golang.org/x/net v0.29.0 h1:5ORfpBpCs4HzDYoodCDBbwHzdR5UrLBZ3sOnUJmFoHo= +golang.org/x/net v0.29.0/go.mod h1:gLkgy8jTGERgjzMic6DS9+SP0ajcu6Xu3Orq/SpETg0= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190402181905-9f3314589c9a/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -2306,8 +2306,8 @@ golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= -golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34= +golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -2317,8 +2317,8 @@ golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= golang.org/x/term v0.15.0/go.mod h1:BDl952bC7+uMoWR75FIrCDx79TPU9oHkTZ9yRbYOrX0= golang.org/x/term v0.22.0/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= -golang.org/x/term v0.23.0 h1:F6D4vR+EHoL9/sWAWgAR1H2DcHr4PareCbAaCo1RpuU= -golang.org/x/term v0.23.0/go.mod h1:DgV24QBUrK6jhZXl+20l6UWznPlwAHm1Q1mGHtydmSk= +golang.org/x/term v0.24.0 h1:Mh5cbb+Zk2hqqXNO7S1iTjEphVL+jb8ZWaqh/g+JWkM= +golang.org/x/term v0.24.0/go.mod h1:lOBK/LVxemqiMij05LGJ0tzNr8xlmwBRJ81PX6wVLH8= golang.org/x/text v0.0.0-20160726164857-2910a502d2bf/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -2336,8 +2336,8 @@ golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc= -golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= +golang.org/x/text v0.18.0 h1:XvMDiNzPAl0jr17s6W9lcaIhGUfUORdGCNsuLmPG224= +golang.org/x/text v0.18.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= golang.org/x/time v0.0.0-20161028155119-f51c12702a4d/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20180412165947-fbb02b2291d2/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -2427,8 +2427,8 @@ golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= -golang.org/x/tools v0.23.0 h1:SGsXPZ+2l4JsgaCKkx+FQ9YZ5XEtA1GZYuoDjenLjvg= -golang.org/x/tools v0.23.0/go.mod h1:pnu6ufv6vQkll6szChhK3C3L/ruaIv5eBeztNG8wtsI= +golang.org/x/tools v0.25.0 h1:oFU9pkj/iJgs+0DT+VMHrx+oBKs/LJMV+Uvg78sl+fE= +golang.org/x/tools v0.25.0/go.mod h1:/vtpO8WL1N9cQC3FN5zPqb//fRXskFHbLKk4OW1Q7rg= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/pkg/push/go.mod b/pkg/push/go.mod index a67d4b7daa016..289412e0e8a2e 100644 --- a/pkg/push/go.mod +++ b/pkg/push/go.mod @@ -1,11 +1,13 @@ module github.com/grafana/loki/pkg/push -go 1.19 +go 1.22.0 + +toolchain go1.23.1 require ( github.com/gogo/protobuf v1.3.2 github.com/stretchr/testify v1.8.2 - golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 + golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 google.golang.org/grpc v1.56.3 ) diff --git a/pkg/push/go.sum b/pkg/push/go.sum index 9745e9d379079..57b621d97d9d3 100644 --- a/pkg/push/go.sum +++ b/pkg/push/go.sum @@ -8,7 +8,8 @@ github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaS github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= -github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= @@ -31,8 +32,8 @@ github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9dec golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 h1:aAcj0Da7eBAtrTp03QXWvm88pSyOt+UgdZw2BFZ+lEw= -golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8/go.mod h1:CQ1k9gNrJ50XIzaKCRR2hssIjF07kZFEiieALBM/ARQ= +golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk= +golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= diff --git a/vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/client_intermediate_cert.der b/vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/client_intermediate_cert.der deleted file mode 100644 index 958f3cfaddf3645fa6c0578b5b6955d65ac4c172..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 998 zcmXqLVt!=M#B^!_GZP~dlZfxTkgMw#Mx5CteAhsdS=K*t&xAY!UN%mxHjlRNyo`+8 ztPBQ??S|Y2oNUaYENsF|p}{Z?2M?38qoI(207!`V#>K<# zoS$2em{(~iXuuB=;pX9R$t=q(&dkp<6f+P32{H5V78@nTm!uY##3!c~l^9AHNWiV< zWEA7BsH}1TV!h=2Tmw0AULzv|Gec7&Lt`^z^C)p%6J+ina%mHz5^^vyvNA9?G4eAQ zG%<29H8CnfQWlH+MyjTjz z2Bjk+Ub9TzT(Z$!`+w1f9a{SYn?h^$U$mDJKK^KOurlxGlhf8coXvjW^}bgt-rFjD znHJ|Xo9Wew;}%JlW_Kjsthc+chb^XXpU3LOdy_KCS4OTntgUFd?YsGkV{8gPCrIUU z7xZ1<|6Oe5g44&3J$ODp>G5YKW=00a#V!U;2J*mEAgjzGVIbBZvce&_^W7!+r|T*d zavIkkZ(n>dQ`LY6q(GR3)qt6i@xOr}h$qOxWx&zImXe>Fn2DZTf#J-^uui3sYst!# z{ImS~I{#0uS#WDz%!i-3yIxOGkACKhN$DyM2L{LbrkL@xJCpY3crHP)J~LHEx;nsZlX$z1Na++S~*x&F8L zSHsIX`Pf;-7fz4f$yxWfRV}c$Te2$X#22gkrUlwjtNp9{IiD}~zjo}z)1J=iJoj~z zPFk$n)~eK8B%5+0@w1g$N8H(l+-H4!LOMmyS_+%b`$^TD`}g=z?!9=Gts-l=GE2@y HTP6bly#{pd diff --git a/vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/client_leaf_cert.der b/vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/client_leaf_cert.der deleted file mode 100644 index d2817641bafb022339926786ab85b545f40ac665..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1147 zcmXqLVktLhVvb+H%*4pVB*OPMYW498DSV5WgVNVW|Jr8oCgimNFB_*;n@8JsUPeZ4 zRtAH{c0+ChPB!LH7B*p~&|nycgNMo4(NM@h03^fC!x>zfmseSqn3HNKV890w6yxnyM_06xZQcZ?W@M4y!->Y40>Qzg1y0 z&gNQpqF{|c>D+ZZ(rFIKEL_n{%!~|-iyLnkG+s4m+z3nvvdS!tD-9Z#Eo|&v(%3b4 zVzNoZ_g%=<^$R1;>=M3fAjvH2AGv2jo&gWY$-*qG2F#3%{|y8|JV6#N1CA!Pl>FSp z%sk}C2j(GQ()!XwH^ngbB~t_{rh@w&h5|n>_#Tt3odjy zuPCdY_u)fF^#8=jc^^+Q{aqN5`$@Gm({I-q6VU_LtQOD6naL>j(`V|D&;QI6%^$Dc z@$uH$=blqSKTV1FFaEpmVP=>@vvi*Y`l|mqO|ORNA0oIS8nOI z?NiA8%5=ljL@D29-OKpueEYz|zv5rHb554&taa#n7142tpUKWU?R-O_r^0)6j>G3Y aOB;Xxy<$H9NT-Uzt(mu6i*GEf3IzZ)c%0q< diff --git a/vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/client_root_cert.der b/vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/client_root_cert.der deleted file mode 100644 index d8c3710c85f9ff41ddfc709924c866350a727a4f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1013 zcmXqLV*Y5*#B_53GZP~dlZa+?pWMd9xVH0Kw4Tf@Xyh=ks1AdSQHxGwPW?5!&W`3Tbn1KjLh?$4C*eEf+B(=CCJ~_3h#8ApW0&YDg zqZoHZWt9UE>m}#s8pw(B8W|aw8XFoJ7#JIxMv3#90J%nHQ0^ddX%nLoaxgNoGB7tW z@-rATF>*0AF)}hdZr#y6M}F(|x(>G^%gcAlKWcTb*S((Se#vIyuMojqJ6T@W9G$-N zZ|C3aC}V-ru$PA$a>||`V5!;WvLO9;#Z4xLTYkS~Q%%)mH+DGbsA<2rr@8F$Psyk! zC02_czBy5RY)XCG=Yl1DX=(>VSrdJ-C#|hK7^zn?b35awX)f$Ntvf$$ zcUg9sL1fPSDbXqWt6$VVZu~MYZrj`2y}fI?80UuXsujC8ao09pzReSEFH3&Hjk;!&j?yPw^7|NHlfMLseyY=TyHB}&&4U!T;kt&%Ic z-@@f=`1w!pGyVU%zJI$PWGYXZ;?Bg($iTQb*dWkA7MKcT`B=nQL{>NicfPwM|8!l2 zLQdoQtUFt($qvv(cdI=i6tz3tUW`|a0C z&5N-8^H(H$?psgMz!}^;s& z=?+d?)Hkg%emvc~QLj+DwRvTMu)Tl4_m}{_7kTC1{xNsF-p9krsM-B)Ge_#Kiu#~{ zd67^5U(N6lVcaug+5}Tw2V+f_-BCNQyG;7WAZ_<^`6QjCrOhu7&ztq4tdo7t^%&>) zTR0VF*I2Sr HHQW{e5x{wf diff --git a/vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/server_intermediate_cert.der b/vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/server_intermediate_cert.der deleted file mode 100644 index dae619c097512f20d09d2054c63fc0f715d7be24..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 998 zcmXqLVt!=M#B^!_GZP~dlZZk6mxvepr#-!NVoqOL*`yn*LOZ7#@Un4gwRyCC=VfH% zW@RvFY&YaK;ACSCWnmL$3Jr#FICz+x9SwyH1VA$EJeHTH8JZfJT11KSnjmurkxQExm5_sxk(GhDiIJbd zpox)-sfm%1VO{t`EzVyWX$3+}`kJnrGQY>^=jnZyz4d%;|IQeuB_Gcfo1a{_>B5Fg zXYbI=4K*7QZdjc*51-)~EjRzyir2SPA5Z@zcTLeXV*g^*2f0P(7p>c z?c<>#uh>H(ryKBq6bQ4h8Za|5{x=W=@dR183^XGxLzc8JJvw;mpWjp+5Pu z!*lq#5Sb^i@`a>klJNjzx%iu)2*?h3yy-gEL(;p^g~mZo#n!wzh9Qua5C+wfRd ztarh_l9E{Oro7&WNVTl@3-~vy&EDU;)-f|r-8<&mQ>C=;``&D2+5V0(pW(g3o1}Sj z^gY793Mi&+ntK1`1tm%ENt=U{rb%k1PtbRHEhYTylqR?8HqOeuvA1fBY>pc~NUFTR KZE!sDzAgX``fm0B diff --git a/vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/server_leaf_cert.der b/vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/server_leaf_cert.der deleted file mode 100644 index ce7f8d31d6802c7e68c188af8797c3a063894857..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1147 zcmXqLVktLhVvb+H%*4pVB%&$)xidhT`$ARBi)~L`p2_`@+!*`b5acj4ER7|Ts-W~ z`MCv&d6kBO2K*oqZXOPo%(Bel%=|nTUzhK>3dC_Z(le%bCn$9moio9jI@8{ zd;5%c{XT!(ig$;5K+$bpwI6ChVmse+UV1r`!RY5*U#A%ppFJxS>PX#w$sz7*X_R~A zg!|$g&z66XI$)aW-TrxDio@fp!U^sS#+*GdHMB|H?BTf!^g~J_=D$Y-Ot{~xpTg~G+{k6 Zdr{DieczoQDL-a%Ib3(Tsv+bGCjf+Ap-TV& diff --git a/vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/server_root_cert.der b/vendor/github.com/google/s2a-go/internal/v2/certverifier/testdata/server_root_cert.der deleted file mode 100644 index 04b0d73600b72f80a03943d41973b279db9e8b32..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1013 zcmXqLV*Y5*#B_53GZP~dlZb{E^N!s~snriRh?%)HJiGKl>CAisUN%mxHjlRNyo`+8 ztPBQ??S|Y2oNUaYENsF|p}{Z?2M?38qoI(207!`V#>K<# zoS$2em{(~iXuuB=;pX9R$t=q(&dkp<6f+P32{H5V78@nTm!uY##3!c~l^9AHNWiV< zWEA7BsH}1TV!h=2Tmw0AULzv|Q)5FT0|N^q(aL-(RkWjuO*nCZC(^b4{cp%V+e53*PfoZW)2DU5BJN^Jypf0+ z%dyqB7eAAEDfH_&Z*6C3aP8^9M)sYbgC^`~v;4F5#oepTLYl5+2bdY(i!-?My>4G) z&8u+L`?_9LxQB1$HP?fpd*5i(-|SJJ!`sZHa8UZu(QeP3Dc(Yr8?G#iY20z>);Fh$ zMQ>DZHkC%KUBdEb&cYv;1sHCx@Vv~#%*epFIM^W2Ko*z^WcgUcSVVXxICA%7?w=9J zwOIRjXviz}kjUu<@*rtt76}8f2J8wz2}@R(k?}tZs{u2RLJoFd>IDWnBZI}4y?mP{ zW+WALO{`uTY0T-O8uy=9`&z_;*e^$wYu3!Wz-Re-&a7vV%g_8h++q8CMf&zxTFdl~ zp5i+{-R9L&+mdUFEBoKr@u=5M->UQ>MNWJ7BRi)4^M4~B*uKhe`J%yAbG&-Thr?bk zCap+`USuL4V}qf=6kBV_EFAN zmIn*31Qb`P{oxd|=2o5l$|&AdTJ_-*k&5oDQvTTOiN8OjczyHyta?pWMd9xVH0Kw4Tf@Xyh=ks1AdSQHxGwPW?5!&W`3Tbn1KjLh?$4C*eEf+B(=CCJ~_3h#8ApW0&YDg zqZoHZWt9UE>m}#s8pw(B8W|aw8XFoJ7#JIxMv3#90J%nHQ0^ddX%nLoaxgNoGB7tW z@-rATF>*0AF)}hdZr#y6M}F(|x(>G^%gcAlKWcTb*S((Se#vIyuMojqJ6T@W9G$-N zZ|C3aC}V-ru$PA$a>||`V5!;WvLO9;#Z4xLTYkS~Q%%)mH+DGbsA<2rr@8F$Psyk! zC02_czBy5RY)XCG=Yl1DX=(>VSrdJ-C#|hK7^zn?b35awX)f$Ntvf$$ zcUg9sL1fPSDbXqWt6$VVZu~MYZrj`2y}fI?80UuXsujC8ao09pzReSEFH3&Hjk;!&j?yPw^7|NHlfMLseyY=TyHB}&&4U!T;kt&%Ic z-@@f=`1w!pGyVU%zJI$PWGYXZ;?Bg($iTQb*dWkA7MKcT`B=nQL{>NicfPwM|8!l2 zLQdoQtUFt($qvv(cdI=i6tz3tUW`|a0C z&5N-8^H(H$?psgMz!}^;s& z=?+d?)Hkg%emvc~QLj+DwRvTMu)Tl4_m}{_7kTC1{xNsF-p9krsM-B)Ge_#Kiu#~{ zd67^5U(N6lVcaug+5}Tw2V+f_-BCNQyG;7WAZ_<^`6QjCrOhu7&ztq4tdo7t^%&>) zTR0VF*I2Sr HHQW{e5x{wf diff --git a/vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/client_cert.pem b/vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/client_cert.pem deleted file mode 100644 index 493a5a2648101..0000000000000 --- a/vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/client_cert.pem +++ /dev/null @@ -1,24 +0,0 @@ ------BEGIN CERTIFICATE----- -MIID8TCCAtmgAwIBAgIUKXNlBRVe6UepjQUijIFPZBd/4qYwDQYJKoZIhvcNAQEL -BQAwgYcxCzAJBgNVBAYTAlVTMQswCQYDVQQIDAJDQTESMBAGA1UEBwwJU3Vubnl2 -YWxlMRAwDgYDVQQKDAdDb21wYW55MREwDwYDVQQLDAhEaXZpc2lvbjEWMBQGA1UE -AwwNczJhX3Rlc3RfY2VydDEaMBgGCSqGSIb3DQEJARYLeHl6QHh5ei5jb20wHhcN -MjIwNTMxMjAwMzE1WhcNNDIwNTI2MjAwMzE1WjCBhzELMAkGA1UEBhMCVVMxCzAJ -BgNVBAgMAkNBMRIwEAYDVQQHDAlTdW5ueXZhbGUxEDAOBgNVBAoMB0NvbXBhbnkx -ETAPBgNVBAsMCERpdmlzaW9uMRYwFAYDVQQDDA1zMmFfdGVzdF9jZXJ0MRowGAYJ -KoZIhvcNAQkBFgt4eXpAeHl6LmNvbTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCC -AQoCggEBAOOFuIucH7XXfohGxKd3uR/ihUA/LdduR9I8kfpUEbq5BOt8xZe5/Yn9 -a1ozEHVW6cOAbHbnwAR8tkSgZ/t42QIA2k77HWU1Jh2xiEIsJivo3imm4/kZWuR0 -OqPh7MhzxpR/hvNwpI5mJsAVBWFMa5KtecFZLnyZtwHylrRN1QXzuLrOxuKFufK3 -RKbTABScn5RbZL976H/jgfSeXrbt242NrIoBnVe6fRbekbq2DQ6zFArbQMUgHjHK -P0UqBgdr1QmHfi9KytFyx9BTP3gXWnWIu+bY7/v7qKJMHFwGETo+dCLWYevJL316 -HnLfhApDMfP8U+Yv/y1N/YvgaSOSlEcCAwEAAaNTMFEwHQYDVR0OBBYEFKhAU4nu -0h/lrnggbIGvx4ej0WklMB8GA1UdIwQYMBaAFKhAU4nu0h/lrnggbIGvx4ej0Wkl -MA8GA1UdEwEB/wQFMAMBAf8wDQYJKoZIhvcNAQELBQADggEBAE/6NghzQ5fu6yR6 -EHKbj/YMrFdT7aGn5n2sAf7wJ33LIhiFHkpWBsVlm7rDtZtwhe891ZK/P60anlg9 -/P0Ua53tSRVRmCvTnEbXWOVMN4is6MsR7BlmzUxl4AtIn7jbeifEwRL7B4xDYmdA -QrQnsqoz45dLgS5xK4WDqXATP09Q91xQDuhud/b+A4jrvgwFASmL7rMIZbp4f1JQ -nlnl/9VoTBQBvJiWkDUtQDMpRLtauddEkv4AGz75p5IspXWD6cOemuh2iQec11xD -X20rs2WZbAcAiUa3nmy8OKYw435vmpj8gp39WYbX/Yx9TymrFFbVY92wYn+quTco -pKklVz0= ------END CERTIFICATE----- diff --git a/vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/client_key.pem b/vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/client_key.pem deleted file mode 100644 index 55a7f10c742db..0000000000000 --- a/vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/client_key.pem +++ /dev/null @@ -1,27 +0,0 @@ ------BEGIN RSA PRIVATE KEY----- -MIIEogIBAAKCAQEA44W4i5wftdd+iEbEp3e5H+KFQD8t125H0jyR+lQRurkE63zF -l7n9if1rWjMQdVbpw4BsdufABHy2RKBn+3jZAgDaTvsdZTUmHbGIQiwmK+jeKabj -+Rla5HQ6o+HsyHPGlH+G83CkjmYmwBUFYUxrkq15wVkufJm3AfKWtE3VBfO4us7G -4oW58rdEptMAFJyflFtkv3vof+OB9J5etu3bjY2sigGdV7p9Ft6RurYNDrMUCttA -xSAeMco/RSoGB2vVCYd+L0rK0XLH0FM/eBdadYi75tjv+/uookwcXAYROj50ItZh -68kvfXoect+ECkMx8/xT5i//LU39i+BpI5KURwIDAQABAoIBABgyjo/6iLzUMFbZ -/+w3pW6orrdIgN2akvTfED9pVYFgUA+jc3hRhY95bkNnjuaL2cy7Cc4Tk65mfRQL -Y0OxdJLr+EvSFSxAXM9npDA1ddHRsF8JqtFBSxNk8R+g1Yf0GDiO35Fgd3/ViWWA -VtQkRoSRApP3oiQKTRZd8H04keFR+PvmDk/Lq11l3Kc24A1PevKIPX1oI990ggw9 -9i4uSV+cnuMxmcI9xxJtgwdDFdjr39l2arLOHr4s6LGoV2IOdXHNlv5xRqWUZ0FH -MDHowkLgwDrdSTnNeaVNkce14Gqx+bd4hNaLCdKXMpedBTEmrut3f3hdV1kKjaKt -aqRYr8ECgYEA/YDGZY2jvFoHHBywlqmEMFrrCvQGH51m5R1Ntpkzr+Rh3YCmrpvq -xgwJXING0PUw3dz+xrH5lJICrfNE5Kt3fPu1rAEy+13mYsNowghtUq2Rtu0Hsjjx -2E3Bf8vEB6RNBMmGkUpTTIAroGF5tpJoRvfnWax+k4pFdrKYFtyZdNcCgYEA5cNv -EPltvOobjTXlUmtVP3n27KZN2aXexTcagLzRxE9CV4cYySENl3KuOMmccaZpIl6z -aHk6BT4X+M0LqElNUczrInfVqI+SGAFLGy7W6CJaqSr6cpyFUP/fosKpm6wKGgLq -udHfpvz5rckhKd8kJxFLvhGOK9yN5qpzih0gfhECgYAJfwRvk3G5wYmYpP58dlcs -VIuPenqsPoI3PPTHTU/hW+XKnWIhElgmGRdUrto9Q6IT/Y5RtSMLTLjq+Tzwb/fm -56rziYv2XJsfwgAvnI8z1Kqrto9ePsHYf3krJ1/thVsZPc9bq/QY3ohD1sLvcuaT -GgBBnLOVJU3a12/ZE2RwOwKBgF0csWMAoj8/5IB6if+3ral2xOGsl7oPZVMo/J2V -Z7EVqb4M6rd/pKFugTpUQgkwtkSOekhpcGD1hAN5HTNK2YG/+L5UMAsKe9sskwJm -HgOfAHy0BSDzW3ey6i9skg2bT9Cww+0gJ3Hl7U1HSCBO5LjMYpSZSrNtwzfqdb5Q -BX3xAoGARZdR28Ej3+/+0+fz47Yu2h4z0EI/EbrudLOWY936jIeAVwHckI3+BuqH -qR4poj1gfbnMxNuI9UzIXzjEmGewx9kDZ7IYnvloZKqoVQODO5GlKF2ja6IcMNlh -GCNdD6PSAS6HcmalmWo9sj+1YMkrl+GJikKZqVBHrHNwMGAG67w= ------END RSA PRIVATE KEY----- diff --git a/vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/server_cert.der b/vendor/github.com/google/s2a-go/internal/v2/remotesigner/testdata/server_cert.der deleted file mode 100644 index 04b0d73600b72f80a03943d41973b279db9e8b32..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1013 zcmXqLV*Y5*#B_53GZP~dlZb{E^N!s~snriRh?%)HJiGKl>CAisUN%mxHjlRNyo`+8 ztPBQ??S|Y2oNUaYENsF|p}{Z?2M?38qoI(207!`V#>K<# zoS$2em{(~iXuuB=;pX9R$t=q(&dkp<6f+P32{H5V78@nTm!uY##3!c~l^9AHNWiV< zWEA7BsH}1TV!h=2Tmw0AULzv|Q)5FT0|N^q(aL-(RkWjuO*nCZC(^b4{cp%V+e53*PfoZW)2DU5BJN^Jypf0+ z%dyqB7eAAEDfH_&Z*6C3aP8^9M)sYbgC^`~v;4F5#oepTLYl5+2bdY(i!-?My>4G) z&8u+L`?_9LxQB1$HP?fpd*5i(-|SJJ!`sZHa8UZu(QeP3Dc(Yr8?G#iY20z>);Fh$ zMQ>DZHkC%KUBdEb&cYv;1sHCx@Vv~#%*epFIM^W2Ko*z^WcgUcSVVXxICA%7?w=9J zwOIRjXviz}kjUu<@*rtt76}8f2J8wz2}@R(k?}tZs{u2RLJoFd>IDWnBZI}4y?mP{ zW+WALO{`uTY0T-O8uy=9`&z_;*e^$wYu3!Wz-Re-&a7vV%g_8h++q8CMf&zxTFdl~ zp5i+{-R9L&+mdUFEBoKr@u=5M->UQ>MNWJ7BRi)4^M4~B*uKhe`J%yAbG&-Thr?bk zCap+`USuL4V}qf=6kBV_EFAN zmIn*31Qb`P{oxd|=2o5l$|&AdTJ_-*k&5oDQvTTOiN8OjczyHyta+0x00(SB)/8, $0x0201000706050403 -DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·c48<>(SB), (NOPTR+RODATA), $16 - -#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v6, t1; \ - PUNPCKLQDQ v6, t2; \ - PUNPCKHQDQ v7, v6; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ v7, t2; \ - MOVO t1, v7; \ - MOVO v2, t1; \ - PUNPCKHQDQ t2, v7; \ - PUNPCKLQDQ v3, t2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v3 - -#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v2, t1; \ - PUNPCKLQDQ v2, t2; \ - PUNPCKHQDQ v3, v2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ v3, t2; \ - MOVO t1, v3; \ - MOVO v6, t1; \ - PUNPCKHQDQ t2, v3; \ - PUNPCKLQDQ v7, t2; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v7 - -#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, t0, c40, c48) \ - MOVO v0, t0; \ - PMULULQ v2, t0; \ - PADDQ v2, v0; \ - PADDQ t0, v0; \ - PADDQ t0, v0; \ - PXOR v0, v6; \ - PSHUFD $0xB1, v6, v6; \ - MOVO v4, t0; \ - PMULULQ v6, t0; \ - PADDQ v6, v4; \ - PADDQ t0, v4; \ - PADDQ t0, v4; \ - PXOR v4, v2; \ - PSHUFB c40, v2; \ - MOVO v0, t0; \ - PMULULQ v2, t0; \ - PADDQ v2, v0; \ - PADDQ t0, v0; \ - PADDQ t0, v0; \ - PXOR v0, v6; \ - PSHUFB c48, v6; \ - MOVO v4, t0; \ - PMULULQ v6, t0; \ - PADDQ v6, v4; \ - PADDQ t0, v4; \ - PADDQ t0, v4; \ - PXOR v4, v2; \ - MOVO v2, t0; \ - PADDQ v2, t0; \ - PSRLQ $63, v2; \ - PXOR t0, v2; \ - MOVO v1, t0; \ - PMULULQ v3, t0; \ - PADDQ v3, v1; \ - PADDQ t0, v1; \ - PADDQ t0, v1; \ - PXOR v1, v7; \ - PSHUFD $0xB1, v7, v7; \ - MOVO v5, t0; \ - PMULULQ v7, t0; \ - PADDQ v7, v5; \ - PADDQ t0, v5; \ - PADDQ t0, v5; \ - PXOR v5, v3; \ - PSHUFB c40, v3; \ - MOVO v1, t0; \ - PMULULQ v3, t0; \ - PADDQ v3, v1; \ - PADDQ t0, v1; \ - PADDQ t0, v1; \ - PXOR v1, v7; \ - PSHUFB c48, v7; \ - MOVO v5, t0; \ - PMULULQ v7, t0; \ - PADDQ v7, v5; \ - PADDQ t0, v5; \ - PADDQ t0, v5; \ - PXOR v5, v3; \ - MOVO v3, t0; \ - PADDQ v3, t0; \ - PSRLQ $63, v3; \ - PXOR t0, v3 - -#define LOAD_MSG_0(block, off) \ - MOVOU 8*(off+0)(block), X0; \ - MOVOU 8*(off+2)(block), X1; \ - MOVOU 8*(off+4)(block), X2; \ - MOVOU 8*(off+6)(block), X3; \ - MOVOU 8*(off+8)(block), X4; \ - MOVOU 8*(off+10)(block), X5; \ - MOVOU 8*(off+12)(block), X6; \ - MOVOU 8*(off+14)(block), X7 - -#define STORE_MSG_0(block, off) \ - MOVOU X0, 8*(off+0)(block); \ - MOVOU X1, 8*(off+2)(block); \ - MOVOU X2, 8*(off+4)(block); \ - MOVOU X3, 8*(off+6)(block); \ - MOVOU X4, 8*(off+8)(block); \ - MOVOU X5, 8*(off+10)(block); \ - MOVOU X6, 8*(off+12)(block); \ - MOVOU X7, 8*(off+14)(block) - -#define LOAD_MSG_1(block, off) \ - MOVOU 8*off+0*8(block), X0; \ - MOVOU 8*off+16*8(block), X1; \ - MOVOU 8*off+32*8(block), X2; \ - MOVOU 8*off+48*8(block), X3; \ - MOVOU 8*off+64*8(block), X4; \ - MOVOU 8*off+80*8(block), X5; \ - MOVOU 8*off+96*8(block), X6; \ - MOVOU 8*off+112*8(block), X7 - -#define STORE_MSG_1(block, off) \ - MOVOU X0, 8*off+0*8(block); \ - MOVOU X1, 8*off+16*8(block); \ - MOVOU X2, 8*off+32*8(block); \ - MOVOU X3, 8*off+48*8(block); \ - MOVOU X4, 8*off+64*8(block); \ - MOVOU X5, 8*off+80*8(block); \ - MOVOU X6, 8*off+96*8(block); \ - MOVOU X7, 8*off+112*8(block) - -#define BLAMKA_ROUND_0(block, off, t0, t1, c40, c48) \ - LOAD_MSG_0(block, off); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \ - STORE_MSG_0(block, off) - -#define BLAMKA_ROUND_1(block, off, t0, t1, c40, c48) \ - LOAD_MSG_1(block, off); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE(X2, X3, X4, X5, X6, X7, t0, t1); \ - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, t0, c40, c48); \ - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, t0, t1); \ - STORE_MSG_1(block, off) - // func blamkaSSE4(b *block) -TEXT ·blamkaSSE4(SB), 4, $0-8 - MOVQ b+0(FP), AX - - MOVOU ·c40<>(SB), X10 - MOVOU ·c48<>(SB), X11 +// Requires: SSE2, SSSE3 +TEXT ·blamkaSSE4(SB), NOSPLIT, $0-8 + MOVQ b+0(FP), AX + MOVOU ·c40<>+0(SB), X10 + MOVOU ·c48<>+0(SB), X11 + MOVOU (AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU 64(AX), X4 + MOVOU 80(AX), X5 + MOVOU 96(AX), X6 + MOVOU 112(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 32(AX) + MOVOU X3, 48(AX) + MOVOU X4, 64(AX) + MOVOU X5, 80(AX) + MOVOU X6, 96(AX) + MOVOU X7, 112(AX) + MOVOU 128(AX), X0 + MOVOU 144(AX), X1 + MOVOU 160(AX), X2 + MOVOU 176(AX), X3 + MOVOU 192(AX), X4 + MOVOU 208(AX), X5 + MOVOU 224(AX), X6 + MOVOU 240(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 128(AX) + MOVOU X1, 144(AX) + MOVOU X2, 160(AX) + MOVOU X3, 176(AX) + MOVOU X4, 192(AX) + MOVOU X5, 208(AX) + MOVOU X6, 224(AX) + MOVOU X7, 240(AX) + MOVOU 256(AX), X0 + MOVOU 272(AX), X1 + MOVOU 288(AX), X2 + MOVOU 304(AX), X3 + MOVOU 320(AX), X4 + MOVOU 336(AX), X5 + MOVOU 352(AX), X6 + MOVOU 368(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 256(AX) + MOVOU X1, 272(AX) + MOVOU X2, 288(AX) + MOVOU X3, 304(AX) + MOVOU X4, 320(AX) + MOVOU X5, 336(AX) + MOVOU X6, 352(AX) + MOVOU X7, 368(AX) + MOVOU 384(AX), X0 + MOVOU 400(AX), X1 + MOVOU 416(AX), X2 + MOVOU 432(AX), X3 + MOVOU 448(AX), X4 + MOVOU 464(AX), X5 + MOVOU 480(AX), X6 + MOVOU 496(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 384(AX) + MOVOU X1, 400(AX) + MOVOU X2, 416(AX) + MOVOU X3, 432(AX) + MOVOU X4, 448(AX) + MOVOU X5, 464(AX) + MOVOU X6, 480(AX) + MOVOU X7, 496(AX) + MOVOU 512(AX), X0 + MOVOU 528(AX), X1 + MOVOU 544(AX), X2 + MOVOU 560(AX), X3 + MOVOU 576(AX), X4 + MOVOU 592(AX), X5 + MOVOU 608(AX), X6 + MOVOU 624(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 512(AX) + MOVOU X1, 528(AX) + MOVOU X2, 544(AX) + MOVOU X3, 560(AX) + MOVOU X4, 576(AX) + MOVOU X5, 592(AX) + MOVOU X6, 608(AX) + MOVOU X7, 624(AX) + MOVOU 640(AX), X0 + MOVOU 656(AX), X1 + MOVOU 672(AX), X2 + MOVOU 688(AX), X3 + MOVOU 704(AX), X4 + MOVOU 720(AX), X5 + MOVOU 736(AX), X6 + MOVOU 752(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 640(AX) + MOVOU X1, 656(AX) + MOVOU X2, 672(AX) + MOVOU X3, 688(AX) + MOVOU X4, 704(AX) + MOVOU X5, 720(AX) + MOVOU X6, 736(AX) + MOVOU X7, 752(AX) + MOVOU 768(AX), X0 + MOVOU 784(AX), X1 + MOVOU 800(AX), X2 + MOVOU 816(AX), X3 + MOVOU 832(AX), X4 + MOVOU 848(AX), X5 + MOVOU 864(AX), X6 + MOVOU 880(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 768(AX) + MOVOU X1, 784(AX) + MOVOU X2, 800(AX) + MOVOU X3, 816(AX) + MOVOU X4, 832(AX) + MOVOU X5, 848(AX) + MOVOU X6, 864(AX) + MOVOU X7, 880(AX) + MOVOU 896(AX), X0 + MOVOU 912(AX), X1 + MOVOU 928(AX), X2 + MOVOU 944(AX), X3 + MOVOU 960(AX), X4 + MOVOU 976(AX), X5 + MOVOU 992(AX), X6 + MOVOU 1008(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 896(AX) + MOVOU X1, 912(AX) + MOVOU X2, 928(AX) + MOVOU X3, 944(AX) + MOVOU X4, 960(AX) + MOVOU X5, 976(AX) + MOVOU X6, 992(AX) + MOVOU X7, 1008(AX) + MOVOU (AX), X0 + MOVOU 128(AX), X1 + MOVOU 256(AX), X2 + MOVOU 384(AX), X3 + MOVOU 512(AX), X4 + MOVOU 640(AX), X5 + MOVOU 768(AX), X6 + MOVOU 896(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, (AX) + MOVOU X1, 128(AX) + MOVOU X2, 256(AX) + MOVOU X3, 384(AX) + MOVOU X4, 512(AX) + MOVOU X5, 640(AX) + MOVOU X6, 768(AX) + MOVOU X7, 896(AX) + MOVOU 16(AX), X0 + MOVOU 144(AX), X1 + MOVOU 272(AX), X2 + MOVOU 400(AX), X3 + MOVOU 528(AX), X4 + MOVOU 656(AX), X5 + MOVOU 784(AX), X6 + MOVOU 912(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 16(AX) + MOVOU X1, 144(AX) + MOVOU X2, 272(AX) + MOVOU X3, 400(AX) + MOVOU X4, 528(AX) + MOVOU X5, 656(AX) + MOVOU X6, 784(AX) + MOVOU X7, 912(AX) + MOVOU 32(AX), X0 + MOVOU 160(AX), X1 + MOVOU 288(AX), X2 + MOVOU 416(AX), X3 + MOVOU 544(AX), X4 + MOVOU 672(AX), X5 + MOVOU 800(AX), X6 + MOVOU 928(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 32(AX) + MOVOU X1, 160(AX) + MOVOU X2, 288(AX) + MOVOU X3, 416(AX) + MOVOU X4, 544(AX) + MOVOU X5, 672(AX) + MOVOU X6, 800(AX) + MOVOU X7, 928(AX) + MOVOU 48(AX), X0 + MOVOU 176(AX), X1 + MOVOU 304(AX), X2 + MOVOU 432(AX), X3 + MOVOU 560(AX), X4 + MOVOU 688(AX), X5 + MOVOU 816(AX), X6 + MOVOU 944(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 48(AX) + MOVOU X1, 176(AX) + MOVOU X2, 304(AX) + MOVOU X3, 432(AX) + MOVOU X4, 560(AX) + MOVOU X5, 688(AX) + MOVOU X6, 816(AX) + MOVOU X7, 944(AX) + MOVOU 64(AX), X0 + MOVOU 192(AX), X1 + MOVOU 320(AX), X2 + MOVOU 448(AX), X3 + MOVOU 576(AX), X4 + MOVOU 704(AX), X5 + MOVOU 832(AX), X6 + MOVOU 960(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 64(AX) + MOVOU X1, 192(AX) + MOVOU X2, 320(AX) + MOVOU X3, 448(AX) + MOVOU X4, 576(AX) + MOVOU X5, 704(AX) + MOVOU X6, 832(AX) + MOVOU X7, 960(AX) + MOVOU 80(AX), X0 + MOVOU 208(AX), X1 + MOVOU 336(AX), X2 + MOVOU 464(AX), X3 + MOVOU 592(AX), X4 + MOVOU 720(AX), X5 + MOVOU 848(AX), X6 + MOVOU 976(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 80(AX) + MOVOU X1, 208(AX) + MOVOU X2, 336(AX) + MOVOU X3, 464(AX) + MOVOU X4, 592(AX) + MOVOU X5, 720(AX) + MOVOU X6, 848(AX) + MOVOU X7, 976(AX) + MOVOU 96(AX), X0 + MOVOU 224(AX), X1 + MOVOU 352(AX), X2 + MOVOU 480(AX), X3 + MOVOU 608(AX), X4 + MOVOU 736(AX), X5 + MOVOU 864(AX), X6 + MOVOU 992(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 96(AX) + MOVOU X1, 224(AX) + MOVOU X2, 352(AX) + MOVOU X3, 480(AX) + MOVOU X4, 608(AX) + MOVOU X5, 736(AX) + MOVOU X6, 864(AX) + MOVOU X7, 992(AX) + MOVOU 112(AX), X0 + MOVOU 240(AX), X1 + MOVOU 368(AX), X2 + MOVOU 496(AX), X3 + MOVOU 624(AX), X4 + MOVOU 752(AX), X5 + MOVOU 880(AX), X6 + MOVOU 1008(AX), X7 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFD $0xb1, X6, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + PSHUFB X10, X2 + MOVO X0, X8 + PMULULQ X2, X8 + PADDQ X2, X0 + PADDQ X8, X0 + PADDQ X8, X0 + PXOR X0, X6 + PSHUFB X11, X6 + MOVO X4, X8 + PMULULQ X6, X8 + PADDQ X6, X4 + PADDQ X8, X4 + PADDQ X8, X4 + PXOR X4, X2 + MOVO X2, X8 + PADDQ X2, X8 + PSRLQ $0x3f, X2 + PXOR X8, X2 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFD $0xb1, X7, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + PSHUFB X10, X3 + MOVO X1, X8 + PMULULQ X3, X8 + PADDQ X3, X1 + PADDQ X8, X1 + PADDQ X8, X1 + PXOR X1, X7 + PSHUFB X11, X7 + MOVO X5, X8 + PMULULQ X7, X8 + PADDQ X7, X5 + PADDQ X8, X5 + PADDQ X8, X5 + PXOR X5, X3 + MOVO X3, X8 + PADDQ X3, X8 + PSRLQ $0x3f, X3 + PXOR X8, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU X0, 112(AX) + MOVOU X1, 240(AX) + MOVOU X2, 368(AX) + MOVOU X3, 496(AX) + MOVOU X4, 624(AX) + MOVOU X5, 752(AX) + MOVOU X6, 880(AX) + MOVOU X7, 1008(AX) + RET - BLAMKA_ROUND_0(AX, 0, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 16, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 32, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 48, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 64, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 80, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 96, X8, X9, X10, X11) - BLAMKA_ROUND_0(AX, 112, X8, X9, X10, X11) +DATA ·c40<>+0(SB)/8, $0x0201000706050403 +DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·c40<>(SB), RODATA|NOPTR, $16 - BLAMKA_ROUND_1(AX, 0, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 2, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 4, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 6, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 8, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 10, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 12, X8, X9, X10, X11) - BLAMKA_ROUND_1(AX, 14, X8, X9, X10, X11) - RET +DATA ·c48<>+0(SB)/8, $0x0100070605040302 +DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·c48<>(SB), RODATA|NOPTR, $16 -// func mixBlocksSSE2(out, a, b, c *block) -TEXT ·mixBlocksSSE2(SB), 4, $0-32 +// func mixBlocksSSE2(out *block, a *block, b *block, c *block) +// Requires: SSE2 +TEXT ·mixBlocksSSE2(SB), NOSPLIT, $0-32 MOVQ out+0(FP), DX MOVQ a+8(FP), AX MOVQ b+16(FP), BX MOVQ c+24(FP), CX - MOVQ $128, DI + MOVQ $0x00000080, DI loop: - MOVOU 0(AX), X0 - MOVOU 0(BX), X1 - MOVOU 0(CX), X2 + MOVOU (AX), X0 + MOVOU (BX), X1 + MOVOU (CX), X2 PXOR X1, X0 PXOR X2, X0 - MOVOU X0, 0(DX) - ADDQ $16, AX - ADDQ $16, BX - ADDQ $16, CX - ADDQ $16, DX - SUBQ $2, DI + MOVOU X0, (DX) + ADDQ $0x10, AX + ADDQ $0x10, BX + ADDQ $0x10, CX + ADDQ $0x10, DX + SUBQ $0x02, DI JA loop RET -// func xorBlocksSSE2(out, a, b, c *block) -TEXT ·xorBlocksSSE2(SB), 4, $0-32 +// func xorBlocksSSE2(out *block, a *block, b *block, c *block) +// Requires: SSE2 +TEXT ·xorBlocksSSE2(SB), NOSPLIT, $0-32 MOVQ out+0(FP), DX MOVQ a+8(FP), AX MOVQ b+16(FP), BX MOVQ c+24(FP), CX - MOVQ $128, DI + MOVQ $0x00000080, DI loop: - MOVOU 0(AX), X0 - MOVOU 0(BX), X1 - MOVOU 0(CX), X2 - MOVOU 0(DX), X3 + MOVOU (AX), X0 + MOVOU (BX), X1 + MOVOU (CX), X2 + MOVOU (DX), X3 PXOR X1, X0 PXOR X2, X0 PXOR X3, X0 - MOVOU X0, 0(DX) - ADDQ $16, AX - ADDQ $16, BX - ADDQ $16, CX - ADDQ $16, DX - SUBQ $2, DI + MOVOU X0, (DX) + ADDQ $0x10, AX + ADDQ $0x10, BX + ADDQ $0x10, CX + ADDQ $0x10, DX + SUBQ $0x02, DI JA loop RET diff --git a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s index 9ae8206c20165..f75162e039c10 100644 --- a/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s +++ b/vendor/golang.org/x/crypto/blake2b/blake2bAVX2_amd64.s @@ -1,722 +1,4517 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2bAVX2_amd64_asm.go -out ../../blake2bAVX2_amd64.s -pkg blake2b. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·AVX2_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·AVX2_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -DATA ·AVX2_iv0<>+0x10(SB)/8, $0x3c6ef372fe94f82b -DATA ·AVX2_iv0<>+0x18(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·AVX2_iv0<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_iv1<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·AVX2_iv1<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -DATA ·AVX2_iv1<>+0x10(SB)/8, $0x1f83d9abfb41bd6b -DATA ·AVX2_iv1<>+0x18(SB)/8, $0x5be0cd19137e2179 -GLOBL ·AVX2_iv1<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·AVX2_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -DATA ·AVX2_c40<>+0x10(SB)/8, $0x0201000706050403 -DATA ·AVX2_c40<>+0x18(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·AVX2_c40<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX2_c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·AVX2_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -DATA ·AVX2_c48<>+0x10(SB)/8, $0x0100070605040302 -DATA ·AVX2_c48<>+0x18(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·AVX2_c48<>(SB), (NOPTR+RODATA), $32 - -DATA ·AVX_iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·AVX_iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -GLOBL ·AVX_iv0<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b -DATA ·AVX_iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·AVX_iv1<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv2<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·AVX_iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -GLOBL ·AVX_iv2<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b -DATA ·AVX_iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 -GLOBL ·AVX_iv3<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·AVX_c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·AVX_c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·AVX_c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·AVX_c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·AVX_c48<>(SB), (NOPTR+RODATA), $16 - -#define VPERMQ_0x39_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x39 -#define VPERMQ_0x93_Y1_Y1 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xc9; BYTE $0x93 -#define VPERMQ_0x4E_Y2_Y2 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xd2; BYTE $0x4e -#define VPERMQ_0x93_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x93 -#define VPERMQ_0x39_Y3_Y3 BYTE $0xc4; BYTE $0xe3; BYTE $0xfd; BYTE $0x00; BYTE $0xdb; BYTE $0x39 - -#define ROUND_AVX2(m0, m1, m2, m3, t, c40, c48) \ - VPADDQ m0, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFD $-79, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPSHUFB c40, Y1, Y1; \ - VPADDQ m1, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFB c48, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPADDQ Y1, Y1, t; \ - VPSRLQ $63, Y1, Y1; \ - VPXOR t, Y1, Y1; \ - VPERMQ_0x39_Y1_Y1; \ - VPERMQ_0x4E_Y2_Y2; \ - VPERMQ_0x93_Y3_Y3; \ - VPADDQ m2, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFD $-79, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPSHUFB c40, Y1, Y1; \ - VPADDQ m3, Y0, Y0; \ - VPADDQ Y1, Y0, Y0; \ - VPXOR Y0, Y3, Y3; \ - VPSHUFB c48, Y3, Y3; \ - VPADDQ Y3, Y2, Y2; \ - VPXOR Y2, Y1, Y1; \ - VPADDQ Y1, Y1, t; \ - VPSRLQ $63, Y1, Y1; \ - VPXOR t, Y1, Y1; \ - VPERMQ_0x39_Y3_Y3; \ - VPERMQ_0x4E_Y2_Y2; \ - VPERMQ_0x93_Y1_Y1 - -#define VMOVQ_SI_X11_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x1E -#define VMOVQ_SI_X12_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x26 -#define VMOVQ_SI_X13_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x2E -#define VMOVQ_SI_X14_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x36 -#define VMOVQ_SI_X15_0 BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x3E - -#define VMOVQ_SI_X11(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x5E; BYTE $n -#define VMOVQ_SI_X12(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x66; BYTE $n -#define VMOVQ_SI_X13(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x6E; BYTE $n -#define VMOVQ_SI_X14(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x76; BYTE $n -#define VMOVQ_SI_X15(n) BYTE $0xC5; BYTE $0x7A; BYTE $0x7E; BYTE $0x7E; BYTE $n - -#define VPINSRQ_1_SI_X11_0 BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x1E; BYTE $0x01 -#define VPINSRQ_1_SI_X12_0 BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x26; BYTE $0x01 -#define VPINSRQ_1_SI_X13_0 BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x2E; BYTE $0x01 -#define VPINSRQ_1_SI_X14_0 BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x36; BYTE $0x01 -#define VPINSRQ_1_SI_X15_0 BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x3E; BYTE $0x01 - -#define VPINSRQ_1_SI_X11(n) BYTE $0xC4; BYTE $0x63; BYTE $0xA1; BYTE $0x22; BYTE $0x5E; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X12(n) BYTE $0xC4; BYTE $0x63; BYTE $0x99; BYTE $0x22; BYTE $0x66; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X13(n) BYTE $0xC4; BYTE $0x63; BYTE $0x91; BYTE $0x22; BYTE $0x6E; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X14(n) BYTE $0xC4; BYTE $0x63; BYTE $0x89; BYTE $0x22; BYTE $0x76; BYTE $n; BYTE $0x01 -#define VPINSRQ_1_SI_X15(n) BYTE $0xC4; BYTE $0x63; BYTE $0x81; BYTE $0x22; BYTE $0x7E; BYTE $n; BYTE $0x01 - -#define VMOVQ_R8_X15 BYTE $0xC4; BYTE $0x41; BYTE $0xF9; BYTE $0x6E; BYTE $0xF8 -#define VPINSRQ_1_R9_X15 BYTE $0xC4; BYTE $0x43; BYTE $0x81; BYTE $0x22; BYTE $0xF9; BYTE $0x01 - -// load msg: Y12 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y12(i0, i1, i2, i3) \ - VMOVQ_SI_X12(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X12(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y12, Y12 - -// load msg: Y13 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y13(i0, i1, i2, i3) \ - VMOVQ_SI_X13(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X13(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y13, Y13 - -// load msg: Y14 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y14(i0, i1, i2, i3) \ - VMOVQ_SI_X14(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X14(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y14, Y14 - -// load msg: Y15 = (i0, i1, i2, i3) -// i0, i1, i2, i3 must not be 0 -#define LOAD_MSG_AVX2_Y15(i0, i1, i2, i3) \ - VMOVQ_SI_X15(i0*8); \ - VMOVQ_SI_X11(i2*8); \ - VPINSRQ_1_SI_X15(i1*8); \ - VPINSRQ_1_SI_X11(i3*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() \ - VMOVQ_SI_X12_0; \ - VMOVQ_SI_X11(4*8); \ - VPINSRQ_1_SI_X12(2*8); \ - VPINSRQ_1_SI_X11(6*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(1, 3, 5, 7); \ - LOAD_MSG_AVX2_Y14(8, 10, 12, 14); \ - LOAD_MSG_AVX2_Y15(9, 11, 13, 15) - -#define LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() \ - LOAD_MSG_AVX2_Y12(14, 4, 9, 13); \ - LOAD_MSG_AVX2_Y13(10, 8, 15, 6); \ - VMOVQ_SI_X11(11*8); \ - VPSHUFD $0x4E, 0*8(SI), X14; \ - VPINSRQ_1_SI_X11(5*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - LOAD_MSG_AVX2_Y15(12, 2, 7, 3) - -#define LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() \ - VMOVQ_SI_X11(5*8); \ - VMOVDQU 11*8(SI), X12; \ - VPINSRQ_1_SI_X11(15*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - VMOVQ_SI_X13(8*8); \ - VMOVQ_SI_X11(2*8); \ - VPINSRQ_1_SI_X13_0; \ - VPINSRQ_1_SI_X11(13*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(10, 3, 7, 9); \ - LOAD_MSG_AVX2_Y15(14, 6, 1, 4) - -#define LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() \ - LOAD_MSG_AVX2_Y12(7, 3, 13, 11); \ - LOAD_MSG_AVX2_Y13(9, 1, 12, 14); \ - LOAD_MSG_AVX2_Y14(2, 5, 4, 15); \ - VMOVQ_SI_X15(6*8); \ - VMOVQ_SI_X11_0; \ - VPINSRQ_1_SI_X15(10*8); \ - VPINSRQ_1_SI_X11(8*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() \ - LOAD_MSG_AVX2_Y12(9, 5, 2, 10); \ - VMOVQ_SI_X13_0; \ - VMOVQ_SI_X11(4*8); \ - VPINSRQ_1_SI_X13(7*8); \ - VPINSRQ_1_SI_X11(15*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(14, 11, 6, 3); \ - LOAD_MSG_AVX2_Y15(1, 12, 8, 13) - -#define LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X11_0; \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X11(8*8); \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(12, 10, 11, 3); \ - LOAD_MSG_AVX2_Y14(4, 7, 15, 1); \ - LOAD_MSG_AVX2_Y15(13, 5, 14, 9) - -#define LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() \ - LOAD_MSG_AVX2_Y12(12, 1, 14, 4); \ - LOAD_MSG_AVX2_Y13(5, 15, 13, 10); \ - VMOVQ_SI_X14_0; \ - VPSHUFD $0x4E, 8*8(SI), X11; \ - VPINSRQ_1_SI_X14(6*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - LOAD_MSG_AVX2_Y15(7, 3, 2, 11) - -#define LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() \ - LOAD_MSG_AVX2_Y12(13, 7, 12, 3); \ - LOAD_MSG_AVX2_Y13(11, 14, 1, 9); \ - LOAD_MSG_AVX2_Y14(5, 15, 8, 2); \ - VMOVQ_SI_X15_0; \ - VMOVQ_SI_X11(6*8); \ - VPINSRQ_1_SI_X15(4*8); \ - VPINSRQ_1_SI_X11(10*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() \ - VMOVQ_SI_X12(6*8); \ - VMOVQ_SI_X11(11*8); \ - VPINSRQ_1_SI_X12(14*8); \ - VPINSRQ_1_SI_X11_0; \ - VINSERTI128 $1, X11, Y12, Y12; \ - LOAD_MSG_AVX2_Y13(15, 9, 3, 8); \ - VMOVQ_SI_X11(1*8); \ - VMOVDQU 12*8(SI), X14; \ - VPINSRQ_1_SI_X11(10*8); \ - VINSERTI128 $1, X11, Y14, Y14; \ - VMOVQ_SI_X15(2*8); \ - VMOVDQU 4*8(SI), X11; \ - VPINSRQ_1_SI_X15(7*8); \ - VINSERTI128 $1, X11, Y15, Y15 - -#define LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() \ - LOAD_MSG_AVX2_Y12(10, 8, 7, 1); \ - VMOVQ_SI_X13(2*8); \ - VPSHUFD $0x4E, 5*8(SI), X11; \ - VPINSRQ_1_SI_X13(4*8); \ - VINSERTI128 $1, X11, Y13, Y13; \ - LOAD_MSG_AVX2_Y14(15, 9, 3, 13); \ - VMOVQ_SI_X15(11*8); \ - VMOVQ_SI_X11(12*8); \ - VPINSRQ_1_SI_X15(14*8); \ - VPINSRQ_1_SI_X11_0; \ - VINSERTI128 $1, X11, Y15, Y15 - // func hashBlocksAVX2(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksAVX2(SB), 4, $320-48 // frame size = 288 + 32 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, DX - ADDQ $31, DX - ANDQ $~31, DX - - MOVQ CX, 16(DX) - XORQ CX, CX - MOVQ CX, 24(DX) - - VMOVDQU ·AVX2_c40<>(SB), Y4 - VMOVDQU ·AVX2_c48<>(SB), Y5 - - VMOVDQU 0(AX), Y8 +// Requires: AVX, AVX2 +TEXT ·hashBlocksAVX2(SB), NOSPLIT, $320-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, DX + ADDQ $+31, DX + ANDQ $-32, DX + MOVQ CX, 16(DX) + XORQ CX, CX + MOVQ CX, 24(DX) + VMOVDQU ·AVX2_c40<>+0(SB), Y4 + VMOVDQU ·AVX2_c48<>+0(SB), Y5 + VMOVDQU (AX), Y8 VMOVDQU 32(AX), Y9 - VMOVDQU ·AVX2_iv0<>(SB), Y6 - VMOVDQU ·AVX2_iv1<>(SB), Y7 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 - MOVQ R9, 8(DX) + VMOVDQU ·AVX2_iv0<>+0(SB), Y6 + VMOVDQU ·AVX2_iv1<>+0(SB), Y7 + MOVQ (BX), R8 + MOVQ 8(BX), R9 + MOVQ R9, 8(DX) loop: - ADDQ $128, R8 - MOVQ R8, 0(DX) - CMPQ R8, $128 + ADDQ $0x80, R8 + MOVQ R8, (DX) + CMPQ R8, $0x80 JGE noinc INCQ R9 MOVQ R9, 8(DX) noinc: - VMOVDQA Y8, Y0 - VMOVDQA Y9, Y1 - VMOVDQA Y6, Y2 - VPXOR 0(DX), Y7, Y3 - - LOAD_MSG_AVX2_0_2_4_6_1_3_5_7_8_10_12_14_9_11_13_15() - VMOVDQA Y12, 32(DX) - VMOVDQA Y13, 64(DX) - VMOVDQA Y14, 96(DX) - VMOVDQA Y15, 128(DX) - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_14_4_9_13_10_8_15_6_1_0_11_5_12_2_7_3() - VMOVDQA Y12, 160(DX) - VMOVDQA Y13, 192(DX) - VMOVDQA Y14, 224(DX) - VMOVDQA Y15, 256(DX) - - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_11_12_5_15_8_0_2_13_10_3_7_9_14_6_1_4() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_7_3_13_11_9_1_12_14_2_5_4_15_6_10_0_8() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_9_5_2_10_0_7_4_15_14_11_6_3_1_12_8_13() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_2_6_0_8_12_10_11_3_4_7_15_1_13_5_14_9() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_12_1_14_4_5_15_13_10_0_6_9_8_7_3_2_11() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_13_7_12_3_11_14_1_9_5_15_8_2_0_4_6_10() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_6_14_11_0_15_9_3_8_12_13_1_10_2_7_4_5() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - LOAD_MSG_AVX2_10_8_7_1_2_4_6_5_15_9_3_13_11_14_12_0() - ROUND_AVX2(Y12, Y13, Y14, Y15, Y10, Y4, Y5) - - ROUND_AVX2(32(DX), 64(DX), 96(DX), 128(DX), Y10, Y4, Y5) - ROUND_AVX2(160(DX), 192(DX), 224(DX), 256(DX), Y10, Y4, Y5) - - VPXOR Y0, Y8, Y8 - VPXOR Y1, Y9, Y9 - VPXOR Y2, Y8, Y8 - VPXOR Y3, Y9, Y9 - - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop - - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) - - VMOVDQU Y8, 0(AX) - VMOVDQU Y9, 32(AX) + VMOVDQA Y8, Y0 + VMOVDQA Y9, Y1 + VMOVDQA Y6, Y2 + VPXOR (DX), Y7, Y3 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x26 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x28 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x38 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x70 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VMOVDQA Y12, 32(DX) + VMOVDQA Y13, 64(DX) + VMOVDQA Y14, 96(DX) + VMOVDQA Y15, 128(DX) + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x48 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + VPSHUFD $0x4e, (SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x28 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VMOVDQA Y12, 160(DX) + VMOVDQA Y13, 192(DX) + VMOVDQA Y14, 224(DX) + VMOVDQA Y15, 256(DX) + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x28 + VMOVDQU 88(SI), X12 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x2e + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x58 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x70 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x1e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x2e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x78 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x60 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x1e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x08 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x36 + VPSHUFD $0x4e, 64(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x30 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x58 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x18 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x48 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x10 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x3e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x1e + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x18 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x40 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x08 + VMOVDQU 96(SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x50 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x10 + VMOVDQU 32(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x38 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x08 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y12, Y12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x10 + VPSHUFD $0x4e, 40(SI), X11 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x20 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y13, Y13 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x18 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x5e + BYTE $0x68 + BYTE $0x01 + VINSERTI128 $0x01, X11, Y14, Y14 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x5e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0xa1 + BYTE $0x22 + BYTE $0x1e + BYTE $0x01 + VINSERTI128 $0x01, X11, Y15, Y15 + VPADDQ Y12, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y13, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ Y14, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ Y15, Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPADDQ 32(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 64(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ 96(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 128(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPADDQ 160(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 192(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x93 + VPADDQ 224(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFD $-79, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPSHUFB Y4, Y1, Y1 + VPADDQ 256(DX), Y0, Y0 + VPADDQ Y1, Y0, Y0 + VPXOR Y0, Y3, Y3 + VPSHUFB Y5, Y3, Y3 + VPADDQ Y3, Y2, Y2 + VPXOR Y2, Y1, Y1 + VPADDQ Y1, Y1, Y10 + VPSRLQ $0x3f, Y1, Y1 + VPXOR Y10, Y1, Y1 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xdb + BYTE $0x39 + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xd2 + BYTE $0x4e + BYTE $0xc4 + BYTE $0xe3 + BYTE $0xfd + BYTE $0x00 + BYTE $0xc9 + BYTE $0x93 + VPXOR Y0, Y8, Y8 + VPXOR Y1, Y9, Y9 + VPXOR Y2, Y8, Y8 + VPXOR Y3, Y9, Y9 + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + MOVQ R8, (BX) + MOVQ R9, 8(BX) + VMOVDQU Y8, (AX) + VMOVDQU Y9, 32(AX) VZEROUPPER - RET -#define VPUNPCKLQDQ_X2_X2_X15 BYTE $0xC5; BYTE $0x69; BYTE $0x6C; BYTE $0xFA -#define VPUNPCKLQDQ_X3_X3_X15 BYTE $0xC5; BYTE $0x61; BYTE $0x6C; BYTE $0xFB -#define VPUNPCKLQDQ_X7_X7_X15 BYTE $0xC5; BYTE $0x41; BYTE $0x6C; BYTE $0xFF -#define VPUNPCKLQDQ_X13_X13_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x11; BYTE $0x6C; BYTE $0xFD -#define VPUNPCKLQDQ_X14_X14_X15 BYTE $0xC4; BYTE $0x41; BYTE $0x09; BYTE $0x6C; BYTE $0xFE - -#define VPUNPCKHQDQ_X15_X2_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x69; BYTE $0x6D; BYTE $0xD7 -#define VPUNPCKHQDQ_X15_X3_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xDF -#define VPUNPCKHQDQ_X15_X6_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x49; BYTE $0x6D; BYTE $0xF7 -#define VPUNPCKHQDQ_X15_X7_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xFF -#define VPUNPCKHQDQ_X15_X3_X2 BYTE $0xC4; BYTE $0xC1; BYTE $0x61; BYTE $0x6D; BYTE $0xD7 -#define VPUNPCKHQDQ_X15_X7_X6 BYTE $0xC4; BYTE $0xC1; BYTE $0x41; BYTE $0x6D; BYTE $0xF7 -#define VPUNPCKHQDQ_X15_X13_X3 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xDF -#define VPUNPCKHQDQ_X15_X13_X7 BYTE $0xC4; BYTE $0xC1; BYTE $0x11; BYTE $0x6D; BYTE $0xFF - -#define SHUFFLE_AVX() \ - VMOVDQA X6, X13; \ - VMOVDQA X2, X14; \ - VMOVDQA X4, X6; \ - VPUNPCKLQDQ_X13_X13_X15; \ - VMOVDQA X5, X4; \ - VMOVDQA X6, X5; \ - VPUNPCKHQDQ_X15_X7_X6; \ - VPUNPCKLQDQ_X7_X7_X15; \ - VPUNPCKHQDQ_X15_X13_X7; \ - VPUNPCKLQDQ_X3_X3_X15; \ - VPUNPCKHQDQ_X15_X2_X2; \ - VPUNPCKLQDQ_X14_X14_X15; \ - VPUNPCKHQDQ_X15_X3_X3; \ - -#define SHUFFLE_AVX_INV() \ - VMOVDQA X2, X13; \ - VMOVDQA X4, X14; \ - VPUNPCKLQDQ_X2_X2_X15; \ - VMOVDQA X5, X4; \ - VPUNPCKHQDQ_X15_X3_X2; \ - VMOVDQA X14, X5; \ - VPUNPCKLQDQ_X3_X3_X15; \ - VMOVDQA X6, X14; \ - VPUNPCKHQDQ_X15_X13_X3; \ - VPUNPCKLQDQ_X7_X7_X15; \ - VPUNPCKHQDQ_X15_X6_X6; \ - VPUNPCKLQDQ_X14_X14_X15; \ - VPUNPCKHQDQ_X15_X7_X7; \ - -#define HALF_ROUND_AVX(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \ - VPADDQ m0, v0, v0; \ - VPADDQ v2, v0, v0; \ - VPADDQ m1, v1, v1; \ - VPADDQ v3, v1, v1; \ - VPXOR v0, v6, v6; \ - VPXOR v1, v7, v7; \ - VPSHUFD $-79, v6, v6; \ - VPSHUFD $-79, v7, v7; \ - VPADDQ v6, v4, v4; \ - VPADDQ v7, v5, v5; \ - VPXOR v4, v2, v2; \ - VPXOR v5, v3, v3; \ - VPSHUFB c40, v2, v2; \ - VPSHUFB c40, v3, v3; \ - VPADDQ m2, v0, v0; \ - VPADDQ v2, v0, v0; \ - VPADDQ m3, v1, v1; \ - VPADDQ v3, v1, v1; \ - VPXOR v0, v6, v6; \ - VPXOR v1, v7, v7; \ - VPSHUFB c48, v6, v6; \ - VPSHUFB c48, v7, v7; \ - VPADDQ v6, v4, v4; \ - VPADDQ v7, v5, v5; \ - VPXOR v4, v2, v2; \ - VPXOR v5, v3, v3; \ - VPADDQ v2, v2, t0; \ - VPSRLQ $63, v2, v2; \ - VPXOR t0, v2, v2; \ - VPADDQ v3, v3, t0; \ - VPSRLQ $63, v3, v3; \ - VPXOR t0, v3, v3 - -// load msg: X12 = (i0, i1), X13 = (i2, i3), X14 = (i4, i5), X15 = (i6, i7) -// i0, i1, i2, i3, i4, i5, i6, i7 must not be 0 -#define LOAD_MSG_AVX(i0, i1, i2, i3, i4, i5, i6, i7) \ - VMOVQ_SI_X12(i0*8); \ - VMOVQ_SI_X13(i2*8); \ - VMOVQ_SI_X14(i4*8); \ - VMOVQ_SI_X15(i6*8); \ - VPINSRQ_1_SI_X12(i1*8); \ - VPINSRQ_1_SI_X13(i3*8); \ - VPINSRQ_1_SI_X14(i5*8); \ - VPINSRQ_1_SI_X15(i7*8) - -// load msg: X12 = (0, 2), X13 = (4, 6), X14 = (1, 3), X15 = (5, 7) -#define LOAD_MSG_AVX_0_2_4_6_1_3_5_7() \ - VMOVQ_SI_X12_0; \ - VMOVQ_SI_X13(4*8); \ - VMOVQ_SI_X14(1*8); \ - VMOVQ_SI_X15(5*8); \ - VPINSRQ_1_SI_X12(2*8); \ - VPINSRQ_1_SI_X13(6*8); \ - VPINSRQ_1_SI_X14(3*8); \ - VPINSRQ_1_SI_X15(7*8) - -// load msg: X12 = (1, 0), X13 = (11, 5), X14 = (12, 2), X15 = (7, 3) -#define LOAD_MSG_AVX_1_0_11_5_12_2_7_3() \ - VPSHUFD $0x4E, 0*8(SI), X12; \ - VMOVQ_SI_X13(11*8); \ - VMOVQ_SI_X14(12*8); \ - VMOVQ_SI_X15(7*8); \ - VPINSRQ_1_SI_X13(5*8); \ - VPINSRQ_1_SI_X14(2*8); \ - VPINSRQ_1_SI_X15(3*8) - -// load msg: X12 = (11, 12), X13 = (5, 15), X14 = (8, 0), X15 = (2, 13) -#define LOAD_MSG_AVX_11_12_5_15_8_0_2_13() \ - VMOVDQU 11*8(SI), X12; \ - VMOVQ_SI_X13(5*8); \ - VMOVQ_SI_X14(8*8); \ - VMOVQ_SI_X15(2*8); \ - VPINSRQ_1_SI_X13(15*8); \ - VPINSRQ_1_SI_X14_0; \ - VPINSRQ_1_SI_X15(13*8) - -// load msg: X12 = (2, 5), X13 = (4, 15), X14 = (6, 10), X15 = (0, 8) -#define LOAD_MSG_AVX_2_5_4_15_6_10_0_8() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X13(4*8); \ - VMOVQ_SI_X14(6*8); \ - VMOVQ_SI_X15_0; \ - VPINSRQ_1_SI_X12(5*8); \ - VPINSRQ_1_SI_X13(15*8); \ - VPINSRQ_1_SI_X14(10*8); \ - VPINSRQ_1_SI_X15(8*8) +DATA ·AVX2_c40<>+0(SB)/8, $0x0201000706050403 +DATA ·AVX2_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +DATA ·AVX2_c40<>+16(SB)/8, $0x0201000706050403 +DATA ·AVX2_c40<>+24(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·AVX2_c40<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (9, 5), X13 = (2, 10), X14 = (0, 7), X15 = (4, 15) -#define LOAD_MSG_AVX_9_5_2_10_0_7_4_15() \ - VMOVQ_SI_X12(9*8); \ - VMOVQ_SI_X13(2*8); \ - VMOVQ_SI_X14_0; \ - VMOVQ_SI_X15(4*8); \ - VPINSRQ_1_SI_X12(5*8); \ - VPINSRQ_1_SI_X13(10*8); \ - VPINSRQ_1_SI_X14(7*8); \ - VPINSRQ_1_SI_X15(15*8) +DATA ·AVX2_c48<>+0(SB)/8, $0x0100070605040302 +DATA ·AVX2_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +DATA ·AVX2_c48<>+16(SB)/8, $0x0100070605040302 +DATA ·AVX2_c48<>+24(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·AVX2_c48<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (2, 6), X13 = (0, 8), X14 = (12, 10), X15 = (11, 3) -#define LOAD_MSG_AVX_2_6_0_8_12_10_11_3() \ - VMOVQ_SI_X12(2*8); \ - VMOVQ_SI_X13_0; \ - VMOVQ_SI_X14(12*8); \ - VMOVQ_SI_X15(11*8); \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X13(8*8); \ - VPINSRQ_1_SI_X14(10*8); \ - VPINSRQ_1_SI_X15(3*8) +DATA ·AVX2_iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·AVX2_iv0<>+8(SB)/8, $0xbb67ae8584caa73b +DATA ·AVX2_iv0<>+16(SB)/8, $0x3c6ef372fe94f82b +DATA ·AVX2_iv0<>+24(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·AVX2_iv0<>(SB), RODATA|NOPTR, $32 -// load msg: X12 = (0, 6), X13 = (9, 8), X14 = (7, 3), X15 = (2, 11) -#define LOAD_MSG_AVX_0_6_9_8_7_3_2_11() \ - MOVQ 0*8(SI), X12; \ - VPSHUFD $0x4E, 8*8(SI), X13; \ - MOVQ 7*8(SI), X14; \ - MOVQ 2*8(SI), X15; \ - VPINSRQ_1_SI_X12(6*8); \ - VPINSRQ_1_SI_X14(3*8); \ - VPINSRQ_1_SI_X15(11*8) - -// load msg: X12 = (6, 14), X13 = (11, 0), X14 = (15, 9), X15 = (3, 8) -#define LOAD_MSG_AVX_6_14_11_0_15_9_3_8() \ - MOVQ 6*8(SI), X12; \ - MOVQ 11*8(SI), X13; \ - MOVQ 15*8(SI), X14; \ - MOVQ 3*8(SI), X15; \ - VPINSRQ_1_SI_X12(14*8); \ - VPINSRQ_1_SI_X13_0; \ - VPINSRQ_1_SI_X14(9*8); \ - VPINSRQ_1_SI_X15(8*8) - -// load msg: X12 = (5, 15), X13 = (8, 2), X14 = (0, 4), X15 = (6, 10) -#define LOAD_MSG_AVX_5_15_8_2_0_4_6_10() \ - MOVQ 5*8(SI), X12; \ - MOVQ 8*8(SI), X13; \ - MOVQ 0*8(SI), X14; \ - MOVQ 6*8(SI), X15; \ - VPINSRQ_1_SI_X12(15*8); \ - VPINSRQ_1_SI_X13(2*8); \ - VPINSRQ_1_SI_X14(4*8); \ - VPINSRQ_1_SI_X15(10*8) - -// load msg: X12 = (12, 13), X13 = (1, 10), X14 = (2, 7), X15 = (4, 5) -#define LOAD_MSG_AVX_12_13_1_10_2_7_4_5() \ - VMOVDQU 12*8(SI), X12; \ - MOVQ 1*8(SI), X13; \ - MOVQ 2*8(SI), X14; \ - VPINSRQ_1_SI_X13(10*8); \ - VPINSRQ_1_SI_X14(7*8); \ - VMOVDQU 4*8(SI), X15 - -// load msg: X12 = (15, 9), X13 = (3, 13), X14 = (11, 14), X15 = (12, 0) -#define LOAD_MSG_AVX_15_9_3_13_11_14_12_0() \ - MOVQ 15*8(SI), X12; \ - MOVQ 3*8(SI), X13; \ - MOVQ 11*8(SI), X14; \ - MOVQ 12*8(SI), X15; \ - VPINSRQ_1_SI_X12(9*8); \ - VPINSRQ_1_SI_X13(13*8); \ - VPINSRQ_1_SI_X14(14*8); \ - VPINSRQ_1_SI_X15_0 +DATA ·AVX2_iv1<>+0(SB)/8, $0x510e527fade682d1 +DATA ·AVX2_iv1<>+8(SB)/8, $0x9b05688c2b3e6c1f +DATA ·AVX2_iv1<>+16(SB)/8, $0x1f83d9abfb41bd6b +DATA ·AVX2_iv1<>+24(SB)/8, $0x5be0cd19137e2179 +GLOBL ·AVX2_iv1<>(SB), RODATA|NOPTR, $32 // func hashBlocksAVX(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksAVX(SB), 4, $288-48 // frame size = 272 + 16 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, R10 - ADDQ $15, R10 - ANDQ $~15, R10 - - VMOVDQU ·AVX_c40<>(SB), X0 - VMOVDQU ·AVX_c48<>(SB), X1 +// Requires: AVX, SSE2 +TEXT ·hashBlocksAVX(SB), NOSPLIT, $288-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, R10 + ADDQ $0x0f, R10 + ANDQ $-16, R10 + VMOVDQU ·AVX_c40<>+0(SB), X0 + VMOVDQU ·AVX_c48<>+0(SB), X1 VMOVDQA X0, X8 VMOVDQA X1, X9 - - VMOVDQU ·AVX_iv3<>(SB), X0 - VMOVDQA X0, 0(R10) - XORQ CX, 0(R10) // 0(R10) = ·AVX_iv3 ^ (CX || 0) - - VMOVDQU 0(AX), X10 + VMOVDQU ·AVX_iv3<>+0(SB), X0 + VMOVDQA X0, (R10) + XORQ CX, (R10) + VMOVDQU (AX), X10 VMOVDQU 16(AX), X11 VMOVDQU 32(AX), X2 VMOVDQU 48(AX), X3 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 + MOVQ (BX), R8 + MOVQ 8(BX), R9 loop: - ADDQ $128, R8 - CMPQ R8, $128 + ADDQ $0x80, R8 + CMPQ R8, $0x80 JGE noinc INCQ R9 noinc: - VMOVQ_R8_X15 - VPINSRQ_1_R9_X15 - + BYTE $0xc4 + BYTE $0x41 + BYTE $0xf9 + BYTE $0x6e + BYTE $0xf8 + BYTE $0xc4 + BYTE $0x43 + BYTE $0x81 + BYTE $0x22 + BYTE $0xf9 + BYTE $0x01 VMOVDQA X10, X0 VMOVDQA X11, X1 - VMOVDQU ·AVX_iv0<>(SB), X4 - VMOVDQU ·AVX_iv1<>(SB), X5 - VMOVDQU ·AVX_iv2<>(SB), X6 - + VMOVDQU ·AVX_iv0<>+0(SB), X4 + VMOVDQU ·AVX_iv1<>+0(SB), X5 + VMOVDQU ·AVX_iv2<>+0(SB), X6 VPXOR X15, X6, X6 - VMOVDQA 0(R10), X7 - - LOAD_MSG_AVX_0_2_4_6_1_3_5_7() + VMOVDQA (R10), X7 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x26 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x28 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x38 + BYTE $0x01 VMOVDQA X12, 16(R10) VMOVDQA X13, 32(R10) VMOVDQA X14, 48(R10) VMOVDQA X15, 64(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(8, 10, 12, 14, 9, 11, 13, 15) + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x78 + BYTE $0x01 VMOVDQA X12, 80(R10) VMOVDQA X13, 96(R10) VMOVDQA X14, 112(R10) VMOVDQA X15, 128(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(14, 4, 9, 13, 10, 8, 15, 6) + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x78 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x68 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x30 + BYTE $0x01 VMOVDQA X12, 144(R10) VMOVDQA X13, 160(R10) VMOVDQA X14, 176(R10) VMOVDQA X15, 192(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_1_0_11_5_12_2_7_3() + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPSHUFD $0x4e, (SI), X12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x38 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 VMOVDQA X12, 208(R10) VMOVDQA X13, 224(R10) VMOVDQA X14, 240(R10) VMOVDQA X15, 256(R10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_11_12_5_15_8_0_2_13() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(10, 3, 7, 9, 14, 6, 1, 4) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(7, 3, 13, 11, 9, 1, 12, 14) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_2_5_4_15_6_10_0_8() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_9_5_2_10_0_7_4_15() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(14, 11, 6, 3, 1, 12, 8, 13) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_2_6_0_8_12_10_11_3() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX(4, 7, 15, 1, 13, 5, 14, 9) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(12, 1, 14, 4, 5, 15, 13, 10) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_0_6_9_8_7_3_2_11() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(13, 7, 12, 3, 11, 14, 1, 9) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_5_15_8_2_0_4_6_10() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX_6_14_11_0_15_9_3_8() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_12_13_1_10_2_7_4_5() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - LOAD_MSG_AVX(10, 8, 7, 1, 2, 4, 6, 5) - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX() - LOAD_MSG_AVX_15_9_3_13_11_14_12_0() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, X12, X13, X14, X15, X15, X8, X9) - SHUFFLE_AVX_INV() - - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X15, X8, X9) - SHUFFLE_AVX() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X15, X8, X9) - SHUFFLE_AVX_INV() - - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X15, X8, X9) - SHUFFLE_AVX() - HALF_ROUND_AVX(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X15, X8, X9) - SHUFFLE_AVX_INV() - + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VMOVDQU 88(SI), X12 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x40 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x10 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x36 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x68 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x20 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x60 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x70 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x3e + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x40 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x48 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x36 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x20 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x78 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x30 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x08 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x40 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x58 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x60 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x68 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x2e + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x58 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x18 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x20 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x78 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x70 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x28 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x48 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x70 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x28 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x68 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ (SI), X12 + VPSHUFD $0x4e, 64(SI), X13 + MOVQ 56(SI), X14 + MOVQ 16(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x30 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x58 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x68 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x60 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x58 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x08 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x38 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x18 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x48 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ 40(SI), X12 + MOVQ 64(SI), X13 + MOVQ (SI), X14 + MOVQ 48(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x78 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x10 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x50 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + MOVQ 48(SI), X12 + MOVQ 88(SI), X13 + MOVQ 120(SI), X14 + MOVQ 24(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x2e + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x40 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VMOVDQU 96(SI), X12 + MOVQ 8(SI), X13 + MOVQ 16(SI), X14 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x50 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x38 + BYTE $0x01 + VMOVDQU 32(SI), X15 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x66 + BYTE $0x50 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x6e + BYTE $0x38 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x76 + BYTE $0x10 + BYTE $0xc5 + BYTE $0x7a + BYTE $0x7e + BYTE $0x7e + BYTE $0x30 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x40 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x08 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x20 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x7e + BYTE $0x28 + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + MOVQ 120(SI), X12 + MOVQ 24(SI), X13 + MOVQ 88(SI), X14 + MOVQ 96(SI), X15 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x99 + BYTE $0x22 + BYTE $0x66 + BYTE $0x48 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x91 + BYTE $0x22 + BYTE $0x6e + BYTE $0x68 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x89 + BYTE $0x22 + BYTE $0x76 + BYTE $0x70 + BYTE $0x01 + BYTE $0xc4 + BYTE $0x63 + BYTE $0x81 + BYTE $0x22 + BYTE $0x3e + BYTE $0x01 + VPADDQ X12, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X13, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ X14, X0, X0 + VPADDQ X2, X0, X0 + VPADDQ X15, X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VPADDQ 16(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 32(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 48(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 64(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPADDQ 80(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 96(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 112(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 128(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff + VPADDQ 144(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 160(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 176(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 192(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X6, X13 + VMOVDQA X2, X14 + VMOVDQA X4, X6 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x11 + BYTE $0x6c + BYTE $0xfd + VMOVDQA X5, X4 + VMOVDQA X6, X5 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xff + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x69 + BYTE $0x6d + BYTE $0xd7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xdf + VPADDQ 208(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 224(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFD $-79, X6, X6 + VPSHUFD $-79, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPSHUFB X8, X2, X2 + VPSHUFB X8, X3, X3 + VPADDQ 240(R10), X0, X0 + VPADDQ X2, X0, X0 + VPADDQ 256(R10), X1, X1 + VPADDQ X3, X1, X1 + VPXOR X0, X6, X6 + VPXOR X1, X7, X7 + VPSHUFB X9, X6, X6 + VPSHUFB X9, X7, X7 + VPADDQ X6, X4, X4 + VPADDQ X7, X5, X5 + VPXOR X4, X2, X2 + VPXOR X5, X3, X3 + VPADDQ X2, X2, X15 + VPSRLQ $0x3f, X2, X2 + VPXOR X15, X2, X2 + VPADDQ X3, X3, X15 + VPSRLQ $0x3f, X3, X3 + VPXOR X15, X3, X3 + VMOVDQA X2, X13 + VMOVDQA X4, X14 + BYTE $0xc5 + BYTE $0x69 + BYTE $0x6c + BYTE $0xfa + VMOVDQA X5, X4 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x61 + BYTE $0x6d + BYTE $0xd7 + VMOVDQA X14, X5 + BYTE $0xc5 + BYTE $0x61 + BYTE $0x6c + BYTE $0xfb + VMOVDQA X6, X14 + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x11 + BYTE $0x6d + BYTE $0xdf + BYTE $0xc5 + BYTE $0x41 + BYTE $0x6c + BYTE $0xff + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x49 + BYTE $0x6d + BYTE $0xf7 + BYTE $0xc4 + BYTE $0x41 + BYTE $0x09 + BYTE $0x6c + BYTE $0xfe + BYTE $0xc4 + BYTE $0xc1 + BYTE $0x41 + BYTE $0x6d + BYTE $0xff VMOVDQU 32(AX), X14 VMOVDQU 48(AX), X15 VPXOR X0, X10, X10 @@ -729,16 +4524,36 @@ noinc: VPXOR X7, X15, X3 VMOVDQU X2, 32(AX) VMOVDQU X3, 48(AX) + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + VMOVDQU X10, (AX) + VMOVDQU X11, 16(AX) + MOVQ R8, (BX) + MOVQ R9, 8(BX) + VZEROUPPER + RET - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop +DATA ·AVX_c40<>+0(SB)/8, $0x0201000706050403 +DATA ·AVX_c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·AVX_c40<>(SB), RODATA|NOPTR, $16 - VMOVDQU X10, 0(AX) - VMOVDQU X11, 16(AX) +DATA ·AVX_c48<>+0(SB)/8, $0x0100070605040302 +DATA ·AVX_c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·AVX_c48<>(SB), RODATA|NOPTR, $16 - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) - VZEROUPPER +DATA ·AVX_iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b +DATA ·AVX_iv3<>+8(SB)/8, $0x5be0cd19137e2179 +GLOBL ·AVX_iv3<>(SB), RODATA|NOPTR, $16 - RET +DATA ·AVX_iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·AVX_iv0<>+8(SB)/8, $0xbb67ae8584caa73b +GLOBL ·AVX_iv0<>(SB), RODATA|NOPTR, $16 + +DATA ·AVX_iv1<>+0(SB)/8, $0x3c6ef372fe94f82b +DATA ·AVX_iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·AVX_iv1<>(SB), RODATA|NOPTR, $16 + +DATA ·AVX_iv2<>+0(SB)/8, $0x510e527fade682d1 +DATA ·AVX_iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f +GLOBL ·AVX_iv2<>(SB), RODATA|NOPTR, $16 diff --git a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s index adfac00c15c9c..9a0ce2124462f 100644 --- a/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s +++ b/vendor/golang.org/x/crypto/blake2b/blake2b_amd64.s @@ -1,278 +1,1441 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run blake2b_amd64_asm.go -out ../../blake2b_amd64.s -pkg blake2b. DO NOT EDIT. //go:build amd64 && gc && !purego #include "textflag.h" -DATA ·iv0<>+0x00(SB)/8, $0x6a09e667f3bcc908 -DATA ·iv0<>+0x08(SB)/8, $0xbb67ae8584caa73b -GLOBL ·iv0<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv1<>+0x00(SB)/8, $0x3c6ef372fe94f82b -DATA ·iv1<>+0x08(SB)/8, $0xa54ff53a5f1d36f1 -GLOBL ·iv1<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv2<>+0x00(SB)/8, $0x510e527fade682d1 -DATA ·iv2<>+0x08(SB)/8, $0x9b05688c2b3e6c1f -GLOBL ·iv2<>(SB), (NOPTR+RODATA), $16 - -DATA ·iv3<>+0x00(SB)/8, $0x1f83d9abfb41bd6b -DATA ·iv3<>+0x08(SB)/8, $0x5be0cd19137e2179 -GLOBL ·iv3<>(SB), (NOPTR+RODATA), $16 - -DATA ·c40<>+0x00(SB)/8, $0x0201000706050403 -DATA ·c40<>+0x08(SB)/8, $0x0a09080f0e0d0c0b -GLOBL ·c40<>(SB), (NOPTR+RODATA), $16 - -DATA ·c48<>+0x00(SB)/8, $0x0100070605040302 -DATA ·c48<>+0x08(SB)/8, $0x09080f0e0d0c0b0a -GLOBL ·c48<>(SB), (NOPTR+RODATA), $16 - -#define SHUFFLE(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v6, t1; \ - PUNPCKLQDQ v6, t2; \ - PUNPCKHQDQ v7, v6; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ v7, t2; \ - MOVO t1, v7; \ - MOVO v2, t1; \ - PUNPCKHQDQ t2, v7; \ - PUNPCKLQDQ v3, t2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v3 - -#define SHUFFLE_INV(v2, v3, v4, v5, v6, v7, t1, t2) \ - MOVO v4, t1; \ - MOVO v5, v4; \ - MOVO t1, v5; \ - MOVO v2, t1; \ - PUNPCKLQDQ v2, t2; \ - PUNPCKHQDQ v3, v2; \ - PUNPCKHQDQ t2, v2; \ - PUNPCKLQDQ v3, t2; \ - MOVO t1, v3; \ - MOVO v6, t1; \ - PUNPCKHQDQ t2, v3; \ - PUNPCKLQDQ v7, t2; \ - PUNPCKHQDQ t2, v6; \ - PUNPCKLQDQ t1, t2; \ - PUNPCKHQDQ t2, v7 - -#define HALF_ROUND(v0, v1, v2, v3, v4, v5, v6, v7, m0, m1, m2, m3, t0, c40, c48) \ - PADDQ m0, v0; \ - PADDQ m1, v1; \ - PADDQ v2, v0; \ - PADDQ v3, v1; \ - PXOR v0, v6; \ - PXOR v1, v7; \ - PSHUFD $0xB1, v6, v6; \ - PSHUFD $0xB1, v7, v7; \ - PADDQ v6, v4; \ - PADDQ v7, v5; \ - PXOR v4, v2; \ - PXOR v5, v3; \ - PSHUFB c40, v2; \ - PSHUFB c40, v3; \ - PADDQ m2, v0; \ - PADDQ m3, v1; \ - PADDQ v2, v0; \ - PADDQ v3, v1; \ - PXOR v0, v6; \ - PXOR v1, v7; \ - PSHUFB c48, v6; \ - PSHUFB c48, v7; \ - PADDQ v6, v4; \ - PADDQ v7, v5; \ - PXOR v4, v2; \ - PXOR v5, v3; \ - MOVOU v2, t0; \ - PADDQ v2, t0; \ - PSRLQ $63, v2; \ - PXOR t0, v2; \ - MOVOU v3, t0; \ - PADDQ v3, t0; \ - PSRLQ $63, v3; \ - PXOR t0, v3 - -#define LOAD_MSG(m0, m1, m2, m3, src, i0, i1, i2, i3, i4, i5, i6, i7) \ - MOVQ i0*8(src), m0; \ - PINSRQ $1, i1*8(src), m0; \ - MOVQ i2*8(src), m1; \ - PINSRQ $1, i3*8(src), m1; \ - MOVQ i4*8(src), m2; \ - PINSRQ $1, i5*8(src), m2; \ - MOVQ i6*8(src), m3; \ - PINSRQ $1, i7*8(src), m3 - // func hashBlocksSSE4(h *[8]uint64, c *[2]uint64, flag uint64, blocks []byte) -TEXT ·hashBlocksSSE4(SB), 4, $288-48 // frame size = 272 + 16 byte alignment - MOVQ h+0(FP), AX - MOVQ c+8(FP), BX - MOVQ flag+16(FP), CX - MOVQ blocks_base+24(FP), SI - MOVQ blocks_len+32(FP), DI - - MOVQ SP, R10 - ADDQ $15, R10 - ANDQ $~15, R10 - - MOVOU ·iv3<>(SB), X0 - MOVO X0, 0(R10) - XORQ CX, 0(R10) // 0(R10) = ·iv3 ^ (CX || 0) - - MOVOU ·c40<>(SB), X13 - MOVOU ·c48<>(SB), X14 - - MOVOU 0(AX), X12 +// Requires: SSE2, SSE4.1, SSSE3 +TEXT ·hashBlocksSSE4(SB), NOSPLIT, $288-48 + MOVQ h+0(FP), AX + MOVQ c+8(FP), BX + MOVQ flag+16(FP), CX + MOVQ blocks_base+24(FP), SI + MOVQ blocks_len+32(FP), DI + MOVQ SP, R10 + ADDQ $0x0f, R10 + ANDQ $-16, R10 + MOVOU ·iv3<>+0(SB), X0 + MOVO X0, (R10) + XORQ CX, (R10) + MOVOU ·c40<>+0(SB), X13 + MOVOU ·c48<>+0(SB), X14 + MOVOU (AX), X12 MOVOU 16(AX), X15 - - MOVQ 0(BX), R8 - MOVQ 8(BX), R9 + MOVQ (BX), R8 + MOVQ 8(BX), R9 loop: - ADDQ $128, R8 - CMPQ R8, $128 + ADDQ $0x80, R8 + CMPQ R8, $0x80 JGE noinc INCQ R9 noinc: - MOVQ R8, X8 - PINSRQ $1, R9, X8 - - MOVO X12, X0 - MOVO X15, X1 - MOVOU 32(AX), X2 - MOVOU 48(AX), X3 - MOVOU ·iv0<>(SB), X4 - MOVOU ·iv1<>(SB), X5 - MOVOU ·iv2<>(SB), X6 - - PXOR X8, X6 - MOVO 0(R10), X7 - - LOAD_MSG(X8, X9, X10, X11, SI, 0, 2, 4, 6, 1, 3, 5, 7) - MOVO X8, 16(R10) - MOVO X9, 32(R10) - MOVO X10, 48(R10) - MOVO X11, 64(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 8, 10, 12, 14, 9, 11, 13, 15) - MOVO X8, 80(R10) - MOVO X9, 96(R10) - MOVO X10, 112(R10) - MOVO X11, 128(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 14, 4, 9, 13, 10, 8, 15, 6) - MOVO X8, 144(R10) - MOVO X9, 160(R10) - MOVO X10, 176(R10) - MOVO X11, 192(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 1, 0, 11, 5, 12, 2, 7, 3) - MOVO X8, 208(R10) - MOVO X9, 224(R10) - MOVO X10, 240(R10) - MOVO X11, 256(R10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 11, 12, 5, 15, 8, 0, 2, 13) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 10, 3, 7, 9, 14, 6, 1, 4) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 7, 3, 13, 11, 9, 1, 12, 14) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 2, 5, 4, 15, 6, 10, 0, 8) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 9, 5, 2, 10, 0, 7, 4, 15) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 14, 11, 6, 3, 1, 12, 8, 13) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 2, 6, 0, 8, 12, 10, 11, 3) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 4, 7, 15, 1, 13, 5, 14, 9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 12, 1, 14, 4, 5, 15, 13, 10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 0, 6, 9, 8, 7, 3, 2, 11) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 13, 7, 12, 3, 11, 14, 1, 9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 5, 15, 8, 2, 0, 4, 6, 10) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 6, 14, 11, 0, 15, 9, 3, 8) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 12, 13, 1, 10, 2, 7, 4, 5) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - LOAD_MSG(X8, X9, X10, X11, SI, 10, 8, 7, 1, 2, 4, 6, 5) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - LOAD_MSG(X8, X9, X10, X11, SI, 15, 9, 3, 13, 11, 14, 12, 0) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) - - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 16(R10), 32(R10), 48(R10), 64(R10), X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 80(R10), 96(R10), 112(R10), 128(R10), X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) + MOVQ R8, X8 + PINSRQ $0x01, R9, X8 + MOVO X12, X0 + MOVO X15, X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVOU ·iv0<>+0(SB), X4 + MOVOU ·iv1<>+0(SB), X5 + MOVOU ·iv2<>+0(SB), X6 + PXOR X8, X6 + MOVO (R10), X7 + MOVQ (SI), X8 + PINSRQ $0x01, 16(SI), X8 + MOVQ 32(SI), X9 + PINSRQ $0x01, 48(SI), X9 + MOVQ 8(SI), X10 + PINSRQ $0x01, 24(SI), X10 + MOVQ 40(SI), X11 + PINSRQ $0x01, 56(SI), X11 + MOVO X8, 16(R10) + MOVO X9, 32(R10) + MOVO X10, 48(R10) + MOVO X11, 64(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 64(SI), X8 + PINSRQ $0x01, 80(SI), X8 + MOVQ 96(SI), X9 + PINSRQ $0x01, 112(SI), X9 + MOVQ 72(SI), X10 + PINSRQ $0x01, 88(SI), X10 + MOVQ 104(SI), X11 + PINSRQ $0x01, 120(SI), X11 + MOVO X8, 80(R10) + MOVO X9, 96(R10) + MOVO X10, 112(R10) + MOVO X11, 128(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 112(SI), X8 + PINSRQ $0x01, 32(SI), X8 + MOVQ 72(SI), X9 + PINSRQ $0x01, 104(SI), X9 + MOVQ 80(SI), X10 + PINSRQ $0x01, 64(SI), X10 + MOVQ 120(SI), X11 + PINSRQ $0x01, 48(SI), X11 + MOVO X8, 144(R10) + MOVO X9, 160(R10) + MOVO X10, 176(R10) + MOVO X11, 192(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 8(SI), X8 + PINSRQ $0x01, (SI), X8 + MOVQ 88(SI), X9 + PINSRQ $0x01, 40(SI), X9 + MOVQ 96(SI), X10 + PINSRQ $0x01, 16(SI), X10 + MOVQ 56(SI), X11 + PINSRQ $0x01, 24(SI), X11 + MOVO X8, 208(R10) + MOVO X9, 224(R10) + MOVO X10, 240(R10) + MOVO X11, 256(R10) + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 88(SI), X8 + PINSRQ $0x01, 96(SI), X8 + MOVQ 40(SI), X9 + PINSRQ $0x01, 120(SI), X9 + MOVQ 64(SI), X10 + PINSRQ $0x01, (SI), X10 + MOVQ 16(SI), X11 + PINSRQ $0x01, 104(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 80(SI), X8 + PINSRQ $0x01, 24(SI), X8 + MOVQ 56(SI), X9 + PINSRQ $0x01, 72(SI), X9 + MOVQ 112(SI), X10 + PINSRQ $0x01, 48(SI), X10 + MOVQ 8(SI), X11 + PINSRQ $0x01, 32(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 56(SI), X8 + PINSRQ $0x01, 24(SI), X8 + MOVQ 104(SI), X9 + PINSRQ $0x01, 88(SI), X9 + MOVQ 72(SI), X10 + PINSRQ $0x01, 8(SI), X10 + MOVQ 96(SI), X11 + PINSRQ $0x01, 112(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 16(SI), X8 + PINSRQ $0x01, 40(SI), X8 + MOVQ 32(SI), X9 + PINSRQ $0x01, 120(SI), X9 + MOVQ 48(SI), X10 + PINSRQ $0x01, 80(SI), X10 + MOVQ (SI), X11 + PINSRQ $0x01, 64(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 72(SI), X8 + PINSRQ $0x01, 40(SI), X8 + MOVQ 16(SI), X9 + PINSRQ $0x01, 80(SI), X9 + MOVQ (SI), X10 + PINSRQ $0x01, 56(SI), X10 + MOVQ 32(SI), X11 + PINSRQ $0x01, 120(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 112(SI), X8 + PINSRQ $0x01, 88(SI), X8 + MOVQ 48(SI), X9 + PINSRQ $0x01, 24(SI), X9 + MOVQ 8(SI), X10 + PINSRQ $0x01, 96(SI), X10 + MOVQ 64(SI), X11 + PINSRQ $0x01, 104(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 16(SI), X8 + PINSRQ $0x01, 48(SI), X8 + MOVQ (SI), X9 + PINSRQ $0x01, 64(SI), X9 + MOVQ 96(SI), X10 + PINSRQ $0x01, 80(SI), X10 + MOVQ 88(SI), X11 + PINSRQ $0x01, 24(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 32(SI), X8 + PINSRQ $0x01, 56(SI), X8 + MOVQ 120(SI), X9 + PINSRQ $0x01, 8(SI), X9 + MOVQ 104(SI), X10 + PINSRQ $0x01, 40(SI), X10 + MOVQ 112(SI), X11 + PINSRQ $0x01, 72(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 96(SI), X8 + PINSRQ $0x01, 8(SI), X8 + MOVQ 112(SI), X9 + PINSRQ $0x01, 32(SI), X9 + MOVQ 40(SI), X10 + PINSRQ $0x01, 120(SI), X10 + MOVQ 104(SI), X11 + PINSRQ $0x01, 80(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ (SI), X8 + PINSRQ $0x01, 48(SI), X8 + MOVQ 72(SI), X9 + PINSRQ $0x01, 64(SI), X9 + MOVQ 56(SI), X10 + PINSRQ $0x01, 24(SI), X10 + MOVQ 16(SI), X11 + PINSRQ $0x01, 88(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 104(SI), X8 + PINSRQ $0x01, 56(SI), X8 + MOVQ 96(SI), X9 + PINSRQ $0x01, 24(SI), X9 + MOVQ 88(SI), X10 + PINSRQ $0x01, 112(SI), X10 + MOVQ 8(SI), X11 + PINSRQ $0x01, 72(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 40(SI), X8 + PINSRQ $0x01, 120(SI), X8 + MOVQ 64(SI), X9 + PINSRQ $0x01, 16(SI), X9 + MOVQ (SI), X10 + PINSRQ $0x01, 32(SI), X10 + MOVQ 48(SI), X11 + PINSRQ $0x01, 80(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 48(SI), X8 + PINSRQ $0x01, 112(SI), X8 + MOVQ 88(SI), X9 + PINSRQ $0x01, (SI), X9 + MOVQ 120(SI), X10 + PINSRQ $0x01, 72(SI), X10 + MOVQ 24(SI), X11 + PINSRQ $0x01, 64(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 96(SI), X8 + PINSRQ $0x01, 104(SI), X8 + MOVQ 8(SI), X9 + PINSRQ $0x01, 80(SI), X9 + MOVQ 16(SI), X10 + PINSRQ $0x01, 56(SI), X10 + MOVQ 32(SI), X11 + PINSRQ $0x01, 40(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVQ 80(SI), X8 + PINSRQ $0x01, 64(SI), X8 + MOVQ 56(SI), X9 + PINSRQ $0x01, 8(SI), X9 + MOVQ 16(SI), X10 + PINSRQ $0x01, 32(SI), X10 + MOVQ 48(SI), X11 + PINSRQ $0x01, 40(SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + MOVQ 120(SI), X8 + PINSRQ $0x01, 72(SI), X8 + MOVQ 24(SI), X9 + PINSRQ $0x01, 104(SI), X9 + MOVQ 88(SI), X10 + PINSRQ $0x01, 112(SI), X10 + MOVQ 96(SI), X11 + PINSRQ $0x01, (SI), X11 + PADDQ X8, X0 + PADDQ X9, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ X10, X0 + PADDQ X11, X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + PADDQ 16(R10), X0 + PADDQ 32(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 48(R10), X0 + PADDQ 64(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + PADDQ 80(R10), X0 + PADDQ 96(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 112(R10), X0 + PADDQ 128(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + PADDQ 144(R10), X0 + PADDQ 160(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 176(R10), X0 + PADDQ 192(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X6, X8 + PUNPCKLQDQ X6, X9 + PUNPCKHQDQ X7, X6 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X7, X9 + MOVO X8, X7 + MOVO X2, X8 + PUNPCKHQDQ X9, X7 + PUNPCKLQDQ X3, X9 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X3 + PADDQ 208(R10), X0 + PADDQ 224(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFD $0xb1, X6, X6 + PSHUFD $0xb1, X7, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + PSHUFB X13, X2 + PSHUFB X13, X3 + PADDQ 240(R10), X0 + PADDQ 256(R10), X1 + PADDQ X2, X0 + PADDQ X3, X1 + PXOR X0, X6 + PXOR X1, X7 + PSHUFB X14, X6 + PSHUFB X14, X7 + PADDQ X6, X4 + PADDQ X7, X5 + PXOR X4, X2 + PXOR X5, X3 + MOVOU X2, X11 + PADDQ X2, X11 + PSRLQ $0x3f, X2 + PXOR X11, X2 + MOVOU X3, X11 + PADDQ X3, X11 + PSRLQ $0x3f, X3 + PXOR X11, X3 + MOVO X4, X8 + MOVO X5, X4 + MOVO X8, X5 + MOVO X2, X8 + PUNPCKLQDQ X2, X9 + PUNPCKHQDQ X3, X2 + PUNPCKHQDQ X9, X2 + PUNPCKLQDQ X3, X9 + MOVO X8, X3 + MOVO X6, X8 + PUNPCKHQDQ X9, X3 + PUNPCKLQDQ X7, X9 + PUNPCKHQDQ X9, X6 + PUNPCKLQDQ X8, X9 + PUNPCKHQDQ X9, X7 + MOVOU 32(AX), X10 + MOVOU 48(AX), X11 + PXOR X0, X12 + PXOR X1, X15 + PXOR X2, X10 + PXOR X3, X11 + PXOR X4, X12 + PXOR X5, X15 + PXOR X6, X10 + PXOR X7, X11 + MOVOU X10, 32(AX) + MOVOU X11, 48(AX) + LEAQ 128(SI), SI + SUBQ $0x80, DI + JNE loop + MOVOU X12, (AX) + MOVOU X15, 16(AX) + MOVQ R8, (BX) + MOVQ R9, 8(BX) + RET - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 144(R10), 160(R10), 176(R10), 192(R10), X11, X13, X14) - SHUFFLE(X2, X3, X4, X5, X6, X7, X8, X9) - HALF_ROUND(X0, X1, X2, X3, X4, X5, X6, X7, 208(R10), 224(R10), 240(R10), 256(R10), X11, X13, X14) - SHUFFLE_INV(X2, X3, X4, X5, X6, X7, X8, X9) +DATA ·iv3<>+0(SB)/8, $0x1f83d9abfb41bd6b +DATA ·iv3<>+8(SB)/8, $0x5be0cd19137e2179 +GLOBL ·iv3<>(SB), RODATA|NOPTR, $16 - MOVOU 32(AX), X10 - MOVOU 48(AX), X11 - PXOR X0, X12 - PXOR X1, X15 - PXOR X2, X10 - PXOR X3, X11 - PXOR X4, X12 - PXOR X5, X15 - PXOR X6, X10 - PXOR X7, X11 - MOVOU X10, 32(AX) - MOVOU X11, 48(AX) +DATA ·c40<>+0(SB)/8, $0x0201000706050403 +DATA ·c40<>+8(SB)/8, $0x0a09080f0e0d0c0b +GLOBL ·c40<>(SB), RODATA|NOPTR, $16 - LEAQ 128(SI), SI - SUBQ $128, DI - JNE loop +DATA ·c48<>+0(SB)/8, $0x0100070605040302 +DATA ·c48<>+8(SB)/8, $0x09080f0e0d0c0b0a +GLOBL ·c48<>(SB), RODATA|NOPTR, $16 - MOVOU X12, 0(AX) - MOVOU X15, 16(AX) +DATA ·iv0<>+0(SB)/8, $0x6a09e667f3bcc908 +DATA ·iv0<>+8(SB)/8, $0xbb67ae8584caa73b +GLOBL ·iv0<>(SB), RODATA|NOPTR, $16 - MOVQ R8, 0(BX) - MOVQ R9, 8(BX) +DATA ·iv1<>+0(SB)/8, $0x3c6ef372fe94f82b +DATA ·iv1<>+8(SB)/8, $0xa54ff53a5f1d36f1 +GLOBL ·iv1<>(SB), RODATA|NOPTR, $16 - RET +DATA ·iv2<>+0(SB)/8, $0x510e527fade682d1 +DATA ·iv2<>+8(SB)/8, $0x9b05688c2b3e6c1f +GLOBL ·iv2<>(SB), RODATA|NOPTR, $16 diff --git a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s index 731d2ac6dbc11..fd5ee845f9faa 100644 --- a/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s +++ b/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s @@ -1,2715 +1,9762 @@ -// Copyright 2016 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare. +// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT. //go:build gc && !purego #include "textflag.h" -// General register allocation -#define oup DI -#define inp SI -#define inl BX -#define adp CX // free to reuse, after we hash the additional data -#define keyp R8 // free to reuse, when we copy the key to stack -#define itr2 R9 // general iterator -#define itr1 CX // general iterator -#define acc0 R10 -#define acc1 R11 -#define acc2 R12 -#define t0 R13 -#define t1 R14 -#define t2 R15 -#define t3 R8 -// Register and stack allocation for the SSE code -#define rStore (0*16)(BP) -#define sStore (1*16)(BP) -#define state1Store (2*16)(BP) -#define state2Store (3*16)(BP) -#define tmpStore (4*16)(BP) -#define ctr0Store (5*16)(BP) -#define ctr1Store (6*16)(BP) -#define ctr2Store (7*16)(BP) -#define ctr3Store (8*16)(BP) -#define A0 X0 -#define A1 X1 -#define A2 X2 -#define B0 X3 -#define B1 X4 -#define B2 X5 -#define C0 X6 -#define C1 X7 -#define C2 X8 -#define D0 X9 -#define D1 X10 -#define D2 X11 -#define T0 X12 -#define T1 X13 -#define T2 X14 -#define T3 X15 -#define A3 T0 -#define B3 T1 -#define C3 T2 -#define D3 T3 -// Register and stack allocation for the AVX2 code -#define rsStoreAVX2 (0*32)(BP) -#define state1StoreAVX2 (1*32)(BP) -#define state2StoreAVX2 (2*32)(BP) -#define ctr0StoreAVX2 (3*32)(BP) -#define ctr1StoreAVX2 (4*32)(BP) -#define ctr2StoreAVX2 (5*32)(BP) -#define ctr3StoreAVX2 (6*32)(BP) -#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack -#define AA0 Y0 -#define AA1 Y5 -#define AA2 Y6 -#define AA3 Y7 -#define BB0 Y14 -#define BB1 Y9 -#define BB2 Y10 -#define BB3 Y11 -#define CC0 Y12 -#define CC1 Y13 -#define CC2 Y8 -#define CC3 Y15 -#define DD0 Y4 -#define DD1 Y1 -#define DD2 Y2 -#define DD3 Y3 -#define TT0 DD3 -#define TT1 AA3 -#define TT2 BB3 -#define TT3 CC3 -// ChaCha20 constants -DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574 -DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574 -// <<< 16 with PSHUFB -DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 -DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302 -DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A -// <<< 8 with PSHUFB -DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 -DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003 -DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B - -DATA ·avx2InitMask<>+0x00(SB)/8, $0x0 -DATA ·avx2InitMask<>+0x08(SB)/8, $0x0 -DATA ·avx2InitMask<>+0x10(SB)/8, $0x1 -DATA ·avx2InitMask<>+0x18(SB)/8, $0x0 - -DATA ·avx2IncMask<>+0x00(SB)/8, $0x2 -DATA ·avx2IncMask<>+0x08(SB)/8, $0x0 -DATA ·avx2IncMask<>+0x10(SB)/8, $0x2 -DATA ·avx2IncMask<>+0x18(SB)/8, $0x0 -// Poly1305 key clamp -DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF -DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC -DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF -DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF - -DATA ·sseIncMask<>+0x00(SB)/8, $0x1 -DATA ·sseIncMask<>+0x08(SB)/8, $0x0 -// To load/store the last < 16 bytes in a buffer -DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff -DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff -DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff -DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff -DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff -DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff - -GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32 -GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32 -GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32 -GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16 -GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 -// No PALIGNR in Go ASM yet (but VPALIGNR is present). -#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 -#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 -#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5 -#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13 -#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6 -#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7 -#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8 -#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14 -#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9 -#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10 -#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11 -#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15 -#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3 -#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4 -#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5 -#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13 -#define shiftC0Right shiftC0Left -#define shiftC1Right shiftC1Left -#define shiftC2Right shiftC2Left -#define shiftC3Right shiftC3Left -#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9 -#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 -#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 -#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 - -// Some macros - -// ROL rotates the uint32s in register R left by N bits, using temporary T. -#define ROL(N, R, T) \ - MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R - -// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. -#ifdef GOAMD64_v2 -#define ROL16(R, T) PSHUFB ·rol16<>(SB), R -#else -#define ROL16(R, T) ROL(16, R, T) -#endif - -// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. -#ifdef GOAMD64_v2 -#define ROL8(R, T) PSHUFB ·rol8<>(SB), R -#else -#define ROL8(R, T) ROL(8, R, T) -#endif - -#define chachaQR(A, B, C, D, T) \ - PADDD B, A; PXOR A, D; ROL16(D, T) \ - PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ - PADDD B, A; PXOR A, D; ROL8(D, T) \ - PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B - -#define chachaQR_AVX2(A, B, C, D, T) \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ - VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ - VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B - -#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 -#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2 -#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX -#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3 -#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2 - -#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2 -#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3 -#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3 - -#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage -#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage -// ---------------------------------------------------------------------------- + +// func polyHashADInternal<>() TEXT polyHashADInternal<>(SB), NOSPLIT, $0 - // adp points to beginning of additional data - // itr2 holds ad length - XORQ acc0, acc0 - XORQ acc1, acc1 - XORQ acc2, acc2 - CMPQ itr2, $13 - JNE hashADLoop - -openFastTLSAD: - // Special treatment for the TLS case of 13 bytes - MOVQ (adp), acc0 - MOVQ 5(adp), acc1 - SHRQ $24, acc1 - MOVQ $1, acc2 - polyMul + // Hack: Must declare #define macros inside of a function due to Avo constraints + // ROL rotates the uint32s in register R left by N bits, using temporary T. + #define ROL(N, R, T) \ + MOVO R, T; \ + PSLLL $(N), T; \ + PSRLL $(32-(N)), R; \ + PXOR T, R + + // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. + #ifdef GOAMD64_v2 + #define ROL8(R, T) PSHUFB ·rol8<>(SB), R + #else + #define ROL8(R, T) ROL(8, R, T) + #endif + + // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. + #ifdef GOAMD64_v2 + #define ROL16(R, T) PSHUFB ·rol16<>(SB), R + #else + #define ROL16(R, T) ROL(16, R, T) + #endif + XORQ R10, R10 + XORQ R11, R11 + XORQ R12, R12 + CMPQ R9, $0x0d + JNE hashADLoop + MOVQ (CX), R10 + MOVQ 5(CX), R11 + SHRQ $0x18, R11 + MOVQ $0x00000001, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 RET hashADLoop: // Hash in 16 byte chunks - CMPQ itr2, $16 - JB hashADTail - polyAdd(0(adp)) - LEAQ (1*16)(adp), adp - SUBQ $16, itr2 - polyMul - JMP hashADLoop + CMPQ R9, $0x10 + JB hashADTail + ADDQ (CX), R10 + ADCQ 8(CX), R11 + ADCQ $0x01, R12 + LEAQ 16(CX), CX + SUBQ $0x10, R9 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + JMP hashADLoop hashADTail: - CMPQ itr2, $0 + CMPQ R9, $0x00 JE hashADDone // Hash last < 16 byte tail - XORQ t0, t0 - XORQ t1, t1 - XORQ t2, t2 - ADDQ itr2, adp + XORQ R13, R13 + XORQ R14, R14 + XORQ R15, R15 + ADDQ R9, CX hashADTailLoop: - SHLQ $8, t0, t1 - SHLQ $8, t0 - MOVB -1(adp), t2 - XORQ t2, t0 - DECQ adp - DECQ itr2 - JNE hashADTailLoop - -hashADTailFinish: - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul - - // Finished AD + SHLQ $0x08, R13, R14 + SHLQ $0x08, R13 + MOVB -1(CX), R15 + XORQ R15, R13 + DECQ CX + DECQ R9 + JNE hashADTailLoop + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + hashADDone: RET -// ---------------------------------------------------------------------------- -// func chacha20Poly1305Open(dst, key, src, ad []byte) bool -TEXT ·chacha20Poly1305Open(SB), 0, $288-97 +// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Open(SB), $288-97 // For aligned stack access MOVQ SP, BP - ADDQ $32, BP + ADDQ $0x20, BP ANDQ $-32, BP - MOVQ dst+0(FP), oup - MOVQ key+24(FP), keyp - MOVQ src+48(FP), inp - MOVQ src_len+56(FP), inl - MOVQ ad+72(FP), adp + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX // Check for AVX2 support - CMPB ·useAVX2(SB), $1 + CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Open_AVX2 // Special optimization, for very short buffers - CMPQ inl, $128 - JBE openSSE128 // About 16% faster + CMPQ BX, $0x80 + JBE openSSE128 // For long buffers, prepare the poly key first - MOVOU ·chacha20Constants<>(SB), A0 - MOVOU (1*16)(keyp), B0 - MOVOU (2*16)(keyp), C0 - MOVOU (3*16)(keyp), D0 - MOVO D0, T1 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X9, X13 // Store state on stack for future use - MOVO B0, state1Store - MOVO C0, state2Store - MOVO D0, ctr3Store - MOVQ $10, itr2 + MOVO X3, 32(BP) + MOVO X6, 48(BP) + MOVO X9, 128(BP) + MOVQ $0x0000000a, R9 openSSEPreparePolyKey: - chachaQR(A0, B0, C0, D0, T0) - shiftB0Left; shiftC0Left; shiftD0Left - chachaQR(A0, B0, C0, D0, T0) - shiftB0Right; shiftC0Right; shiftD0Right - DECQ itr2 - JNE openSSEPreparePolyKey + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + DECQ R9 + JNE openSSEPreparePolyKey // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL 32(BP), X3 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVO A0, rStore; MOVO B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVO X0, (BP) + MOVO X3, 16(BP) // Hash AAD - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSEMainLoop: - CMPQ inl, $256 + CMPQ BX, $0x00000100 JB openSSEMainLoopDone // Load state, increment counter blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) - // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 - MOVQ $4, itr1 - MOVQ inp, itr2 + // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash + // 2 blocks, and for the remaining 4 only 1 block - for a total of 16 + MOVQ $0x00000004, CX + MOVQ SI, R9 openSSEInternalLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyAdd(0(itr2)) - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - LEAQ (2*8)(itr2), itr2 - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - polyMulStage3 - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr1 - JGE openSSEInternalLoop - - polyAdd(0(itr2)) - polyMul - LEAQ (2*8)(itr2), itr2 - - CMPQ itr1, $-6 - JG openSSEInternalLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + LEAQ 16(R9), R9 + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ CX + JGE openSSEInternalLoop + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + CMPQ CX, $-6 + JG openSSEInternalLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 // Load - xor - store - MOVO D3, tmpStore - MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) - MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) - MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) - MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) - MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) - MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) - MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) - MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) - MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) - MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) - MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) - MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) - MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) - MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) - LEAQ 256(inp), inp - LEAQ 256(oup), oup - SUBQ $256, inl + MOVO X15, 64(BP) + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU X0, (DI) + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU X3, 16(DI) + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU X6, 32(DI) + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X9, 48(DI) + MOVOU 64(SI), X9 + PXOR X9, X1 + MOVOU X1, 64(DI) + MOVOU 80(SI), X9 + PXOR X9, X4 + MOVOU X4, 80(DI) + MOVOU 96(SI), X9 + PXOR X9, X7 + MOVOU X7, 96(DI) + MOVOU 112(SI), X9 + PXOR X9, X10 + MOVOU X10, 112(DI) + MOVOU 128(SI), X9 + PXOR X9, X2 + MOVOU X2, 128(DI) + MOVOU 144(SI), X9 + PXOR X9, X5 + MOVOU X5, 144(DI) + MOVOU 160(SI), X9 + PXOR X9, X8 + MOVOU X8, 160(DI) + MOVOU 176(SI), X9 + PXOR X9, X11 + MOVOU X11, 176(DI) + MOVOU 192(SI), X9 + PXOR X9, X12 + MOVOU X12, 192(DI) + MOVOU 208(SI), X9 + PXOR X9, X13 + MOVOU X13, 208(DI) + MOVOU 224(SI), X9 + PXOR X9, X14 + MOVOU X14, 224(DI) + MOVOU 240(SI), X9 + PXOR 64(BP), X9 + MOVOU X9, 240(DI) + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX JMP openSSEMainLoop openSSEMainLoopDone: // Handle the various tail sizes efficiently - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize - CMPQ inl, $64 + CMPQ BX, $0x40 JBE openSSETail64 - CMPQ inl, $128 + CMPQ BX, $0x80 JBE openSSETail128 - CMPQ inl, $192 + CMPQ BX, $0xc0 JBE openSSETail192 JMP openSSETail256 openSSEFinalize: // Hash in the PT, AAD lengths - ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 - polyMul + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Final reduce - MOVQ acc0, t0 - MOVQ acc1, t1 - MOVQ acc2, t2 - SUBQ $-5, acc0 - SBBQ $-1, acc1 - SBBQ $3, acc2 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - CMOVQCS t2, acc2 + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 // Add in the "s" part of the key - ADDQ 0+sStore, acc0 - ADCQ 8+sStore, acc1 + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 // Finally, constant time compare to the tag at the end of the message XORQ AX, AX - MOVQ $1, DX - XORQ (0*8)(inp), acc0 - XORQ (1*8)(inp), acc1 - ORQ acc1, acc0 + MOVQ $0x00000001, DX + XORQ (SI), R10 + XORQ 8(SI), R11 + ORQ R11, R10 CMOVQEQ DX, AX // Return true iff tags are equal MOVB AX, ret+96(FP) RET -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 129 bytes openSSE128: - // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 - MOVQ $10, itr2 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X3, X13 + MOVO X6, X14 + MOVO X10, X15 + MOVQ $0x0000000a, R9 openSSE128InnerCipherLoop: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftB1Left; shiftB2Left - shiftC0Left; shiftC1Left; shiftC2Left - shiftD0Left; shiftD1Left; shiftD2Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftB1Right; shiftB2Right - shiftC0Right; shiftC1Right; shiftC2Right - shiftD0Right; shiftD1Right; shiftD2Right - DECQ itr2 - JNE openSSE128InnerCipherLoop + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ R9 + JNE openSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 - PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL X13, X3 + PADDL X13, X4 + PADDL X13, X5 + PADDL X14, X7 + PADDL X14, X8 + PADDL X15, X10 + PADDL ·sseIncMask<>+0(SB), X15 + PADDL X15, X11 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVOU A0, rStore; MOVOU B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVOU X0, (BP) + MOVOU X3, 16(BP) // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSE128Open: - CMPQ inl, $16 + CMPQ BX, $0x10 JB openSSETail16 - SUBQ $16, inl + SUBQ $0x10, BX // Load for hashing - polyAdd(0(inp)) + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 // Load for decryption - MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - polyMul + MOVOU (SI), X12 + PXOR X12, X1 + MOVOU X1, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Shift the stream "left" - MOVO B1, A1 - MOVO C1, B1 - MOVO D1, C1 - MOVO A2, D1 - MOVO B2, A2 - MOVO C2, B2 - MOVO D2, C2 + MOVO X4, X1 + MOVO X7, X4 + MOVO X10, X7 + MOVO X2, X10 + MOVO X5, X2 + MOVO X8, X5 + MOVO X11, X8 JMP openSSE128Open openSSETail16: - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize // We can safely load the CT from the end, because it is padded with the MAC - MOVQ inl, itr2 - SHLQ $4, itr2 - LEAQ ·andMask<>(SB), t0 - MOVOU (inp), T0 - ADDQ inl, inp - PAND -16(t0)(itr2*1), T0 - MOVO T0, 0+tmpStore - MOVQ T0, t0 - MOVQ 8+tmpStore, t1 - PXOR A1, T0 + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVOU (SI), X12 + ADDQ BX, SI + PAND -16(R13)(R9*1), X12 + MOVO X12, 64(BP) + MOVQ X12, R13 + MOVQ 72(BP), R14 + PXOR X1, X12 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes openSSETail16Store: - MOVQ T0, t3 - MOVB t3, (oup) - PSRLDQ $1, T0 - INCQ oup - DECQ inl + MOVQ X12, R8 + MOVB R8, (DI) + PSRLDQ $0x01, X12 + INCQ DI + DECQ BX JNE openSSETail16Store - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 JMP openSSEFinalize -// ---------------------------------------------------------------------------- -// Special optimization for the last 64 bytes of ciphertext openSSETail64: - // Need to decrypt up to 64 bytes - prepare single block - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - XORQ itr2, itr2 - MOVQ inl, itr1 - CMPQ itr1, $16 - JB openSSETail64LoopB + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + XORQ R9, R9 + MOVQ BX, CX + CMPQ CX, $0x10 + JB openSSETail64LoopB openSSETail64LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul - SUBQ $16, itr1 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX openSSETail64LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0) - shiftB0Left; shiftC0Left; shiftD0Left - chachaQR(A0, B0, C0, D0, T0) - shiftB0Right; shiftC0Right; shiftD0Right - - CMPQ itr1, $16 - JAE openSSETail64LoopA - - CMPQ itr2, $160 - JNE openSSETail64LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + CMPQ CX, $0x10 + JAE openSSETail64LoopA + CMPQ R9, $0xa0 + JNE openSSETail64LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL 32(BP), X3 + PADDL 48(BP), X6 + PADDL 80(BP), X9 openSSETail64DecLoop: - CMPQ inl, $16 + CMPQ BX, $0x10 JB openSSETail64DecLoopDone - SUBQ $16, inl - MOVOU (inp), T0 - PXOR T0, A0 - MOVOU A0, (oup) - LEAQ 16(inp), inp - LEAQ 16(oup), oup - MOVO B0, A0 - MOVO C0, B0 - MOVO D0, C0 + SUBQ $0x10, BX + MOVOU (SI), X12 + PXOR X12, X0 + MOVOU X0, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + MOVO X3, X0 + MOVO X6, X3 + MOVO X9, X6 JMP openSSETail64DecLoop openSSETail64DecLoopDone: - MOVO A0, A1 + MOVO X0, X1 JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext openSSETail128: - // Need to decrypt up to 128 bytes - prepare two blocks - MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store - XORQ itr2, itr2 - MOVQ inl, itr1 - ANDQ $-16, itr1 + MOVO ·chacha20Constants<>+0(SB), X1 + MOVO 32(BP), X4 + MOVO 48(BP), X7 + MOVO 128(BP), X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 80(BP) + MOVO X1, X0 + MOVO X4, X3 + MOVO X7, X6 + MOVO X10, X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 96(BP) + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX openSSETail128LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSETail128LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - - CMPQ itr2, itr1 - JB openSSETail128LoopA - - CMPQ itr2, $160 - JNE openSSETail128LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B0; PADDL state1Store, B1 - PADDL state2Store, C0; PADDL state2Store, C1 - PADDL ctr1Store, D0; PADDL ctr0Store, D1 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) - - SUBQ $64, inl - LEAQ 64(inp), inp - LEAQ 64(oup), oup - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 192 bytes of ciphertext + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + CMPQ R9, CX + JB openSSETail128LoopA + CMPQ R9, $0xa0 + JNE openSSETail128LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 96(BP), X9 + PADDL 80(BP), X10 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, (DI) + MOVOU X4, 16(DI) + MOVOU X7, 32(DI) + MOVOU X10, 48(DI) + SUBQ $0x40, BX + LEAQ 64(SI), SI + LEAQ 64(DI), DI + JMP openSSETail64DecLoop + openSSETail192: - // Need to decrypt up to 192 bytes - prepare three blocks - MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store - MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store - - MOVQ inl, itr1 - MOVQ $160, itr2 - CMPQ itr1, $160 - CMOVQGT itr2, itr1 - ANDQ $-16, itr1 - XORQ itr2, itr2 + MOVO ·chacha20Constants<>+0(SB), X2 + MOVO 32(BP), X5 + MOVO 48(BP), X8 + MOVO 128(BP), X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X11, 80(BP) + MOVO X2, X1 + MOVO X5, X4 + MOVO X8, X7 + MOVO X11, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) + MOVO X1, X0 + MOVO X4, X3 + MOVO X7, X6 + MOVO X10, X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 112(BP) + MOVQ BX, CX + MOVQ $0x000000a0, R9 + CMPQ CX, $0xa0 + CMOVQGT R9, CX + ANDQ $-16, CX + XORQ R9, R9 openSSLTail192LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSLTail192LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - shiftB2Left; shiftC2Left; shiftD2Left - - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - shiftB2Right; shiftC2Right; shiftD2Right - - CMPQ itr2, itr1 - JB openSSLTail192LoopA - - CMPQ itr2, $160 - JNE openSSLTail192LoopB - - CMPQ inl, $176 - JB openSSLTail192Store - - polyAdd(160(inp)) - polyMul - - CMPQ inl, $192 - JB openSSLTail192Store - - polyAdd(176(inp)) - polyMul + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + CMPQ R9, CX + JB openSSLTail192LoopA + CMPQ R9, $0xa0 + JNE openSSLTail192LoopB + CMPQ BX, $0xb0 + JB openSSLTail192Store + ADDQ 160(SI), R10 + ADCQ 168(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + CMPQ BX, $0xc0 + JB openSSLTail192Store + ADDQ 176(SI), R10 + ADCQ 184(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSLTail192Store: - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 - PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 - PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 - MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) - - MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - - SUBQ $128, inl - LEAQ 128(inp), inp - LEAQ 128(oup), oup - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 32(BP), X5 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 48(BP), X8 + PADDL 112(BP), X9 + PADDL 96(BP), X10 + PADDL 80(BP), X11 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X2 + PXOR X13, X5 + PXOR X14, X8 + PXOR X15, X11 + MOVOU X2, (DI) + MOVOU X5, 16(DI) + MOVOU X8, 32(DI) + MOVOU X11, 48(DI) + MOVOU 64(SI), X12 + MOVOU 80(SI), X13 + MOVOU 96(SI), X14 + MOVOU 112(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + SUBQ $0x80, BX + LEAQ 128(SI), SI + LEAQ 128(DI), DI + JMP openSSETail64DecLoop + openSSETail256: - // Need to decrypt up to 256 bytes - prepare four blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store - XORQ itr2, itr2 + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) + XORQ R9, R9 openSSETail256Loop: - // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication - polyAdd(0(inp)(itr2*1)) - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulStage3 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - ADDQ $2*8, itr2 - CMPQ itr2, $160 - JB openSSETail256Loop - MOVQ inl, itr1 - ANDQ $-16, itr1 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + ADDQ $0x10, R9 + CMPQ R9, $0xa0 + JB openSSETail256Loop + MOVQ BX, CX + ANDQ $-16, CX openSSETail256HashLoop: - polyAdd(0(inp)(itr2*1)) - polyMul - ADDQ $2*8, itr2 - CMPQ itr2, itr1 - JB openSSETail256HashLoop + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ $0x10, R9 + CMPQ R9, CX + JB openSSETail256HashLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 - MOVO D3, tmpStore + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 + MOVO X15, 64(BP) // Load - xor - store - MOVOU (0*16)(inp), D3; PXOR D3, A0 - MOVOU (1*16)(inp), D3; PXOR D3, B0 - MOVOU (2*16)(inp), D3; PXOR D3, C0 - MOVOU (3*16)(inp), D3; PXOR D3, D0 - MOVOU A0, (0*16)(oup) - MOVOU B0, (1*16)(oup) - MOVOU C0, (2*16)(oup) - MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) - LEAQ 192(inp), inp - LEAQ 192(oup), oup - SUBQ $192, inl - MOVO A3, A0 - MOVO B3, B0 - MOVO C3, C0 - MOVO tmpStore, D0 - - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// ------------------------- AVX2 Code ---------------------------------------- + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVOU 128(SI), X0 + MOVOU 144(SI), X3 + MOVOU 160(SI), X6 + MOVOU 176(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 128(DI) + MOVOU X5, 144(DI) + MOVOU X8, 160(DI) + MOVOU X11, 176(DI) + LEAQ 192(SI), SI + LEAQ 192(DI), DI + SUBQ $0xc0, BX + MOVO X12, X0 + MOVO X13, X3 + MOVO X14, X6 + MOVO 64(BP), X9 + JMP openSSETail64DecLoop + chacha20Poly1305Open_AVX2: VZEROUPPER - VMOVDQU ·chacha20Constants<>(SB), AA0 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 - BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD ·avx2InitMask<>(SB), DD0, DD0 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x70 + BYTE $0x10 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x20 + BYTE $0xc4 + BYTE $0xc2 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x30 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimization, for very short buffers - CMPQ inl, $192 + CMPQ BX, $0xc0 JBE openAVX2192 - CMPQ inl, $320 + CMPQ BX, $0x00000140 JBE openAVX2320 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA BB0, state1StoreAVX2 - VMOVDQA CC0, state2StoreAVX2 - VMOVDQA DD0, ctr3StoreAVX2 - MOVQ $10, itr2 + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, 64(BP) + VMOVDQA Y4, 192(BP) + MOVQ $0x0000000a, R9 openAVX2PreparePolyKey: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - DECQ itr2 - JNE openAVX2PreparePolyKey - - VPADDD ·chacha20Constants<>(SB), AA0, AA0 - VPADDD state1StoreAVX2, BB0, BB0 - VPADDD state2StoreAVX2, CC0, CC0 - VPADDD ctr3StoreAVX2, DD0, DD0 - - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ R9 + JNE openAVX2PreparePolyKey + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD 32(BP), Y14, Y14 + VPADDD 64(BP), Y12, Y12 + VPADDD 192(BP), Y4, Y4 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for the first 64 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 // Hash AD + first 64 bytes - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX openAVX2InitialHash64: - polyAdd(0(inp)(itr1*1)) - polyMulAVX2 - ADDQ $16, itr1 - CMPQ itr1, $64 - JNE openAVX2InitialHash64 + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ $0x10, CX + CMPQ CX, $0x40 + JNE openAVX2InitialHash64 // Decrypt the first 64 bytes - VPXOR (0*32)(inp), AA0, AA0 - VPXOR (1*32)(inp), BB0, BB0 - VMOVDQU AA0, (0*32)(oup) - VMOVDQU BB0, (1*32)(oup) - LEAQ (2*32)(inp), inp - LEAQ (2*32)(oup), oup - SUBQ $64, inl + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VMOVDQU Y0, (DI) + VMOVDQU Y14, 32(DI) + LEAQ 64(SI), SI + LEAQ 64(DI), DI + SUBQ $0x40, BX openAVX2MainLoop: - CMPQ inl, $512 + CMPQ BX, $0x00000200 JB openAVX2MainLoopDone // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - XORQ itr1, itr1 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX openAVX2InternalLoop: - // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications - // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext - polyAdd(0*8(inp)(itr1*1)) - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage1_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulStage2_AVX2 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyMulStage3_AVX2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - polyAdd(2*8(inp)(itr1*1)) - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage1_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage2_AVX2 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage3_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulReduceStage - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(4*8(inp)(itr1*1)) - LEAQ (6*8)(itr1), itr1 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage1_AVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - polyMulStage2_AVX2 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage3_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - CMPQ itr1, $480 + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(SI)(CX*1), R10 + ADCQ 24(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(SI)(CX*1), R10 + ADCQ 40(SI)(CX*1), R11 + ADCQ $0x01, R12 + LEAQ 48(CX), CX + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + CMPQ CX, $0x000001e0 JNE openAVX2InternalLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - polyAdd(480(inp)) - polyMulAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + ADDQ 480(SI), R10 + ADCQ 488(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) // and here - polyAdd(496(inp)) - polyMulAVX2 - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 - VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) - LEAQ (32*16)(inp), inp - LEAQ (32*16)(oup), oup - SUBQ $(32*16), inl + ADDQ 496(SI), R10 + ADCQ 504(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + LEAQ 512(DI), DI + SUBQ $0x00000200, BX JMP openAVX2MainLoop openAVX2MainLoopDone: // Handle the various tail sizes efficiently - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize - CMPQ inl, $128 + CMPQ BX, $0x80 JBE openAVX2Tail128 - CMPQ inl, $256 + CMPQ BX, $0x00000100 JBE openAVX2Tail256 - CMPQ inl, $384 + CMPQ BX, $0x00000180 JBE openAVX2Tail384 JMP openAVX2Tail512 -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 193 bytes openAVX2192: - // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks - VMOVDQA AA0, AA1 - VMOVDQA BB0, BB1 - VMOVDQA CC0, CC1 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2 - VMOVDQA BB0, BB2 - VMOVDQA CC0, CC2 - VMOVDQA DD0, DD2 - VMOVDQA DD1, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 openAVX2192InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 JNE openAVX2192InnerCipherLoop - VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 - VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 - VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 - VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 192 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 openAVX2ShortOpen: // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openAVX2ShortOpenLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB openAVX2ShortTail32 - SUBQ $32, inl + SUBQ $0x20, BX // Load for hashing - polyAdd(0*8(inp)) - polyMulAVX2 - polyAdd(2*8(inp)) - polyMulAVX2 + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(SI), R10 + ADCQ 24(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Load for decryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp - LEAQ (1*32)(oup), oup + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI // Shift stream left - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 - VMOVDQA AA1, DD0 - VMOVDQA BB1, AA1 - VMOVDQA CC1, BB1 - VMOVDQA DD1, CC1 - VMOVDQA AA2, DD1 - VMOVDQA BB2, AA2 + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 JMP openAVX2ShortOpenLoop openAVX2ShortTail32: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB openAVX2ShortDone - - SUBQ $16, inl + SUBQ $0x10, BX // Load for hashing - polyAdd(0*8(inp)) - polyMulAVX2 + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Load for decryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 openAVX2ShortDone: VZEROUPPER JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 321 bytes openAVX2320: - // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 openAVX2320InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ R9 JNE openAVX2320InnerCipherLoop - - VMOVDQA ·chacha20Constants<>(SB), TT0 - VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 - VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 - VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA ·avx2IncMask<>(SB), TT0 - VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD2, DD2 + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 // Clamp and store poly key - VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 320 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 - VPERM2I128 $0x02, AA2, BB2, CC1 - VPERM2I128 $0x02, CC2, DD2, DD1 - VPERM2I128 $0x13, AA2, BB2, AA2 - VPERM2I128 $0x13, CC2, DD2, BB2 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 JMP openAVX2ShortOpen -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext openAVX2Tail128: // Need to decrypt up to 128 bytes - prepare two blocks - VMOVDQA ·chacha20Constants<>(SB), AA1 - VMOVDQA state1StoreAVX2, BB1 - VMOVDQA state2StoreAVX2, CC1 - VMOVDQA ctr3StoreAVX2, DD1 - VPADDD ·avx2IncMask<>(SB), DD1, DD1 - VMOVDQA DD1, DD0 - - XORQ itr2, itr2 - MOVQ inl, itr1 - ANDQ $-16, itr1 - TESTQ itr1, itr1 - JE openAVX2Tail128LoopB + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 + VMOVDQA Y1, Y4 + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX + TESTQ CX, CX + JE openAVX2Tail128LoopB openAVX2Tail128LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMulAVX2 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openAVX2Tail128LoopB: - ADDQ $16, itr2 - chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD1, DD1, DD1 - CMPQ itr2, itr1 - JB openAVX2Tail128LoopA - CMPQ itr2, $160 - JNE openAVX2Tail128LoopB - - VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC1, CC1 - VPADDD DD0, DD1, DD1 - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + ADDQ $0x10, R9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX + JB openAVX2Tail128LoopA + CMPQ R9, $0xa0 + JNE openAVX2Tail128LoopB + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y13, Y13 + VPADDD Y4, Y1, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 openAVX2TailLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB openAVX2Tail - SUBQ $32, inl + SUBQ $0x20, BX // Load for decryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp - LEAQ (1*32)(oup), oup - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 JMP openAVX2TailLoop openAVX2Tail: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB openAVX2TailDone - SUBQ $16, inl + SUBQ $0x10, BX // Load for decryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 openAVX2TailDone: VZEROUPPER JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext openAVX2Tail256: - // Need to decrypt up to 256 bytes - prepare four blocks - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA DD0, TT1 - VMOVDQA DD1, TT2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 // Compute the number of iterations that will hash data - MOVQ inl, tmpStoreAVX2 - MOVQ inl, itr1 - SUBQ $128, itr1 - SHRQ $4, itr1 - MOVQ $10, itr2 - CMPQ itr1, $10 - CMOVQGT itr2, itr1 - MOVQ inp, inl - XORQ itr2, itr2 + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x80, CX + SHRQ $0x04, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 openAVX2Tail256LoopA: - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX - // Perform ChaCha rounds, while hashing the remaining input openAVX2Tail256LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - INCQ itr2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - CMPQ itr2, itr1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX JB openAVX2Tail256LoopA + CMPQ R9, $0x0a + JNE openAVX2Tail256LoopB + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX - CMPQ itr2, $10 - JNE openAVX2Tail256LoopB - - MOVQ inl, itr2 - SUBQ inp, inl - MOVQ inl, itr1 - MOVQ tmpStoreAVX2, inl - - // Hash the remainder of data (if any) openAVX2Tail256Hash: - ADDQ $16, itr1 - CMPQ itr1, inl - JGT openAVX2Tail256HashEnd - polyAdd (0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - JMP openAVX2Tail256Hash - -// Store 128 bytes safely, then go to store loop + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail256HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail256Hash + openAVX2Tail256HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2 - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - - VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 - VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) - LEAQ (4*32)(inp), inp - LEAQ (4*32)(oup), oup - SUBQ $4*32, inl - - JMP openAVX2TailLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 384 bytes of ciphertext + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y6 + VPERM2I128 $0x02, Y12, Y4, Y10 + VPERM2I128 $0x13, Y0, Y14, Y8 + VPERM2I128 $0x13, Y12, Y4, Y2 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR (SI), Y6, Y6 + VPXOR 32(SI), Y10, Y10 + VPXOR 64(SI), Y8, Y8 + VPXOR 96(SI), Y2, Y2 + VMOVDQU Y6, (DI) + VMOVDQU Y10, 32(DI) + VMOVDQU Y8, 64(DI) + VMOVDQU Y2, 96(DI) + LEAQ 128(SI), SI + LEAQ 128(DI), DI + SUBQ $0x80, BX + JMP openAVX2TailLoop + openAVX2Tail384: // Need to decrypt up to 384 bytes - prepare six blocks - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA DD0, ctr0StoreAVX2 - VMOVDQA DD1, ctr1StoreAVX2 - VMOVDQA DD2, ctr2StoreAVX2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) // Compute the number of iterations that will hash two blocks of data - MOVQ inl, tmpStoreAVX2 - MOVQ inl, itr1 - SUBQ $256, itr1 - SHRQ $4, itr1 - ADDQ $6, itr1 - MOVQ $10, itr2 - CMPQ itr1, $10 - CMOVQGT itr2, itr1 - MOVQ inp, inl - XORQ itr2, itr2 - - // Perform ChaCha rounds, while hashing the remaining input + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x00000100, CX + SHRQ $0x04, CX + ADDQ $0x06, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 + openAVX2Tail384LoopB: - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX openAVX2Tail384LoopA: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl - INCQ itr2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - - CMPQ itr2, itr1 - JB openAVX2Tail384LoopB - - CMPQ itr2, $10 - JNE openAVX2Tail384LoopA - - MOVQ inl, itr2 - SUBQ inp, inl - MOVQ inl, itr1 - MOVQ tmpStoreAVX2, inl + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + CMPQ R9, CX + JB openAVX2Tail384LoopB + CMPQ R9, $0x0a + JNE openAVX2Tail384LoopA + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX openAVX2Tail384Hash: - ADDQ $16, itr1 - CMPQ itr1, inl - JGT openAVX2Tail384HashEnd - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - JMP openAVX2Tail384Hash - -// Store 256 bytes safely, then go to store loop + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail384HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail384Hash + openAVX2Tail384HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 - VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 - VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 - VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - LEAQ (8*32)(inp), inp - LEAQ (8*32)(oup), oup - SUBQ $8*32, inl + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX JMP openAVX2TailLoop -// ---------------------------------------------------------------------------- -// Special optimization for the last 512 bytes of ciphertext openAVX2Tail512: - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - XORQ itr1, itr1 - MOVQ inp, itr2 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX + MOVQ SI, R9 openAVX2Tail512LoopB: - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ (2*8)(itr2), itr2 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 openAVX2Tail512LoopA: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyAdd(0*8(itr2)) - polyMulAVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(2*8(itr2)) - polyMulAVX2 - LEAQ (4*8)(itr2), itr2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - INCQ itr1 - CMPQ itr1, $4 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(R9), R10 + ADCQ 24(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(R9), R9 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + INCQ CX + CMPQ CX, $0x04 JLT openAVX2Tail512LoopB - - CMPQ itr1, $10 - JNE openAVX2Tail512LoopA - - MOVQ inl, itr1 - SUBQ $384, itr1 - ANDQ $-16, itr1 + CMPQ CX, $0x0a + JNE openAVX2Tail512LoopA + MOVQ BX, CX + SUBQ $0x00000180, CX + ANDQ $-16, CX openAVX2Tail512HashLoop: - TESTQ itr1, itr1 + TESTQ CX, CX JE openAVX2Tail512HashEnd - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - SUBQ $16, itr1 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + SUBQ $0x10, CX JMP openAVX2Tail512HashLoop openAVX2Tail512HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - - LEAQ (12*32)(inp), inp - LEAQ (12*32)(oup), oup - SUBQ $12*32, inl - - JMP openAVX2TailLoop - -// ---------------------------------------------------------------------------- -// ---------------------------------------------------------------------------- -// func chacha20Poly1305Seal(dst, key, src, ad []byte) -TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 - // For aligned stack access + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + LEAQ 384(SI), SI + LEAQ 384(DI), DI + SUBQ $0x00000180, BX + JMP openAVX2TailLoop + +DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 +GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 + +DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff +DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc +DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff +DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff +GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 + +DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001 +DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000 +GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16 + +DATA ·andMask<>+0(SB)/8, $0x00000000000000ff +DATA ·andMask<>+8(SB)/8, $0x0000000000000000 +DATA ·andMask<>+16(SB)/8, $0x000000000000ffff +DATA ·andMask<>+24(SB)/8, $0x0000000000000000 +DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+40(SB)/8, $0x0000000000000000 +DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+56(SB)/8, $0x0000000000000000 +DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+72(SB)/8, $0x0000000000000000 +DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+88(SB)/8, $0x0000000000000000 +DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+104(SB)/8, $0x0000000000000000 +DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+120(SB)/8, $0x0000000000000000 +DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+136(SB)/8, $0x00000000000000ff +DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+152(SB)/8, $0x000000000000ffff +DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+224(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff +GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 + +DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 +DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 + +DATA ·rol16<>+0(SB)/8, $0x0504070601000302 +DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a +DATA ·rol16<>+16(SB)/8, $0x0504070601000302 +DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a +GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 + +DATA ·rol8<>+0(SB)/8, $0x0605040702010003 +DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b +DATA ·rol8<>+16(SB)/8, $0x0605040702010003 +DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b +GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 + +DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 + +// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Seal(SB), $288-96 MOVQ SP, BP - ADDQ $32, BP + ADDQ $0x20, BP ANDQ $-32, BP - MOVQ dst+0(FP), oup - MOVQ key+24(FP), keyp - MOVQ src+48(FP), inp - MOVQ src_len+56(FP), inl - MOVQ ad+72(FP), adp - - CMPB ·useAVX2(SB), $1 + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX + CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Seal_AVX2 // Special optimization, for very short buffers - CMPQ inl, $128 - JBE sealSSE128 // About 15% faster + CMPQ BX, $0x80 + JBE sealSSE128 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration - MOVOU ·chacha20Constants<>(SB), A0 - MOVOU (1*16)(keyp), B0 - MOVOU (2*16)(keyp), C0 - MOVOU (3*16)(keyp), D0 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 // Store state on stack for future use - MOVO B0, state1Store - MOVO C0, state2Store + MOVO X3, 32(BP) + MOVO X6, 48(BP) // Load state, increment counter blocks - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store - MOVQ $10, itr2 + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) + MOVQ $0x0000000a, R9 sealSSEIntroLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr2 - JNE sealSSEIntroLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ R9 + JNE sealSSEIntroLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVO A0, rStore - MOVO B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVO X0, (BP) + MOVO X3, 16(BP) // Hash AAD - MOVQ ad_len+80(FP), itr2 - CALL polyHashADInternal<>(SB) - - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) - - MOVQ $128, itr1 - SUBQ $128, inl - LEAQ 128(inp), inp - - MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 - - CMPQ inl, $64 - JBE sealSSE128SealHash - - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 - MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) - - ADDQ $64, itr1 - SUBQ $64, inl - LEAQ 64(inp), inp - - MOVQ $2, itr1 - MOVQ $8, itr2 - - CMPQ inl, $64 - JBE sealSSETail64 - CMPQ inl, $128 - JBE sealSSETail128 - CMPQ inl, $192 - JBE sealSSETail192 + MOVQ ad_len+80(FP), R9 + CALL polyHashADInternal<>(SB) + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, (DI) + MOVOU X4, 16(DI) + MOVOU X7, 32(DI) + MOVOU X10, 48(DI) + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 64(DI) + MOVOU X5, 80(DI) + MOVOU X8, 96(DI) + MOVOU X11, 112(DI) + MOVQ $0x00000080, CX + SUBQ $0x80, BX + LEAQ 128(SI), SI + MOVO X12, X1 + MOVO X13, X4 + MOVO X14, X7 + MOVO X15, X10 + CMPQ BX, $0x40 + JBE sealSSE128SealHash + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X12 + PXOR X3, X13 + PXOR X6, X14 + PXOR X9, X15 + MOVOU X12, 128(DI) + MOVOU X13, 144(DI) + MOVOU X14, 160(DI) + MOVOU X15, 176(DI) + ADDQ $0x40, CX + SUBQ $0x40, BX + LEAQ 64(SI), SI + MOVQ $0x00000002, CX + MOVQ $0x00000008, R9 + CMPQ BX, $0x40 + JBE sealSSETail64 + CMPQ BX, $0x80 + JBE sealSSETail128 + CMPQ BX, $0xc0 + JBE sealSSETail192 sealSSEMainLoop: // Load state, increment counter blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) sealSSEInnerLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyAdd(0(oup)) - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - LEAQ (2*8)(oup), oup - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - polyMulStage3 - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr2 - JGE sealSSEInnerLoop - polyAdd(0(oup)) - polyMul - LEAQ (2*8)(oup), oup - DECQ itr1 - JG sealSSEInnerLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + LEAQ 16(DI), DI + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ R9 + JGE sealSSEInnerLoop + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + DECQ CX + JG sealSSEInnerLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 - MOVO D3, tmpStore + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 + MOVO X15, 64(BP) // Load - xor - store - MOVOU (0*16)(inp), D3; PXOR D3, A0 - MOVOU (1*16)(inp), D3; PXOR D3, B0 - MOVOU (2*16)(inp), D3; PXOR D3, C0 - MOVOU (3*16)(inp), D3; PXOR D3, D0 - MOVOU A0, (0*16)(oup) - MOVOU B0, (1*16)(oup) - MOVOU C0, (2*16)(oup) - MOVOU D0, (3*16)(oup) - MOVO tmpStore, D3 - - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) - ADDQ $192, inp - MOVQ $192, itr1 - SUBQ $192, inl - MOVO A3, A1 - MOVO B3, B1 - MOVO C3, C1 - MOVO D3, D1 - CMPQ inl, $64 + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVO 64(BP), X15 + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVOU 128(SI), X0 + MOVOU 144(SI), X3 + MOVOU 160(SI), X6 + MOVOU 176(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 128(DI) + MOVOU X5, 144(DI) + MOVOU X8, 160(DI) + MOVOU X11, 176(DI) + ADDQ $0xc0, SI + MOVQ $0x000000c0, CX + SUBQ $0xc0, BX + MOVO X12, X1 + MOVO X13, X4 + MOVO X14, X7 + MOVO X15, X10 + CMPQ BX, $0x40 JBE sealSSE128SealHash - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 - MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) - LEAQ 64(inp), inp - SUBQ $64, inl - MOVQ $6, itr1 - MOVQ $4, itr2 - CMPQ inl, $192 + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X12 + PXOR X3, X13 + PXOR X6, X14 + PXOR X9, X15 + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + LEAQ 64(SI), SI + SUBQ $0x40, BX + MOVQ $0x00000006, CX + MOVQ $0x00000004, R9 + CMPQ BX, $0xc0 JG sealSSEMainLoop - - MOVQ inl, itr1 - TESTQ inl, inl + MOVQ BX, CX + TESTQ BX, BX JE sealSSE128SealHash - MOVQ $6, itr1 - CMPQ inl, $64 + MOVQ $0x00000006, CX + CMPQ BX, $0x40 JBE sealSSETail64 - CMPQ inl, $128 + CMPQ BX, $0x80 JBE sealSSETail128 JMP sealSSETail192 -// ---------------------------------------------------------------------------- -// Special optimization for the last 64 bytes of plaintext sealSSETail64: - // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A1 - MOVO state1Store, B1 - MOVO state2Store, C1 - MOVO ctr3Store, D1 - PADDL ·sseIncMask<>(SB), D1 - MOVO D1, ctr0Store + MOVO ·chacha20Constants<>+0(SB), X1 + MOVO 32(BP), X4 + MOVO 48(BP), X7 + MOVO 128(BP), X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 80(BP) sealSSETail64LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail64LoopB: - chachaQR(A1, B1, C1, D1, T1) - shiftB1Left; shiftC1Left; shiftD1Left - chachaQR(A1, B1, C1, D1, T1) - shiftB1Right; shiftC1Right; shiftD1Right - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - - DECQ itr1 - JG sealSSETail64LoopA - - DECQ itr2 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x0c, X13 + PSRLL $0x14, X4 + PXOR X13, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x07, X13 + PSRLL $0x19, X4 + PXOR X13, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x0c, X13 + PSRLL $0x14, X4 + PXOR X13, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x07, X13 + PSRLL $0x19, X4 + PXOR X13, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + DECQ CX + JG sealSSETail64LoopA + DECQ R9 JGE sealSSETail64LoopB - PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B1 - PADDL state2Store, C1 - PADDL ctr0Store, D1 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X4 + PADDL 48(BP), X7 + PADDL 80(BP), X10 + JMP sealSSE128Seal - JMP sealSSE128Seal - -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of plaintext sealSSETail128: - // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) sealSSETail128LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail128LoopB: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - - DECQ itr1 - JG sealSSETail128LoopA - - DECQ itr2 - JGE sealSSETail128LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B0; PADDL state1Store, B1 - PADDL state2Store, C0; PADDL state2Store, C1 - PADDL ctr0Store, D0; PADDL ctr1Store, D1 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 - MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) - - MOVQ $64, itr1 - LEAQ 64(inp), inp - SUBQ $64, inl - - JMP sealSSE128SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 192 bytes of plaintext + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + DECQ CX + JG sealSSETail128LoopA + DECQ R9 + JGE sealSSETail128LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 80(BP), X9 + PADDL 96(BP), X10 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X0 + PXOR X13, X3 + PXOR X14, X6 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVQ $0x00000040, CX + LEAQ 64(SI), SI + SUBQ $0x40, BX + JMP sealSSE128SealHash + sealSSETail192: - // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X11, 112(BP) sealSSETail192LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail192LoopB: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - shiftB2Left; shiftC2Left; shiftD2Left - - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - shiftB2Right; shiftC2Right; shiftD2Right - - DECQ itr1 - JG sealSSETail192LoopA - - DECQ itr2 - JGE sealSSETail192LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 - PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 - PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 - MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - - MOVO A2, A1 - MOVO B2, B1 - MOVO C2, C1 - MOVO D2, D1 - MOVQ $128, itr1 - LEAQ 128(inp), inp - SUBQ $128, inl - - JMP sealSSE128SealHash - -// ---------------------------------------------------------------------------- -// Special seal optimization for buffers smaller than 129 bytes + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ CX + JG sealSSETail192LoopA + DECQ R9 + JGE sealSSETail192LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 32(BP), X5 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 48(BP), X8 + PADDL 80(BP), X9 + PADDL 96(BP), X10 + PADDL 112(BP), X11 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X0 + PXOR X13, X3 + PXOR X14, X6 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVOU 64(SI), X12 + MOVOU 80(SI), X13 + MOVOU 96(SI), X14 + MOVOU 112(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVO X2, X1 + MOVO X5, X4 + MOVO X8, X7 + MOVO X11, X10 + MOVQ $0x00000080, CX + LEAQ 128(SI), SI + SUBQ $0x80, BX + JMP sealSSE128SealHash + sealSSE128: - // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 - MOVQ $10, itr2 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X3, X13 + MOVO X6, X14 + MOVO X10, X15 + MOVQ $0x0000000a, R9 sealSSE128InnerCipherLoop: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftB1Left; shiftB2Left - shiftC0Left; shiftC1Left; shiftC2Left - shiftD0Left; shiftD1Left; shiftD2Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftB1Right; shiftB2Right - shiftC0Right; shiftC1Right; shiftC2Right - shiftD0Right; shiftD1Right; shiftD2Right - DECQ itr2 - JNE sealSSE128InnerCipherLoop + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ R9 + JNE sealSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 - PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 - PAND ·polyClampMask<>(SB), A0 - MOVOU A0, rStore - MOVOU B0, sStore + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL X13, X3 + PADDL X13, X4 + PADDL X13, X5 + PADDL X14, X7 + PADDL X14, X8 + PADDL X15, X10 + PADDL ·sseIncMask<>+0(SB), X15 + PADDL X15, X11 + PAND ·polyClampMask<>+0(SB), X0 + MOVOU X0, (BP) + MOVOU X3, 16(BP) // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX sealSSE128SealHash: - // itr1 holds the number of bytes encrypted but not yet hashed - CMPQ itr1, $16 - JB sealSSE128Seal - polyAdd(0(oup)) - polyMul - - SUBQ $16, itr1 - ADDQ $16, oup - - JMP sealSSE128SealHash + CMPQ CX, $0x10 + JB sealSSE128Seal + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX + ADDQ $0x10, DI + JMP sealSSE128SealHash sealSSE128Seal: - CMPQ inl, $16 + CMPQ BX, $0x10 JB sealSSETail - SUBQ $16, inl + SUBQ $0x10, BX // Load for decryption - MOVOU (inp), T0 - PXOR T0, A1 - MOVOU A1, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup + MOVOU (SI), X12 + PXOR X12, X1 + MOVOU X1, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI // Extract for hashing - MOVQ A1, t0 - PSRLDQ $8, A1 - MOVQ A1, t1 - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Shift the stream "left" - MOVO B1, A1 - MOVO C1, B1 - MOVO D1, C1 - MOVO A2, D1 - MOVO B2, A2 - MOVO C2, B2 - MOVO D2, C2 + MOVO X4, X1 + MOVO X7, X4 + MOVO X10, X7 + MOVO X2, X10 + MOVO X5, X2 + MOVO X8, X5 + MOVO X11, X8 JMP sealSSE128Seal sealSSETail: - TESTQ inl, inl + TESTQ BX, BX JE sealSSEFinalize // We can only load the PT one byte at a time to avoid read after end of buffer - MOVQ inl, itr2 - SHLQ $4, itr2 - LEAQ ·andMask<>(SB), t0 - MOVQ inl, itr1 - LEAQ -1(inp)(inl*1), inp - XORQ t2, t2 - XORQ t3, t3 + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVQ BX, CX + LEAQ -1(SI)(BX*1), SI + XORQ R15, R15 + XORQ R8, R8 XORQ AX, AX sealSSETailLoadLoop: - SHLQ $8, t2, t3 - SHLQ $8, t2 - MOVB (inp), AX - XORQ AX, t2 - LEAQ -1(inp), inp - DECQ itr1 + SHLQ $0x08, R15, R8 + SHLQ $0x08, R15 + MOVB (SI), AX + XORQ AX, R15 + LEAQ -1(SI), SI + DECQ CX JNE sealSSETailLoadLoop - MOVQ t2, 0+tmpStore - MOVQ t3, 8+tmpStore - PXOR 0+tmpStore, A1 - MOVOU A1, (oup) - MOVOU -16(t0)(itr2*1), T0 - PAND T0, A1 - MOVQ A1, t0 - PSRLDQ $8, A1 - MOVQ A1, t1 - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul - - ADDQ inl, oup + MOVQ R15, 64(BP) + MOVQ R8, 72(BP) + PXOR 64(BP), X1 + MOVOU X1, (DI) + MOVOU -16(R13)(R9*1), X12 + PAND X12, X1 + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ BX, DI sealSSEFinalize: // Hash in the buffer lengths - ADDQ ad_len+80(FP), acc0 - ADCQ src_len+56(FP), acc1 - ADCQ $1, acc2 - polyMul + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Final reduce - MOVQ acc0, t0 - MOVQ acc1, t1 - MOVQ acc2, t2 - SUBQ $-5, acc0 - SBBQ $-1, acc1 - SBBQ $3, acc2 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - CMOVQCS t2, acc2 + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 // Add in the "s" part of the key - ADDQ 0+sStore, acc0 - ADCQ 8+sStore, acc1 + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 // Finally store the tag at the end of the message - MOVQ acc0, (0*8)(oup) - MOVQ acc1, (1*8)(oup) + MOVQ R10, (DI) + MOVQ R11, 8(DI) RET -// ---------------------------------------------------------------------------- -// ------------------------- AVX2 Code ---------------------------------------- chacha20Poly1305Seal_AVX2: VZEROUPPER - VMOVDQU ·chacha20Constants<>(SB), AA0 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 - BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD ·avx2InitMask<>(SB), DD0, DD0 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x70 + BYTE $0x10 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x20 + BYTE $0xc4 + BYTE $0xc2 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x30 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimizations, for very short buffers - CMPQ inl, $192 - JBE seal192AVX2 // 33% faster - CMPQ inl, $320 - JBE seal320AVX2 // 17% faster + CMPQ BX, $0x000000c0 + JBE seal192AVX2 + CMPQ BX, $0x00000140 + JBE seal320AVX2 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 - VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 - VMOVDQA DD3, ctr3StoreAVX2 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA Y12, 64(BP) + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, 96(BP) + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y1, 128(BP) + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + MOVQ $0x0000000a, R9 sealAVX2IntroLoop: - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 - VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 - VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 - VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 - VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 - DECQ itr2 - JNE sealAVX2IntroLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - - VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 - VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key - VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y3, Y3, Y3 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ R9 + JNE sealAVX2IntroLoop + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPERM2I128 $0x02, Y0, Y14, Y4 + VPERM2I128 $0x13, Y0, Y14, Y0 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), DD0, DD0 - VMOVDQA DD0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y4, Y4 + VMOVDQA Y4, (BP) // Hash AD - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) // Can store at least 320 bytes - VPXOR (0*32)(inp), AA0, AA0 - VPXOR (1*32)(inp), CC0, CC0 - VMOVDQU AA0, (0*32)(oup) - VMOVDQU CC0, (1*32)(oup) - - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 - VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 - VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) - - MOVQ $320, itr1 - SUBQ $320, inl - LEAQ 320(inp), inp - - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 - CMPQ inl, $128 + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y12, Y12 + VMOVDQU Y0, (DI) + VMOVDQU Y12, 32(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 64(SI), Y0, Y0 + VPXOR 96(SI), Y14, Y14 + VPXOR 128(SI), Y12, Y12 + VPXOR 160(SI), Y4, Y4 + VMOVDQU Y0, 64(DI) + VMOVDQU Y14, 96(DI) + VMOVDQU Y12, 128(DI) + VMOVDQU Y4, 160(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 192(SI), Y0, Y0 + VPXOR 224(SI), Y14, Y14 + VPXOR 256(SI), Y12, Y12 + VPXOR 288(SI), Y4, Y4 + VMOVDQU Y0, 192(DI) + VMOVDQU Y14, 224(DI) + VMOVDQU Y12, 256(DI) + VMOVDQU Y4, 288(DI) + MOVQ $0x00000140, CX + SUBQ $0x00000140, BX + LEAQ 320(SI), SI + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, Y15, Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, Y15, Y3, Y4 + CMPQ BX, $0x80 JBE sealAVX2SealHash - - VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 - VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup) - SUBQ $128, inl - LEAQ 128(inp), inp - - MOVQ $8, itr1 - MOVQ $2, itr2 - - CMPQ inl, $128 - JBE sealAVX2Tail128 - CMPQ inl, $256 - JBE sealAVX2Tail256 - CMPQ inl, $384 - JBE sealAVX2Tail384 - CMPQ inl, $512 - JBE sealAVX2Tail512 + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VPXOR 64(SI), Y12, Y12 + VPXOR 96(SI), Y4, Y4 + VMOVDQU Y0, 320(DI) + VMOVDQU Y14, 352(DI) + VMOVDQU Y12, 384(DI) + VMOVDQU Y4, 416(DI) + SUBQ $0x80, BX + LEAQ 128(SI), SI + MOVQ $0x00000008, CX + MOVQ $0x00000002, R9 + CMPQ BX, $0x80 + JBE sealAVX2Tail128 + CMPQ BX, $0x00000100 + JBE sealAVX2Tail256 + CMPQ BX, $0x00000180 + JBE sealAVX2Tail384 + CMPQ BX, $0x00000200 + JBE sealAVX2Tail512 // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 - VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 - VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 - VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 - VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - - SUBQ $16, oup // Adjust the pointer - MOVQ $9, itr1 - JMP sealAVX2InternalLoopStart + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y3, Y3, Y3 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + SUBQ $0x10, DI + MOVQ $0x00000009, CX + JMP sealAVX2InternalLoopStart sealAVX2MainLoop: - // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - MOVQ $10, itr1 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + MOVQ $0x0000000a, CX sealAVX2InternalLoop: - polyAdd(0*8(oup)) - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage1_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulStage2_AVX2 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyMulStage3_AVX2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 sealAVX2InternalLoopStart: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - polyAdd(2*8(oup)) - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage1_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage2_AVX2 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage3_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulReduceStage - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(4*8(oup)) - LEAQ (6*8)(oup), oup - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage1_AVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - polyMulStage2_AVX2 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage3_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - DECQ itr1 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(DI), R10 + ADCQ 40(DI), R11 + ADCQ $0x01, R12 + LEAQ 48(DI), DI + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ CX JNE sealAVX2InternalLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - polyAdd(0*8(oup)) - polyMulAVX2 - LEAQ (4*8)(oup), oup - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) // and here - polyAdd(-2*8(oup)) - polyMulAVX2 - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 - VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) - LEAQ (32*16)(inp), inp - SUBQ $(32*16), inl - CMPQ inl, $512 + ADDQ -16(DI), R10 + ADCQ -8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + SUBQ $0x00000200, BX + CMPQ BX, $0x00000200 JG sealAVX2MainLoop // Tail can only hash 480 bytes - polyAdd(0*8(oup)) - polyMulAVX2 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ 32(oup), oup - - MOVQ $10, itr1 - MOVQ $0, itr2 - CMPQ inl, $128 - JBE sealAVX2Tail128 - CMPQ inl, $256 - JBE sealAVX2Tail256 - CMPQ inl, $384 - JBE sealAVX2Tail384 - JMP sealAVX2Tail512 - -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 193 bytes + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + MOVQ $0x0000000a, CX + MOVQ $0x00000000, R9 + CMPQ BX, $0x80 + JBE sealAVX2Tail128 + CMPQ BX, $0x00000100 + JBE sealAVX2Tail256 + CMPQ BX, $0x00000180 + JBE sealAVX2Tail384 + JMP sealAVX2Tail512 + seal192AVX2: - // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks - VMOVDQA AA0, AA1 - VMOVDQA BB0, BB1 - VMOVDQA CC0, CC1 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2 - VMOVDQA BB0, BB2 - VMOVDQA CC0, CC2 - VMOVDQA DD0, DD2 - VMOVDQA DD1, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 sealAVX2192InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 JNE sealAVX2192InnerCipherLoop - VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 - VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 - VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 - VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 192 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 sealAVX2ShortSeal: // Hash aad - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX sealAVX2SealHash: // itr1 holds the number of bytes encrypted but not yet hashed - CMPQ itr1, $16 - JB sealAVX2ShortSealLoop - polyAdd(0(oup)) - polyMul - SUBQ $16, itr1 - ADDQ $16, oup - JMP sealAVX2SealHash + CMPQ CX, $0x10 + JB sealAVX2ShortSealLoop + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX + ADDQ $0x10, DI + JMP sealAVX2SealHash sealAVX2ShortSealLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB sealAVX2ShortTail32 - SUBQ $32, inl + SUBQ $0x20, BX // Load for encryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI // Now can hash - polyAdd(0*8(oup)) - polyMulAVX2 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ (1*32)(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI // Shift stream left - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 - VMOVDQA AA1, DD0 - VMOVDQA BB1, AA1 - VMOVDQA CC1, BB1 - VMOVDQA DD1, CC1 - VMOVDQA AA2, DD1 - VMOVDQA BB2, AA2 + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 JMP sealAVX2ShortSealLoop sealAVX2ShortTail32: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB sealAVX2ShortDone - - SUBQ $16, inl + SUBQ $0x10, BX // Load for encryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI // Hash - polyAdd(0*8(oup)) - polyMulAVX2 - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 sealAVX2ShortDone: VZEROUPPER JMP sealSSETail -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 321 bytes seal320AVX2: - // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 sealAVX2320InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ R9 JNE sealAVX2320InnerCipherLoop - - VMOVDQA ·chacha20Constants<>(SB), TT0 - VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 - VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 - VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA ·avx2IncMask<>(SB), TT0 - VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD2, DD2 + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 // Clamp and store poly key - VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 320 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 - VPERM2I128 $0x02, AA2, BB2, CC1 - VPERM2I128 $0x02, CC2, DD2, DD1 - VPERM2I128 $0x13, AA2, BB2, AA2 - VPERM2I128 $0x13, CC2, DD2, BB2 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 JMP sealAVX2ShortSeal -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext sealAVX2Tail128: - // Need to decrypt up to 128 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0 - VMOVDQA state1StoreAVX2, BB0 - VMOVDQA state2StoreAVX2, CC0 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VMOVDQA DD0, DD1 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA 32(BP), Y14 + VMOVDQA 64(BP), Y12 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VMOVDQA Y4, Y1 sealAVX2Tail128LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail128LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0 - VPALIGNR $8, CC0, CC0, CC0 - VPALIGNR $12, DD0, DD0, DD0 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0 - VPALIGNR $8, CC0, CC0, CC0 - VPALIGNR $4, DD0, DD0, DD0 - DECQ itr1 - JG sealAVX2Tail128LoopA - DECQ itr2 - JGE sealAVX2Tail128LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA1 - VPADDD state1StoreAVX2, BB0, BB1 - VPADDD state2StoreAVX2, CC0, CC1 - VPADDD DD1, DD0, DD1 - - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ CX + JG sealAVX2Tail128LoopA + DECQ R9 + JGE sealAVX2Tail128LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y5 + VPADDD 32(BP), Y14, Y9 + VPADDD 64(BP), Y12, Y13 + VPADDD Y1, Y4, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 JMP sealAVX2ShortSealLoop -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext sealAVX2Tail256: - // Need to decrypt up to 256 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA DD0, TT1 - VMOVDQA DD1, TT2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 sealAVX2Tail256LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail256LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr1 - JG sealAVX2Tail256LoopA - DECQ itr2 - JGE sealAVX2Tail256LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 - VPERM2I128 $0x02, CC0, DD0, TT1 - VPERM2I128 $0x13, AA0, BB0, TT2 - VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - MOVQ $128, itr1 - LEAQ 128(inp), inp - SUBQ $128, inl - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 - - JMP sealAVX2SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 384 bytes of ciphertext + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ CX + JG sealAVX2Tail256LoopA + DECQ R9 + JGE sealAVX2Tail256LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + MOVQ $0x00000080, CX + LEAQ 128(SI), SI + SUBQ $0x80, BX + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + JMP sealAVX2SealHash + sealAVX2Tail384: - // Need to decrypt up to 384 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 + VMOVDQA Y2, Y15 sealAVX2Tail384LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail384LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr1 - JG sealAVX2Tail384LoopA - DECQ itr2 - JGE sealAVX2Tail384LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2 - VPERM2I128 $0x02, AA0, BB0, TT0 - VPERM2I128 $0x02, CC0, DD0, TT1 - VPERM2I128 $0x13, AA0, BB0, TT2 - VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, TT0 - VPERM2I128 $0x02, CC1, DD1, TT1 - VPERM2I128 $0x13, AA1, BB1, TT2 - VPERM2I128 $0x13, CC1, DD1, TT3 - VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 - VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) - MOVQ $256, itr1 - LEAQ 256(inp), inp - SUBQ $256, inl - VPERM2I128 $0x02, AA2, BB2, AA0 - VPERM2I128 $0x02, CC2, DD2, BB0 - VPERM2I128 $0x13, AA2, BB2, CC0 - VPERM2I128 $0x13, CC2, DD2, DD0 - - JMP sealAVX2SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 512 bytes of ciphertext + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ CX + JG sealAVX2Tail384LoopA + DECQ R9 + JGE sealAVX2Tail384LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPADDD Y15, Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + MOVQ $0x00000100, CX + LEAQ 256(SI), SI + SUBQ $0x00000100, BX + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + JMP sealAVX2SealHash + sealAVX2Tail512: - // Need to decrypt up to 512 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) sealAVX2Tail512LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail512LoopB: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyAdd(0*8(oup)) - polyMulAVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ (4*8)(oup), oup - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - - DECQ itr1 - JG sealAVX2Tail512LoopA - DECQ itr2 - JGE sealAVX2Tail512LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3 - VPXOR (0*32)(inp), CC3, CC3 - VMOVDQU CC3, (0*32)(oup) - VPERM2I128 $0x02, CC0, DD0, CC3 - VPXOR (1*32)(inp), CC3, CC3 - VMOVDQU CC3, (1*32)(oup) - VPERM2I128 $0x13, AA0, BB0, CC3 - VPXOR (2*32)(inp), CC3, CC3 - VMOVDQU CC3, (2*32)(oup) - VPERM2I128 $0x13, CC0, DD0, CC3 - VPXOR (3*32)(inp), CC3, CC3 - VMOVDQU CC3, (3*32)(oup) - - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) - - VPERM2I128 $0x02, AA2, BB2, AA0 - VPERM2I128 $0x02, CC2, DD2, BB0 - VPERM2I128 $0x13, AA2, BB2, CC0 - VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - - MOVQ $384, itr1 - LEAQ 384(inp), inp - SUBQ $384, inl - VPERM2I128 $0x02, AA3, BB3, AA0 - VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0 - VPERM2I128 $0x13, AA3, BB3, CC0 - VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - - JMP sealAVX2SealHash + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ CX + JG sealAVX2Tail512LoopA + DECQ R9 + JGE sealAVX2Tail512LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPXOR (SI), Y15, Y15 + VMOVDQU Y15, (DI) + VPERM2I128 $0x02, Y12, Y4, Y15 + VPXOR 32(SI), Y15, Y15 + VMOVDQU Y15, 32(DI) + VPERM2I128 $0x13, Y0, Y14, Y15 + VPXOR 64(SI), Y15, Y15 + VMOVDQU Y15, 64(DI) + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + MOVQ $0x00000180, CX + LEAQ 384(SI), SI + SUBQ $0x00000180, BX + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + JMP sealAVX2SealHash diff --git a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s index e0d3c64756692..133757384b787 100644 --- a/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s +++ b/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s @@ -1,108 +1,93 @@ -// Copyright 2012 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT. //go:build gc && !purego -#include "textflag.h" - -#define POLY1305_ADD(msg, h0, h1, h2) \ - ADDQ 0(msg), h0; \ - ADCQ 8(msg), h1; \ - ADCQ $1, h2; \ - LEAQ 16(msg), msg - -#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \ - MOVQ r0, AX; \ - MULQ h0; \ - MOVQ AX, t0; \ - MOVQ DX, t1; \ - MOVQ r0, AX; \ - MULQ h1; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ r0, t2; \ - IMULQ h2, t2; \ - ADDQ DX, t2; \ - \ - MOVQ r1, AX; \ - MULQ h0; \ - ADDQ AX, t1; \ - ADCQ $0, DX; \ - MOVQ DX, h0; \ - MOVQ r1, t3; \ - IMULQ h2, t3; \ - MOVQ r1, AX; \ - MULQ h1; \ - ADDQ AX, t2; \ - ADCQ DX, t3; \ - ADDQ h0, t2; \ - ADCQ $0, t3; \ - \ - MOVQ t0, h0; \ - MOVQ t1, h1; \ - MOVQ t2, h2; \ - ANDQ $3, h2; \ - MOVQ t2, t0; \ - ANDQ $0xFFFFFFFFFFFFFFFC, t0; \ - ADDQ t0, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2; \ - SHRQ $2, t3, t2; \ - SHRQ $2, t3; \ - ADDQ t2, h0; \ - ADCQ t3, h1; \ - ADCQ $0, h2 - -// func update(state *[7]uint64, msg []byte) +// func update(state *macState, msg []byte) TEXT ·update(SB), $0-32 MOVQ state+0(FP), DI MOVQ msg_base+8(FP), SI MOVQ msg_len+16(FP), R15 - - MOVQ 0(DI), R8 // h0 - MOVQ 8(DI), R9 // h1 - MOVQ 16(DI), R10 // h2 - MOVQ 24(DI), R11 // r0 - MOVQ 32(DI), R12 // r1 - - CMPQ R15, $16 + MOVQ (DI), R8 + MOVQ 8(DI), R9 + MOVQ 16(DI), R10 + MOVQ 24(DI), R11 + MOVQ 32(DI), R12 + CMPQ R15, $0x10 JB bytes_between_0_and_15 loop: - POLY1305_ADD(SI, R8, R9, R10) + ADDQ (SI), R8 + ADCQ 8(SI), R9 + ADCQ $0x01, R10 + LEAQ 16(SI), SI multiply: - POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14) - SUBQ $16, R15 - CMPQ R15, $16 - JAE loop + MOVQ R11, AX + MULQ R8 + MOVQ AX, BX + MOVQ DX, CX + MOVQ R11, AX + MULQ R9 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ R11, R13 + IMULQ R10, R13 + ADDQ DX, R13 + MOVQ R12, AX + MULQ R8 + ADDQ AX, CX + ADCQ $0x00, DX + MOVQ DX, R8 + MOVQ R12, R14 + IMULQ R10, R14 + MOVQ R12, AX + MULQ R9 + ADDQ AX, R13 + ADCQ DX, R14 + ADDQ R8, R13 + ADCQ $0x00, R14 + MOVQ BX, R8 + MOVQ CX, R9 + MOVQ R13, R10 + ANDQ $0x03, R10 + MOVQ R13, BX + ANDQ $-4, BX + ADDQ BX, R8 + ADCQ R14, R9 + ADCQ $0x00, R10 + SHRQ $0x02, R14, R13 + SHRQ $0x02, R14 + ADDQ R13, R8 + ADCQ R14, R9 + ADCQ $0x00, R10 + SUBQ $0x10, R15 + CMPQ R15, $0x10 + JAE loop bytes_between_0_and_15: TESTQ R15, R15 JZ done - MOVQ $1, BX + MOVQ $0x00000001, BX XORQ CX, CX XORQ R13, R13 ADDQ R15, SI flush_buffer: - SHLQ $8, BX, CX - SHLQ $8, BX + SHLQ $0x08, BX, CX + SHLQ $0x08, BX MOVB -1(SI), R13 XORQ R13, BX DECQ SI DECQ R15 JNZ flush_buffer - ADDQ BX, R8 ADCQ CX, R9 - ADCQ $0, R10 - MOVQ $16, R15 + ADCQ $0x00, R10 + MOVQ $0x00000010, R15 JMP multiply done: - MOVQ R8, 0(DI) + MOVQ R8, (DI) MOVQ R9, 8(DI) MOVQ R10, 16(DI) RET diff --git a/vendor/golang.org/x/exp/LICENSE b/vendor/golang.org/x/exp/LICENSE index 6a66aea5eafe0..2a7cf70da6e49 100644 --- a/vendor/golang.org/x/exp/LICENSE +++ b/vendor/golang.org/x/exp/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/vendor/golang.org/x/exp/slices/sort.go b/vendor/golang.org/x/exp/slices/sort.go index b67897f76b5d0..f58bbc7ba4de5 100644 --- a/vendor/golang.org/x/exp/slices/sort.go +++ b/vendor/golang.org/x/exp/slices/sort.go @@ -22,10 +22,12 @@ func Sort[S ~[]E, E constraints.Ordered](x S) { // SortFunc sorts the slice x in ascending order as determined by the cmp // function. This sort is not guaranteed to be stable. // cmp(a, b) should return a negative number when a < b, a positive number when -// a > b and zero when a == b. +// a > b and zero when a == b or when a is not comparable to b in the sense +// of the formal definition of Strict Weak Ordering. // // SortFunc requires that cmp is a strict weak ordering. // See https://en.wikipedia.org/wiki/Weak_ordering#Strict_weak_orderings. +// To indicate 'uncomparable', return 0 from the function. func SortFunc[S ~[]E, E any](x S, cmp func(a, b E) int) { n := len(x) pdqsortCmpFunc(x, 0, n, bits.Len(uint(n)), cmp) diff --git a/vendor/golang.org/x/mod/LICENSE b/vendor/golang.org/x/mod/LICENSE index 6a66aea5eafe0..2a7cf70da6e49 100644 --- a/vendor/golang.org/x/mod/LICENSE +++ b/vendor/golang.org/x/mod/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/vendor/golang.org/x/sys/cpu/cpu.go b/vendor/golang.org/x/sys/cpu/cpu.go index ec07aab05788c..02609d5b21d56 100644 --- a/vendor/golang.org/x/sys/cpu/cpu.go +++ b/vendor/golang.org/x/sys/cpu/cpu.go @@ -201,6 +201,25 @@ var S390X struct { _ CacheLinePad } +// RISCV64 contains the supported CPU features and performance characteristics for riscv64 +// platforms. The booleans in RISCV64, with the exception of HasFastMisaligned, indicate +// the presence of RISC-V extensions. +// +// It is safe to assume that all the RV64G extensions are supported and so they are omitted from +// this structure. As riscv64 Go programs require at least RV64G, the code that populates +// this structure cannot run successfully if some of the RV64G extensions are missing. +// The struct is padded to avoid false sharing. +var RISCV64 struct { + _ CacheLinePad + HasFastMisaligned bool // Fast misaligned accesses + HasC bool // Compressed instruction-set extension + HasV bool // Vector extension compatible with RVV 1.0 + HasZba bool // Address generation instructions extension + HasZbb bool // Basic bit-manipulation extension + HasZbs bool // Single-bit instructions extension + _ CacheLinePad +} + func init() { archInit() initOptions() diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go index cd63e73355734..7d902b6847bbf 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go +++ b/vendor/golang.org/x/sys/cpu/cpu_linux_noinit.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x +//go:build linux && !arm && !arm64 && !mips64 && !mips64le && !ppc64 && !ppc64le && !s390x && !riscv64 package cpu diff --git a/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go new file mode 100644 index 0000000000000..cb4a0c5728073 --- /dev/null +++ b/vendor/golang.org/x/sys/cpu/cpu_linux_riscv64.go @@ -0,0 +1,137 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package cpu + +import ( + "syscall" + "unsafe" +) + +// RISC-V extension discovery code for Linux. The approach here is to first try the riscv_hwprobe +// syscall falling back to HWCAP to check for the C extension if riscv_hwprobe is not available. +// +// A note on detection of the Vector extension using HWCAP. +// +// Support for the Vector extension version 1.0 was added to the Linux kernel in release 6.5. +// Support for the riscv_hwprobe syscall was added in 6.4. It follows that if the riscv_hwprobe +// syscall is not available then neither is the Vector extension (which needs kernel support). +// The riscv_hwprobe syscall should then be all we need to detect the Vector extension. +// However, some RISC-V board manufacturers ship boards with an older kernel on top of which +// they have back-ported various versions of the Vector extension patches but not the riscv_hwprobe +// patches. These kernels advertise support for the Vector extension using HWCAP. Falling +// back to HWCAP to detect the Vector extension, if riscv_hwprobe is not available, or simply not +// bothering with riscv_hwprobe at all and just using HWCAP may then seem like an attractive option. +// +// Unfortunately, simply checking the 'V' bit in AT_HWCAP will not work as this bit is used by +// RISC-V board and cloud instance providers to mean different things. The Lichee Pi 4A board +// and the Scaleway RV1 cloud instances use the 'V' bit to advertise their support for the unratified +// 0.7.1 version of the Vector Specification. The Banana Pi BPI-F3 and the CanMV-K230 board use +// it to advertise support for 1.0 of the Vector extension. Versions 0.7.1 and 1.0 of the Vector +// extension are binary incompatible. HWCAP can then not be used in isolation to populate the +// HasV field as this field indicates that the underlying CPU is compatible with RVV 1.0. +// +// There is a way at runtime to distinguish between versions 0.7.1 and 1.0 of the Vector +// specification by issuing a RVV 1.0 vsetvli instruction and checking the vill bit of the vtype +// register. This check would allow us to safely detect version 1.0 of the Vector extension +// with HWCAP, if riscv_hwprobe were not available. However, the check cannot +// be added until the assembler supports the Vector instructions. +// +// Note the riscv_hwprobe syscall does not suffer from these ambiguities by design as all of the +// extensions it advertises support for are explicitly versioned. It's also worth noting that +// the riscv_hwprobe syscall is the only way to detect multi-letter RISC-V extensions, e.g., Zba. +// These cannot be detected using HWCAP and so riscv_hwprobe must be used to detect the majority +// of RISC-V extensions. +// +// Please see https://docs.kernel.org/arch/riscv/hwprobe.html for more information. + +// golang.org/x/sys/cpu is not allowed to depend on golang.org/x/sys/unix so we must +// reproduce the constants, types and functions needed to make the riscv_hwprobe syscall +// here. + +const ( + // Copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go. + riscv_HWPROBE_KEY_IMA_EXT_0 = 0x4 + riscv_HWPROBE_IMA_C = 0x2 + riscv_HWPROBE_IMA_V = 0x4 + riscv_HWPROBE_EXT_ZBA = 0x8 + riscv_HWPROBE_EXT_ZBB = 0x10 + riscv_HWPROBE_EXT_ZBS = 0x20 + riscv_HWPROBE_KEY_CPUPERF_0 = 0x5 + riscv_HWPROBE_MISALIGNED_FAST = 0x3 + riscv_HWPROBE_MISALIGNED_MASK = 0x7 +) + +const ( + // sys_RISCV_HWPROBE is copied from golang.org/x/sys/unix/zsysnum_linux_riscv64.go. + sys_RISCV_HWPROBE = 258 +) + +// riscvHWProbePairs is copied from golang.org/x/sys/unix/ztypes_linux_riscv64.go. +type riscvHWProbePairs struct { + key int64 + value uint64 +} + +const ( + // CPU features + hwcap_RISCV_ISA_C = 1 << ('C' - 'A') +) + +func doinit() { + // A slice of key/value pair structures is passed to the RISCVHWProbe syscall. The key + // field should be initialised with one of the key constants defined above, e.g., + // RISCV_HWPROBE_KEY_IMA_EXT_0. The syscall will set the value field to the appropriate value. + // If the kernel does not recognise a key it will set the key field to -1 and the value field to 0. + + pairs := []riscvHWProbePairs{ + {riscv_HWPROBE_KEY_IMA_EXT_0, 0}, + {riscv_HWPROBE_KEY_CPUPERF_0, 0}, + } + + // This call only indicates that extensions are supported if they are implemented on all cores. + if riscvHWProbe(pairs, 0) { + if pairs[0].key != -1 { + v := uint(pairs[0].value) + RISCV64.HasC = isSet(v, riscv_HWPROBE_IMA_C) + RISCV64.HasV = isSet(v, riscv_HWPROBE_IMA_V) + RISCV64.HasZba = isSet(v, riscv_HWPROBE_EXT_ZBA) + RISCV64.HasZbb = isSet(v, riscv_HWPROBE_EXT_ZBB) + RISCV64.HasZbs = isSet(v, riscv_HWPROBE_EXT_ZBS) + } + if pairs[1].key != -1 { + v := pairs[1].value & riscv_HWPROBE_MISALIGNED_MASK + RISCV64.HasFastMisaligned = v == riscv_HWPROBE_MISALIGNED_FAST + } + } + + // Let's double check with HWCAP if the C extension does not appear to be supported. + // This may happen if we're running on a kernel older than 6.4. + + if !RISCV64.HasC { + RISCV64.HasC = isSet(hwCap, hwcap_RISCV_ISA_C) + } +} + +func isSet(hwc uint, value uint) bool { + return hwc&value != 0 +} + +// riscvHWProbe is a simplified version of the generated wrapper function found in +// golang.org/x/sys/unix/zsyscall_linux_riscv64.go. We simplify it by removing the +// cpuCount and cpus parameters which we do not need. We always want to pass 0 for +// these parameters here so the kernel only reports the extensions that are present +// on all cores. +func riscvHWProbe(pairs []riscvHWProbePairs, flags uint) bool { + var _zero uintptr + var p0 unsafe.Pointer + if len(pairs) > 0 { + p0 = unsafe.Pointer(&pairs[0]) + } else { + p0 = unsafe.Pointer(&_zero) + } + + _, _, e1 := syscall.Syscall6(sys_RISCV_HWPROBE, uintptr(p0), uintptr(len(pairs)), uintptr(0), uintptr(0), uintptr(flags), 0) + return e1 == 0 +} diff --git a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go index 7f0c79c004b44..aca3199c91169 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_riscv64.go +++ b/vendor/golang.org/x/sys/cpu/cpu_riscv64.go @@ -8,4 +8,13 @@ package cpu const cacheLineSize = 64 -func initOptions() {} +func initOptions() { + options = []option{ + {Name: "fastmisaligned", Feature: &RISCV64.HasFastMisaligned}, + {Name: "c", Feature: &RISCV64.HasC}, + {Name: "v", Feature: &RISCV64.HasV}, + {Name: "zba", Feature: &RISCV64.HasZba}, + {Name: "zbb", Feature: &RISCV64.HasZbb}, + {Name: "zbs", Feature: &RISCV64.HasZbs}, + } +} diff --git a/vendor/golang.org/x/sys/unix/mkerrors.sh b/vendor/golang.org/x/sys/unix/mkerrors.sh index d07dd09eb503c..e14b766a32c5f 100644 --- a/vendor/golang.org/x/sys/unix/mkerrors.sh +++ b/vendor/golang.org/x/sys/unix/mkerrors.sh @@ -552,6 +552,7 @@ ccflags="$@" $2 !~ /^RTC_VL_(ACCURACY|BACKUP|DATA)/ && $2 ~ /^(NETLINK|NLM|NLMSG|NLA|IFA|IFAN|RT|RTC|RTCF|RTN|RTPROT|RTNH|ARPHRD|ETH_P|NETNSA)_/ || $2 ~ /^SOCK_|SK_DIAG_|SKNLGRP_$/ || + $2 ~ /^(CONNECT|SAE)_/ || $2 ~ /^FIORDCHK$/ || $2 ~ /^SIOC/ || $2 ~ /^TIOC/ || diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin.go b/vendor/golang.org/x/sys/unix/syscall_darwin.go index 2d15200adb497..099867deedec6 100644 --- a/vendor/golang.org/x/sys/unix/syscall_darwin.go +++ b/vendor/golang.org/x/sys/unix/syscall_darwin.go @@ -566,6 +566,43 @@ func PthreadFchdir(fd int) (err error) { return pthread_fchdir_np(fd) } +// Connectx calls connectx(2) to initiate a connection on a socket. +// +// srcIf, srcAddr, and dstAddr are filled into a [SaEndpoints] struct and passed as the endpoints argument. +// +// - srcIf is the optional source interface index. 0 means unspecified. +// - srcAddr is the optional source address. nil means unspecified. +// - dstAddr is the destination address. +// +// On success, Connectx returns the number of bytes enqueued for transmission. +func Connectx(fd int, srcIf uint32, srcAddr, dstAddr Sockaddr, associd SaeAssocID, flags uint32, iov []Iovec, connid *SaeConnID) (n uintptr, err error) { + endpoints := SaEndpoints{ + Srcif: srcIf, + } + + if srcAddr != nil { + addrp, addrlen, err := srcAddr.sockaddr() + if err != nil { + return 0, err + } + endpoints.Srcaddr = (*RawSockaddr)(addrp) + endpoints.Srcaddrlen = uint32(addrlen) + } + + if dstAddr != nil { + addrp, addrlen, err := dstAddr.sockaddr() + if err != nil { + return 0, err + } + endpoints.Dstaddr = (*RawSockaddr)(addrp) + endpoints.Dstaddrlen = uint32(addrlen) + } + + err = connectx(fd, &endpoints, associd, flags, iov, &n, connid) + return +} + +//sys connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error) //sys sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) //sys shmat(id int, addr uintptr, flag int) (ret uintptr, err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_hurd.go b/vendor/golang.org/x/sys/unix/syscall_hurd.go index ba46651f8e38a..a6a2d2fc2b90c 100644 --- a/vendor/golang.org/x/sys/unix/syscall_hurd.go +++ b/vendor/golang.org/x/sys/unix/syscall_hurd.go @@ -11,6 +11,7 @@ package unix int ioctl(int, unsigned long int, uintptr_t); */ import "C" +import "unsafe" func ioctl(fd int, req uint, arg uintptr) (err error) { r0, er := C.ioctl(C.int(fd), C.ulong(req), C.uintptr_t(arg)) diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go index 4308ac1772b9f..d73c4652e6c54 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_amd64.go @@ -237,6 +237,9 @@ const ( CLOCK_UPTIME_RAW_APPROX = 0x9 CLONE_NOFOLLOW = 0x1 CLONE_NOOWNERCOPY = 0x2 + CONNECT_DATA_AUTHENTICATED = 0x4 + CONNECT_DATA_IDEMPOTENT = 0x2 + CONNECT_RESUME_ON_READ_WRITE = 0x1 CR0 = 0x0 CR1 = 0x1000 CR2 = 0x2000 @@ -1265,6 +1268,10 @@ const ( RTV_SSTHRESH = 0x20 RUSAGE_CHILDREN = -0x1 RUSAGE_SELF = 0x0 + SAE_ASSOCID_ALL = 0xffffffff + SAE_ASSOCID_ANY = 0x0 + SAE_CONNID_ALL = 0xffffffff + SAE_CONNID_ANY = 0x0 SCM_CREDS = 0x3 SCM_RIGHTS = 0x1 SCM_TIMESTAMP = 0x2 diff --git a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go index c8068a7a16900..4a55a40058894 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_darwin_arm64.go @@ -237,6 +237,9 @@ const ( CLOCK_UPTIME_RAW_APPROX = 0x9 CLONE_NOFOLLOW = 0x1 CLONE_NOOWNERCOPY = 0x2 + CONNECT_DATA_AUTHENTICATED = 0x4 + CONNECT_DATA_IDEMPOTENT = 0x2 + CONNECT_RESUME_ON_READ_WRITE = 0x1 CR0 = 0x0 CR1 = 0x1000 CR2 = 0x2000 @@ -1265,6 +1268,10 @@ const ( RTV_SSTHRESH = 0x20 RUSAGE_CHILDREN = -0x1 RUSAGE_SELF = 0x0 + SAE_ASSOCID_ALL = 0xffffffff + SAE_ASSOCID_ANY = 0x0 + SAE_CONNID_ALL = 0xffffffff + SAE_CONNID_ANY = 0x0 SCM_CREDS = 0x3 SCM_RIGHTS = 0x1 SCM_TIMESTAMP = 0x2 diff --git a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go index da08b2ab3d939..1ec2b1407b126 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go +++ b/vendor/golang.org/x/sys/unix/zerrors_zos_s390x.go @@ -581,6 +581,8 @@ const ( AT_EMPTY_PATH = 0x1000 AT_REMOVEDIR = 0x200 RENAME_NOREPLACE = 1 << 0 + ST_RDONLY = 1 + ST_NOSUID = 2 ) const ( diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go index b622533ef2c2b..24b346e1a3515 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go @@ -841,6 +841,26 @@ var libc_pthread_fchdir_np_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error) { + var _p0 unsafe.Pointer + if len(iov) > 0 { + _p0 = unsafe.Pointer(&iov[0]) + } else { + _p0 = unsafe.Pointer(&_zero) + } + _, _, e1 := syscall_syscall9(libc_connectx_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(endpoints)), uintptr(associd), uintptr(flags), uintptr(_p0), uintptr(len(iov)), uintptr(unsafe.Pointer(n)), uintptr(unsafe.Pointer(connid)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_connectx_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_connectx connectx "/usr/lib/libSystem.B.dylib" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) { _, _, e1 := syscall_syscall6(libc_sendfile_trampoline_addr, uintptr(infd), uintptr(outfd), uintptr(offset), uintptr(unsafe.Pointer(len)), uintptr(hdtr), uintptr(flags)) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s index cfe6646baf230..ebd213100b352 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s @@ -248,6 +248,11 @@ TEXT libc_pthread_fchdir_np_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_pthread_fchdir_np_trampoline_addr(SB), RODATA, $8 DATA ·libc_pthread_fchdir_np_trampoline_addr(SB)/8, $libc_pthread_fchdir_np_trampoline<>(SB) +TEXT libc_connectx_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_connectx(SB) +GLOBL ·libc_connectx_trampoline_addr(SB), RODATA, $8 +DATA ·libc_connectx_trampoline_addr(SB)/8, $libc_connectx_trampoline<>(SB) + TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sendfile(SB) GLOBL ·libc_sendfile_trampoline_addr(SB), RODATA, $8 diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go index 13f624f69f19d..824b9c2d5e0e0 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go @@ -841,6 +841,26 @@ var libc_pthread_fchdir_np_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func connectx(fd int, endpoints *SaEndpoints, associd SaeAssocID, flags uint32, iov []Iovec, n *uintptr, connid *SaeConnID) (err error) { + var _p0 unsafe.Pointer + if len(iov) > 0 { + _p0 = unsafe.Pointer(&iov[0]) + } else { + _p0 = unsafe.Pointer(&_zero) + } + _, _, e1 := syscall_syscall9(libc_connectx_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(endpoints)), uintptr(associd), uintptr(flags), uintptr(_p0), uintptr(len(iov)), uintptr(unsafe.Pointer(n)), uintptr(unsafe.Pointer(connid)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +var libc_connectx_trampoline_addr uintptr + +//go:cgo_import_dynamic libc_connectx connectx "/usr/lib/libSystem.B.dylib" + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func sendfile(infd int, outfd int, offset int64, len *int64, hdtr unsafe.Pointer, flags int) (err error) { _, _, e1 := syscall_syscall6(libc_sendfile_trampoline_addr, uintptr(infd), uintptr(outfd), uintptr(offset), uintptr(unsafe.Pointer(len)), uintptr(hdtr), uintptr(flags)) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s index fe222b75df04f..4f178a229345e 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s @@ -248,6 +248,11 @@ TEXT libc_pthread_fchdir_np_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_pthread_fchdir_np_trampoline_addr(SB), RODATA, $8 DATA ·libc_pthread_fchdir_np_trampoline_addr(SB)/8, $libc_pthread_fchdir_np_trampoline<>(SB) +TEXT libc_connectx_trampoline<>(SB),NOSPLIT,$0-0 + JMP libc_connectx(SB) +GLOBL ·libc_connectx_trampoline_addr(SB), RODATA, $8 +DATA ·libc_connectx_trampoline_addr(SB)/8, $libc_connectx_trampoline<>(SB) + TEXT libc_sendfile_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_sendfile(SB) GLOBL ·libc_sendfile_trampoline_addr(SB), RODATA, $8 diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go index 091d107f3a5c9..d003c3d43780c 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go @@ -306,6 +306,19 @@ type XVSockPgen struct { type _Socklen uint32 +type SaeAssocID uint32 + +type SaeConnID uint32 + +type SaEndpoints struct { + Srcif uint32 + Srcaddr *RawSockaddr + Srcaddrlen uint32 + Dstaddr *RawSockaddr + Dstaddrlen uint32 + _ [4]byte +} + type Xucred struct { Version uint32 Uid uint32 diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go index 28ff4ef74d0d3..0d45a941aaecc 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go @@ -306,6 +306,19 @@ type XVSockPgen struct { type _Socklen uint32 +type SaeAssocID uint32 + +type SaeConnID uint32 + +type SaEndpoints struct { + Srcif uint32 + Srcaddr *RawSockaddr + Srcaddrlen uint32 + Dstaddr *RawSockaddr + Dstaddrlen uint32 + _ [4]byte +} + type Xucred struct { Version uint32 Uid uint32 diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go index 6cbd094a3aa19..51e13eb055fc5 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go @@ -625,6 +625,7 @@ const ( POLLRDNORM = 0x40 POLLWRBAND = 0x100 POLLWRNORM = 0x4 + POLLRDHUP = 0x4000 ) type CapRights struct { diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go index 7c03b6ee77fa0..d002d8ef3cc2d 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go @@ -630,6 +630,7 @@ const ( POLLRDNORM = 0x40 POLLWRBAND = 0x100 POLLWRNORM = 0x4 + POLLRDHUP = 0x4000 ) type CapRights struct { diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go index 422107ee8b133..3f863d898dd85 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go @@ -616,6 +616,7 @@ const ( POLLRDNORM = 0x40 POLLWRBAND = 0x100 POLLWRNORM = 0x4 + POLLRDHUP = 0x4000 ) type CapRights struct { diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go index 505a12acfd9d2..61c7293106616 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go @@ -610,6 +610,7 @@ const ( POLLRDNORM = 0x40 POLLWRBAND = 0x100 POLLWRNORM = 0x4 + POLLRDHUP = 0x4000 ) type CapRights struct { diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go index cc986c7900660..b5d17414f039f 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_riscv64.go @@ -612,6 +612,7 @@ const ( POLLRDNORM = 0x40 POLLWRBAND = 0x100 POLLWRNORM = 0x4 + POLLRDHUP = 0x4000 ) type CapRights struct { diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux.go b/vendor/golang.org/x/sys/unix/ztypes_linux.go index 7f1961b907a2b..9f2550dc3120d 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux.go @@ -2486,7 +2486,7 @@ type XDPMmapOffsets struct { type XDPUmemReg struct { Addr uint64 Len uint64 - Chunk_size uint32 + Size uint32 Headroom uint32 Flags uint32 Tx_metadata_len uint32 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go index 15adc04142f2b..ad05b51a60364 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go @@ -727,6 +727,37 @@ const ( RISCV_HWPROBE_EXT_ZBA = 0x8 RISCV_HWPROBE_EXT_ZBB = 0x10 RISCV_HWPROBE_EXT_ZBS = 0x20 + RISCV_HWPROBE_EXT_ZICBOZ = 0x40 + RISCV_HWPROBE_EXT_ZBC = 0x80 + RISCV_HWPROBE_EXT_ZBKB = 0x100 + RISCV_HWPROBE_EXT_ZBKC = 0x200 + RISCV_HWPROBE_EXT_ZBKX = 0x400 + RISCV_HWPROBE_EXT_ZKND = 0x800 + RISCV_HWPROBE_EXT_ZKNE = 0x1000 + RISCV_HWPROBE_EXT_ZKNH = 0x2000 + RISCV_HWPROBE_EXT_ZKSED = 0x4000 + RISCV_HWPROBE_EXT_ZKSH = 0x8000 + RISCV_HWPROBE_EXT_ZKT = 0x10000 + RISCV_HWPROBE_EXT_ZVBB = 0x20000 + RISCV_HWPROBE_EXT_ZVBC = 0x40000 + RISCV_HWPROBE_EXT_ZVKB = 0x80000 + RISCV_HWPROBE_EXT_ZVKG = 0x100000 + RISCV_HWPROBE_EXT_ZVKNED = 0x200000 + RISCV_HWPROBE_EXT_ZVKNHA = 0x400000 + RISCV_HWPROBE_EXT_ZVKNHB = 0x800000 + RISCV_HWPROBE_EXT_ZVKSED = 0x1000000 + RISCV_HWPROBE_EXT_ZVKSH = 0x2000000 + RISCV_HWPROBE_EXT_ZVKT = 0x4000000 + RISCV_HWPROBE_EXT_ZFH = 0x8000000 + RISCV_HWPROBE_EXT_ZFHMIN = 0x10000000 + RISCV_HWPROBE_EXT_ZIHINTNTL = 0x20000000 + RISCV_HWPROBE_EXT_ZVFH = 0x40000000 + RISCV_HWPROBE_EXT_ZVFHMIN = 0x80000000 + RISCV_HWPROBE_EXT_ZFA = 0x100000000 + RISCV_HWPROBE_EXT_ZTSO = 0x200000000 + RISCV_HWPROBE_EXT_ZACAS = 0x400000000 + RISCV_HWPROBE_EXT_ZICOND = 0x800000000 + RISCV_HWPROBE_EXT_ZIHINTPAUSE = 0x1000000000 RISCV_HWPROBE_KEY_CPUPERF_0 = 0x5 RISCV_HWPROBE_MISALIGNED_UNKNOWN = 0x0 RISCV_HWPROBE_MISALIGNED_EMULATED = 0x1 @@ -734,4 +765,6 @@ const ( RISCV_HWPROBE_MISALIGNED_FAST = 0x3 RISCV_HWPROBE_MISALIGNED_UNSUPPORTED = 0x4 RISCV_HWPROBE_MISALIGNED_MASK = 0x7 + RISCV_HWPROBE_KEY_ZICBOZ_BLOCK_SIZE = 0x6 + RISCV_HWPROBE_WHICH_CPUS = 0x1 ) diff --git a/vendor/golang.org/x/sys/windows/syscall_windows.go b/vendor/golang.org/x/sys/windows/syscall_windows.go index 1fa34fd17c5cd..5cee9a3143fd5 100644 --- a/vendor/golang.org/x/sys/windows/syscall_windows.go +++ b/vendor/golang.org/x/sys/windows/syscall_windows.go @@ -313,6 +313,10 @@ func NewCallbackCDecl(fn interface{}) uintptr { //sys SetConsoleMode(console Handle, mode uint32) (err error) = kernel32.SetConsoleMode //sys GetConsoleScreenBufferInfo(console Handle, info *ConsoleScreenBufferInfo) (err error) = kernel32.GetConsoleScreenBufferInfo //sys setConsoleCursorPosition(console Handle, position uint32) (err error) = kernel32.SetConsoleCursorPosition +//sys GetConsoleCP() (cp uint32, err error) = kernel32.GetConsoleCP +//sys GetConsoleOutputCP() (cp uint32, err error) = kernel32.GetConsoleOutputCP +//sys SetConsoleCP(cp uint32) (err error) = kernel32.SetConsoleCP +//sys SetConsoleOutputCP(cp uint32) (err error) = kernel32.SetConsoleOutputCP //sys WriteConsole(console Handle, buf *uint16, towrite uint32, written *uint32, reserved *byte) (err error) = kernel32.WriteConsoleW //sys ReadConsole(console Handle, buf *uint16, toread uint32, read *uint32, inputControl *byte) (err error) = kernel32.ReadConsoleW //sys resizePseudoConsole(pconsole Handle, size uint32) (hr error) = kernel32.ResizePseudoConsole diff --git a/vendor/golang.org/x/sys/windows/types_windows.go b/vendor/golang.org/x/sys/windows/types_windows.go index 3f03b3d57cc35..7b97a154c9573 100644 --- a/vendor/golang.org/x/sys/windows/types_windows.go +++ b/vendor/golang.org/x/sys/windows/types_windows.go @@ -1060,6 +1060,7 @@ const ( SIO_GET_EXTENSION_FUNCTION_POINTER = IOC_INOUT | IOC_WS2 | 6 SIO_KEEPALIVE_VALS = IOC_IN | IOC_VENDOR | 4 SIO_UDP_CONNRESET = IOC_IN | IOC_VENDOR | 12 + SIO_UDP_NETRESET = IOC_IN | IOC_VENDOR | 15 // cf. http://support.microsoft.com/default.aspx?scid=kb;en-us;257460 diff --git a/vendor/golang.org/x/sys/windows/zsyscall_windows.go b/vendor/golang.org/x/sys/windows/zsyscall_windows.go index 9bb979a3e4726..4c2e1bdc01ed3 100644 --- a/vendor/golang.org/x/sys/windows/zsyscall_windows.go +++ b/vendor/golang.org/x/sys/windows/zsyscall_windows.go @@ -247,7 +247,9 @@ var ( procGetCommandLineW = modkernel32.NewProc("GetCommandLineW") procGetComputerNameExW = modkernel32.NewProc("GetComputerNameExW") procGetComputerNameW = modkernel32.NewProc("GetComputerNameW") + procGetConsoleCP = modkernel32.NewProc("GetConsoleCP") procGetConsoleMode = modkernel32.NewProc("GetConsoleMode") + procGetConsoleOutputCP = modkernel32.NewProc("GetConsoleOutputCP") procGetConsoleScreenBufferInfo = modkernel32.NewProc("GetConsoleScreenBufferInfo") procGetCurrentDirectoryW = modkernel32.NewProc("GetCurrentDirectoryW") procGetCurrentProcessId = modkernel32.NewProc("GetCurrentProcessId") @@ -347,8 +349,10 @@ var ( procSetCommMask = modkernel32.NewProc("SetCommMask") procSetCommState = modkernel32.NewProc("SetCommState") procSetCommTimeouts = modkernel32.NewProc("SetCommTimeouts") + procSetConsoleCP = modkernel32.NewProc("SetConsoleCP") procSetConsoleCursorPosition = modkernel32.NewProc("SetConsoleCursorPosition") procSetConsoleMode = modkernel32.NewProc("SetConsoleMode") + procSetConsoleOutputCP = modkernel32.NewProc("SetConsoleOutputCP") procSetCurrentDirectoryW = modkernel32.NewProc("SetCurrentDirectoryW") procSetDefaultDllDirectories = modkernel32.NewProc("SetDefaultDllDirectories") procSetDllDirectoryW = modkernel32.NewProc("SetDllDirectoryW") @@ -2162,6 +2166,15 @@ func GetComputerName(buf *uint16, n *uint32) (err error) { return } +func GetConsoleCP() (cp uint32, err error) { + r0, _, e1 := syscall.Syscall(procGetConsoleCP.Addr(), 0, 0, 0, 0) + cp = uint32(r0) + if cp == 0 { + err = errnoErr(e1) + } + return +} + func GetConsoleMode(console Handle, mode *uint32) (err error) { r1, _, e1 := syscall.Syscall(procGetConsoleMode.Addr(), 2, uintptr(console), uintptr(unsafe.Pointer(mode)), 0) if r1 == 0 { @@ -2170,6 +2183,15 @@ func GetConsoleMode(console Handle, mode *uint32) (err error) { return } +func GetConsoleOutputCP() (cp uint32, err error) { + r0, _, e1 := syscall.Syscall(procGetConsoleOutputCP.Addr(), 0, 0, 0, 0) + cp = uint32(r0) + if cp == 0 { + err = errnoErr(e1) + } + return +} + func GetConsoleScreenBufferInfo(console Handle, info *ConsoleScreenBufferInfo) (err error) { r1, _, e1 := syscall.Syscall(procGetConsoleScreenBufferInfo.Addr(), 2, uintptr(console), uintptr(unsafe.Pointer(info)), 0) if r1 == 0 { @@ -3038,6 +3060,14 @@ func SetCommTimeouts(handle Handle, timeouts *CommTimeouts) (err error) { return } +func SetConsoleCP(cp uint32) (err error) { + r1, _, e1 := syscall.Syscall(procSetConsoleCP.Addr(), 1, uintptr(cp), 0, 0) + if r1 == 0 { + err = errnoErr(e1) + } + return +} + func setConsoleCursorPosition(console Handle, position uint32) (err error) { r1, _, e1 := syscall.Syscall(procSetConsoleCursorPosition.Addr(), 2, uintptr(console), uintptr(position), 0) if r1 == 0 { @@ -3054,6 +3084,14 @@ func SetConsoleMode(console Handle, mode uint32) (err error) { return } +func SetConsoleOutputCP(cp uint32) (err error) { + r1, _, e1 := syscall.Syscall(procSetConsoleOutputCP.Addr(), 1, uintptr(cp), 0, 0) + if r1 == 0 { + err = errnoErr(e1) + } + return +} + func SetCurrentDirectory(path *uint16) (err error) { r1, _, e1 := syscall.Syscall(procSetCurrentDirectoryW.Addr(), 1, uintptr(unsafe.Pointer(path)), 0, 0) if r1 == 0 { diff --git a/vendor/golang.org/x/term/term_windows.go b/vendor/golang.org/x/term/term_windows.go index 465f560604e41..df6bf948e1407 100644 --- a/vendor/golang.org/x/term/term_windows.go +++ b/vendor/golang.org/x/term/term_windows.go @@ -26,6 +26,7 @@ func makeRaw(fd int) (*State, error) { return nil, err } raw := st &^ (windows.ENABLE_ECHO_INPUT | windows.ENABLE_PROCESSED_INPUT | windows.ENABLE_LINE_INPUT | windows.ENABLE_PROCESSED_OUTPUT) + raw |= windows.ENABLE_VIRTUAL_TERMINAL_INPUT if err := windows.SetConsoleMode(windows.Handle(fd), raw); err != nil { return nil, err } diff --git a/vendor/golang.org/x/tools/LICENSE b/vendor/golang.org/x/tools/LICENSE index 6a66aea5eafe0..2a7cf70da6e49 100644 --- a/vendor/golang.org/x/tools/LICENSE +++ b/vendor/golang.org/x/tools/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2009 The Go Authors. All rights reserved. +Copyright 2009 The Go Authors. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -10,7 +10,7 @@ notice, this list of conditions and the following disclaimer. copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - * Neither the name of Google Inc. nor the names of its + * Neither the name of Google LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. diff --git a/vendor/golang.org/x/tools/go/packages/external.go b/vendor/golang.org/x/tools/go/packages/external.go index c2b4b711b59df..8f7afcb5dfbd8 100644 --- a/vendor/golang.org/x/tools/go/packages/external.go +++ b/vendor/golang.org/x/tools/go/packages/external.go @@ -82,7 +82,7 @@ type DriverResponse struct { type driver func(cfg *Config, patterns ...string) (*DriverResponse, error) // findExternalDriver returns the file path of a tool that supplies -// the build system package structure, or "" if not found." +// the build system package structure, or "" if not found. // If GOPACKAGESDRIVER is set in the environment findExternalTool returns its // value, otherwise it searches for a binary named gopackagesdriver on the PATH. func findExternalDriver(cfg *Config) driver { diff --git a/vendor/golang.org/x/tools/go/packages/packages.go b/vendor/golang.org/x/tools/go/packages/packages.go index 34306ddd39007..0b6bfaff8087f 100644 --- a/vendor/golang.org/x/tools/go/packages/packages.go +++ b/vendor/golang.org/x/tools/go/packages/packages.go @@ -46,7 +46,6 @@ import ( // // Unfortunately there are a number of open bugs related to // interactions among the LoadMode bits: -// - https://github.com/golang/go/issues/48226 // - https://github.com/golang/go/issues/56633 // - https://github.com/golang/go/issues/56677 // - https://github.com/golang/go/issues/58726 @@ -76,7 +75,7 @@ const ( // NeedTypes adds Types, Fset, and IllTyped. NeedTypes - // NeedSyntax adds Syntax. + // NeedSyntax adds Syntax and Fset. NeedSyntax // NeedTypesInfo adds TypesInfo. @@ -961,12 +960,14 @@ func (ld *loader) refine(response *DriverResponse) ([]*Package, error) { } if ld.requestedMode&NeedTypes == 0 { ld.pkgs[i].Types = nil - ld.pkgs[i].Fset = nil ld.pkgs[i].IllTyped = false } if ld.requestedMode&NeedSyntax == 0 { ld.pkgs[i].Syntax = nil } + if ld.requestedMode&NeedTypes == 0 && ld.requestedMode&NeedSyntax == 0 { + ld.pkgs[i].Fset = nil + } if ld.requestedMode&NeedTypesInfo == 0 { ld.pkgs[i].TypesInfo = nil } @@ -1499,6 +1500,10 @@ func impliedLoadMode(loadMode LoadMode) LoadMode { // All these things require knowing the import graph. loadMode |= NeedImports } + if loadMode&NeedTypes != 0 { + // Types require the GoVersion from Module. + loadMode |= NeedModule + } return loadMode } diff --git a/vendor/golang.org/x/tools/go/packages/visit.go b/vendor/golang.org/x/tools/go/packages/visit.go index a1dcc40b7270d..df14ffd94dc3e 100644 --- a/vendor/golang.org/x/tools/go/packages/visit.go +++ b/vendor/golang.org/x/tools/go/packages/visit.go @@ -49,11 +49,20 @@ func Visit(pkgs []*Package, pre func(*Package) bool, post func(*Package)) { // PrintErrors returns the number of errors printed. func PrintErrors(pkgs []*Package) int { var n int + errModules := make(map[*Module]bool) Visit(pkgs, nil, func(pkg *Package) { for _, err := range pkg.Errors { fmt.Fprintln(os.Stderr, err) n++ } + + // Print pkg.Module.Error once if present. + mod := pkg.Module + if mod != nil && mod.Error != nil && !errModules[mod] { + errModules[mod] = true + fmt.Fprintln(os.Stderr, mod.Error.Err) + n++ + } }) return n } diff --git a/vendor/golang.org/x/tools/go/types/objectpath/objectpath.go b/vendor/golang.org/x/tools/go/types/objectpath/objectpath.go index d648c3d071b00..9ada177758ff7 100644 --- a/vendor/golang.org/x/tools/go/types/objectpath/objectpath.go +++ b/vendor/golang.org/x/tools/go/types/objectpath/objectpath.go @@ -51,7 +51,7 @@ type Path string // // PO package->object Package.Scope.Lookup // OT object->type Object.Type -// TT type->type Type.{Elem,Key,{,{,Recv}Type}Params,Results,Underlying} [EKPRUTrC] +// TT type->type Type.{Elem,Key,{,{,Recv}Type}Params,Results,Underlying,Rhs} [EKPRUTrCa] // TO type->object Type.{At,Field,Method,Obj} [AFMO] // // All valid paths start with a package and end at an object @@ -63,7 +63,7 @@ type Path string // - The only PO operator is Package.Scope.Lookup, which requires an identifier. // - The only OT operator is Object.Type, // which we encode as '.' because dot cannot appear in an identifier. -// - The TT operators are encoded as [EKPRUTrC]; +// - The TT operators are encoded as [EKPRUTrCa]; // two of these ({,Recv}TypeParams) require an integer operand, // which is encoded as a string of decimal digits. // - The TO operators are encoded as [AFMO]; @@ -106,6 +106,7 @@ const ( opTypeParam = 'T' // .TypeParams.At(i) (Named, Signature) opRecvTypeParam = 'r' // .RecvTypeParams.At(i) (Signature) opConstraint = 'C' // .Constraint() (TypeParam) + opRhs = 'a' // .Rhs() (Alias) // type->object operators opAt = 'A' // .At(i) (Tuple) @@ -279,21 +280,26 @@ func (enc *Encoder) For(obj types.Object) (Path, error) { path = append(path, opType) T := o.Type() + if alias, ok := T.(*aliases.Alias); ok { + if r := findTypeParam(obj, aliases.TypeParams(alias), path, opTypeParam, nil); r != nil { + return Path(r), nil + } + if r := find(obj, aliases.Rhs(alias), append(path, opRhs), nil); r != nil { + return Path(r), nil + } - if tname.IsAlias() { - // type alias + } else if tname.IsAlias() { + // legacy alias if r := find(obj, T, path, nil); r != nil { return Path(r), nil } - } else { - if named, _ := T.(*types.Named); named != nil { - if r := findTypeParam(obj, named.TypeParams(), path, opTypeParam, nil); r != nil { - // generic named type - return Path(r), nil - } - } + + } else if named, ok := T.(*types.Named); ok { // defined (named) type - if r := find(obj, T.Underlying(), append(path, opUnderlying), nil); r != nil { + if r := findTypeParam(obj, named.TypeParams(), path, opTypeParam, nil); r != nil { + return Path(r), nil + } + if r := find(obj, named.Underlying(), append(path, opUnderlying), nil); r != nil { return Path(r), nil } } @@ -657,6 +663,16 @@ func Object(pkg *types.Package, p Path) (types.Object, error) { } t = named.Underlying() + case opRhs: + if alias, ok := t.(*aliases.Alias); ok { + t = aliases.Rhs(alias) + } else if false && aliases.Enabled() { + // The Enabled check is too expensive, so for now we + // simply assume that aliases are not enabled. + // TODO(adonovan): replace with "if true {" when go1.24 is assured. + return nil, fmt.Errorf("cannot apply %q to %s (got %T, want alias)", code, t, t) + } + case opTypeParam: hasTypeParams, ok := t.(hasTypeParams) // Named, Signature if !ok { diff --git a/vendor/golang.org/x/tools/internal/aliases/aliases.go b/vendor/golang.org/x/tools/internal/aliases/aliases.go index c24c2eee457f2..f7798e3354e48 100644 --- a/vendor/golang.org/x/tools/internal/aliases/aliases.go +++ b/vendor/golang.org/x/tools/internal/aliases/aliases.go @@ -22,11 +22,17 @@ import ( // GODEBUG=gotypesalias=... by invoking the type checker. The Enabled // function is expensive and should be called once per task (e.g. // package import), not once per call to NewAlias. -func NewAlias(enabled bool, pos token.Pos, pkg *types.Package, name string, rhs types.Type) *types.TypeName { +// +// Precondition: enabled || len(tparams)==0. +// If materialized aliases are disabled, there must not be any type parameters. +func NewAlias(enabled bool, pos token.Pos, pkg *types.Package, name string, rhs types.Type, tparams []*types.TypeParam) *types.TypeName { if enabled { tname := types.NewTypeName(pos, pkg, name, nil) - newAlias(tname, rhs) + newAlias(tname, rhs, tparams) return tname } + if len(tparams) > 0 { + panic("cannot create an alias with type parameters when gotypesalias is not enabled") + } return types.NewTypeName(pos, pkg, name, rhs) } diff --git a/vendor/golang.org/x/tools/internal/aliases/aliases_go121.go b/vendor/golang.org/x/tools/internal/aliases/aliases_go121.go index c027b9f315f68..a775fcc4bedad 100644 --- a/vendor/golang.org/x/tools/internal/aliases/aliases_go121.go +++ b/vendor/golang.org/x/tools/internal/aliases/aliases_go121.go @@ -15,15 +15,21 @@ import ( // It will never be created by go/types. type Alias struct{} -func (*Alias) String() string { panic("unreachable") } -func (*Alias) Underlying() types.Type { panic("unreachable") } -func (*Alias) Obj() *types.TypeName { panic("unreachable") } -func Rhs(alias *Alias) types.Type { panic("unreachable") } +func (*Alias) String() string { panic("unreachable") } +func (*Alias) Underlying() types.Type { panic("unreachable") } +func (*Alias) Obj() *types.TypeName { panic("unreachable") } +func Rhs(alias *Alias) types.Type { panic("unreachable") } +func TypeParams(alias *Alias) *types.TypeParamList { panic("unreachable") } +func SetTypeParams(alias *Alias, tparams []*types.TypeParam) { panic("unreachable") } +func TypeArgs(alias *Alias) *types.TypeList { panic("unreachable") } +func Origin(alias *Alias) *Alias { panic("unreachable") } // Unalias returns the type t for go <=1.21. func Unalias(t types.Type) types.Type { return t } -func newAlias(name *types.TypeName, rhs types.Type) *Alias { panic("unreachable") } +func newAlias(name *types.TypeName, rhs types.Type, tparams []*types.TypeParam) *Alias { + panic("unreachable") +} // Enabled reports whether [NewAlias] should create [types.Alias] types. // diff --git a/vendor/golang.org/x/tools/internal/aliases/aliases_go122.go b/vendor/golang.org/x/tools/internal/aliases/aliases_go122.go index b32995484190a..31c159e42e6c4 100644 --- a/vendor/golang.org/x/tools/internal/aliases/aliases_go122.go +++ b/vendor/golang.org/x/tools/internal/aliases/aliases_go122.go @@ -28,16 +28,51 @@ func Rhs(alias *Alias) types.Type { return Unalias(alias) } +// TypeParams returns the type parameter list of the alias. +func TypeParams(alias *Alias) *types.TypeParamList { + if alias, ok := any(alias).(interface{ TypeParams() *types.TypeParamList }); ok { + return alias.TypeParams() // go1.23+ + } + return nil +} + +// SetTypeParams sets the type parameters of the alias type. +func SetTypeParams(alias *Alias, tparams []*types.TypeParam) { + if alias, ok := any(alias).(interface { + SetTypeParams(tparams []*types.TypeParam) + }); ok { + alias.SetTypeParams(tparams) // go1.23+ + } else if len(tparams) > 0 { + panic("cannot set type parameters of an Alias type in go1.22") + } +} + +// TypeArgs returns the type arguments used to instantiate the Alias type. +func TypeArgs(alias *Alias) *types.TypeList { + if alias, ok := any(alias).(interface{ TypeArgs() *types.TypeList }); ok { + return alias.TypeArgs() // go1.23+ + } + return nil // empty (go1.22) +} + +// Origin returns the generic Alias type of which alias is an instance. +// If alias is not an instance of a generic alias, Origin returns alias. +func Origin(alias *Alias) *Alias { + if alias, ok := any(alias).(interface{ Origin() *types.Alias }); ok { + return alias.Origin() // go1.23+ + } + return alias // not an instance of a generic alias (go1.22) +} + // Unalias is a wrapper of types.Unalias. func Unalias(t types.Type) types.Type { return types.Unalias(t) } // newAlias is an internal alias around types.NewAlias. // Direct usage is discouraged as the moment. // Try to use NewAlias instead. -func newAlias(tname *types.TypeName, rhs types.Type) *Alias { +func newAlias(tname *types.TypeName, rhs types.Type, tparams []*types.TypeParam) *Alias { a := types.NewAlias(tname, rhs) - // TODO(go.dev/issue/65455): Remove kludgy workaround to set a.actual as a side-effect. - Unalias(a) + SetTypeParams(a, tparams) return a } diff --git a/vendor/golang.org/x/tools/internal/gcimporter/iexport.go b/vendor/golang.org/x/tools/internal/gcimporter/iexport.go index deeb67f315af8..5f283281a2599 100644 --- a/vendor/golang.org/x/tools/internal/gcimporter/iexport.go +++ b/vendor/golang.org/x/tools/internal/gcimporter/iexport.go @@ -2,9 +2,227 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// Indexed binary package export. -// This file was derived from $GOROOT/src/cmd/compile/internal/gc/iexport.go; -// see that file for specification of the format. +// Indexed package export. +// +// The indexed export data format is an evolution of the previous +// binary export data format. Its chief contribution is introducing an +// index table, which allows efficient random access of individual +// declarations and inline function bodies. In turn, this allows +// avoiding unnecessary work for compilation units that import large +// packages. +// +// +// The top-level data format is structured as: +// +// Header struct { +// Tag byte // 'i' +// Version uvarint +// StringSize uvarint +// DataSize uvarint +// } +// +// Strings [StringSize]byte +// Data [DataSize]byte +// +// MainIndex []struct{ +// PkgPath stringOff +// PkgName stringOff +// PkgHeight uvarint +// +// Decls []struct{ +// Name stringOff +// Offset declOff +// } +// } +// +// Fingerprint [8]byte +// +// uvarint means a uint64 written out using uvarint encoding. +// +// []T means a uvarint followed by that many T objects. In other +// words: +// +// Len uvarint +// Elems [Len]T +// +// stringOff means a uvarint that indicates an offset within the +// Strings section. At that offset is another uvarint, followed by +// that many bytes, which form the string value. +// +// declOff means a uvarint that indicates an offset within the Data +// section where the associated declaration can be found. +// +// +// There are five kinds of declarations, distinguished by their first +// byte: +// +// type Var struct { +// Tag byte // 'V' +// Pos Pos +// Type typeOff +// } +// +// type Func struct { +// Tag byte // 'F' or 'G' +// Pos Pos +// TypeParams []typeOff // only present if Tag == 'G' +// Signature Signature +// } +// +// type Const struct { +// Tag byte // 'C' +// Pos Pos +// Value Value +// } +// +// type Type struct { +// Tag byte // 'T' or 'U' +// Pos Pos +// TypeParams []typeOff // only present if Tag == 'U' +// Underlying typeOff +// +// Methods []struct{ // omitted if Underlying is an interface type +// Pos Pos +// Name stringOff +// Recv Param +// Signature Signature +// } +// } +// +// type Alias struct { +// Tag byte // 'A' or 'B' +// Pos Pos +// TypeParams []typeOff // only present if Tag == 'B' +// Type typeOff +// } +// +// // "Automatic" declaration of each typeparam +// type TypeParam struct { +// Tag byte // 'P' +// Pos Pos +// Implicit bool +// Constraint typeOff +// } +// +// typeOff means a uvarint that either indicates a predeclared type, +// or an offset into the Data section. If the uvarint is less than +// predeclReserved, then it indicates the index into the predeclared +// types list (see predeclared in bexport.go for order). Otherwise, +// subtracting predeclReserved yields the offset of a type descriptor. +// +// Value means a type, kind, and type-specific value. See +// (*exportWriter).value for details. +// +// +// There are twelve kinds of type descriptors, distinguished by an itag: +// +// type DefinedType struct { +// Tag itag // definedType +// Name stringOff +// PkgPath stringOff +// } +// +// type PointerType struct { +// Tag itag // pointerType +// Elem typeOff +// } +// +// type SliceType struct { +// Tag itag // sliceType +// Elem typeOff +// } +// +// type ArrayType struct { +// Tag itag // arrayType +// Len uint64 +// Elem typeOff +// } +// +// type ChanType struct { +// Tag itag // chanType +// Dir uint64 // 1 RecvOnly; 2 SendOnly; 3 SendRecv +// Elem typeOff +// } +// +// type MapType struct { +// Tag itag // mapType +// Key typeOff +// Elem typeOff +// } +// +// type FuncType struct { +// Tag itag // signatureType +// PkgPath stringOff +// Signature Signature +// } +// +// type StructType struct { +// Tag itag // structType +// PkgPath stringOff +// Fields []struct { +// Pos Pos +// Name stringOff +// Type typeOff +// Embedded bool +// Note stringOff +// } +// } +// +// type InterfaceType struct { +// Tag itag // interfaceType +// PkgPath stringOff +// Embeddeds []struct { +// Pos Pos +// Type typeOff +// } +// Methods []struct { +// Pos Pos +// Name stringOff +// Signature Signature +// } +// } +// +// // Reference to a type param declaration +// type TypeParamType struct { +// Tag itag // typeParamType +// Name stringOff +// PkgPath stringOff +// } +// +// // Instantiation of a generic type (like List[T2] or List[int]) +// type InstanceType struct { +// Tag itag // instanceType +// Pos pos +// TypeArgs []typeOff +// BaseType typeOff +// } +// +// type UnionType struct { +// Tag itag // interfaceType +// Terms []struct { +// tilde bool +// Type typeOff +// } +// } +// +// +// +// type Signature struct { +// Params []Param +// Results []Param +// Variadic bool // omitted if Results is empty +// } +// +// type Param struct { +// Pos Pos +// Name stringOff +// Type typOff +// } +// +// +// Pos encodes a file:line:column triple, incorporating a simple delta +// encoding scheme within a data object. See exportWriter.pos for +// details. package gcimporter @@ -523,9 +741,22 @@ func (p *iexporter) doDecl(obj types.Object) { } if obj.IsAlias() { - w.tag(aliasTag) + alias, materialized := t.(*aliases.Alias) // may fail when aliases are not enabled + + var tparams *types.TypeParamList + if materialized { + tparams = aliases.TypeParams(alias) + } + if tparams.Len() == 0 { + w.tag(aliasTag) + } else { + w.tag(genericAliasTag) + } w.pos(obj.Pos()) - if alias, ok := t.(*aliases.Alias); ok { + if tparams.Len() > 0 { + w.tparamList(obj.Name(), tparams, obj.Pkg()) + } + if materialized { // Preserve materialized aliases, // even of non-exported types. t = aliases.Rhs(alias) @@ -745,7 +976,13 @@ func (w *exportWriter) doTyp(t types.Type, pkg *types.Package) { } switch t := t.(type) { case *aliases.Alias: - // TODO(adonovan): support parameterized aliases, following *types.Named. + if targs := aliases.TypeArgs(t); targs.Len() > 0 { + w.startType(instanceType) + w.pos(t.Obj().Pos()) + w.typeList(targs, pkg) + w.typ(aliases.Origin(t), pkg) + return + } w.startType(aliasType) w.qualifiedType(t.Obj()) diff --git a/vendor/golang.org/x/tools/internal/gcimporter/iimport.go b/vendor/golang.org/x/tools/internal/gcimporter/iimport.go index 136aa03653cce..ed2d5629596b7 100644 --- a/vendor/golang.org/x/tools/internal/gcimporter/iimport.go +++ b/vendor/golang.org/x/tools/internal/gcimporter/iimport.go @@ -3,7 +3,7 @@ // license that can be found in the LICENSE file. // Indexed package import. -// See cmd/compile/internal/gc/iexport.go for the export data format. +// See iexport.go for the export data format. // This file is a copy of $GOROOT/src/go/internal/gcimporter/iimport.go. @@ -562,14 +562,14 @@ func (r *importReader) obj(name string) { pos := r.pos() switch tag { - case aliasTag: + case aliasTag, genericAliasTag: + var tparams []*types.TypeParam + if tag == genericAliasTag { + tparams = r.tparamList() + } typ := r.typ() - // TODO(adonovan): support generic aliases: - // if tag == genericAliasTag { - // tparams := r.tparamList() - // alias.SetTypeParams(tparams) - // } - r.declare(aliases.NewAlias(r.p.aliases, pos, r.currPkg, name, typ)) + obj := aliases.NewAlias(r.p.aliases, pos, r.currPkg, name, typ, tparams) + r.declare(obj) case constTag: typ, val := r.value() @@ -862,7 +862,7 @@ func (r *importReader) string() string { return r.p.stringAt(r.uint64()) } func (r *importReader) doType(base *types.Named) (res types.Type) { k := r.kind() if debug { - r.p.trace("importing type %d (base: %s)", k, base) + r.p.trace("importing type %d (base: %v)", k, base) r.p.indent++ defer func() { r.p.indent-- diff --git a/vendor/golang.org/x/tools/internal/gcimporter/ureader_yes.go b/vendor/golang.org/x/tools/internal/gcimporter/ureader_yes.go index 2c0770688771a..f0742f5404bad 100644 --- a/vendor/golang.org/x/tools/internal/gcimporter/ureader_yes.go +++ b/vendor/golang.org/x/tools/internal/gcimporter/ureader_yes.go @@ -52,8 +52,7 @@ func (pr *pkgReader) later(fn func()) { // See cmd/compile/internal/noder.derivedInfo. type derivedInfo struct { - idx pkgbits.Index - needed bool + idx pkgbits.Index } // See cmd/compile/internal/noder.typeInfo. @@ -110,13 +109,17 @@ func readUnifiedPackage(fset *token.FileSet, ctxt *types.Context, imports map[st r := pr.newReader(pkgbits.RelocMeta, pkgbits.PublicRootIdx, pkgbits.SyncPublic) pkg := r.pkg() - r.Bool() // has init + if r.Version().Has(pkgbits.HasInit) { + r.Bool() + } for i, n := 0, r.Len(); i < n; i++ { // As if r.obj(), but avoiding the Scope.Lookup call, // to avoid eager loading of imports. r.Sync(pkgbits.SyncObject) - assert(!r.Bool()) + if r.Version().Has(pkgbits.DerivedFuncInstance) { + assert(!r.Bool()) + } r.p.objIdx(r.Reloc(pkgbits.RelocObj)) assert(r.Len() == 0) } @@ -165,7 +168,7 @@ type readerDict struct { // tparams is a slice of the constructed TypeParams for the element. tparams []*types.TypeParam - // devived is a slice of types derived from tparams, which may be + // derived is a slice of types derived from tparams, which may be // instantiated while reading the current element. derived []derivedInfo derivedTypes []types.Type // lazily instantiated from derived @@ -471,7 +474,9 @@ func (r *reader) param() *types.Var { func (r *reader) obj() (types.Object, []types.Type) { r.Sync(pkgbits.SyncObject) - assert(!r.Bool()) + if r.Version().Has(pkgbits.DerivedFuncInstance) { + assert(!r.Bool()) + } pkg, name := r.p.objIdx(r.Reloc(pkgbits.RelocObj)) obj := pkgScope(pkg).Lookup(name) @@ -525,8 +530,12 @@ func (pr *pkgReader) objIdx(idx pkgbits.Index) (*types.Package, string) { case pkgbits.ObjAlias: pos := r.pos() + var tparams []*types.TypeParam + if r.Version().Has(pkgbits.AliasTypeParamNames) { + tparams = r.typeParamNames() + } typ := r.typ() - declare(aliases.NewAlias(r.p.aliases, pos, objPkg, objName, typ)) + declare(aliases.NewAlias(r.p.aliases, pos, objPkg, objName, typ, tparams)) case pkgbits.ObjConst: pos := r.pos() @@ -632,7 +641,10 @@ func (pr *pkgReader) objDictIdx(idx pkgbits.Index) *readerDict { dict.derived = make([]derivedInfo, r.Len()) dict.derivedTypes = make([]types.Type, len(dict.derived)) for i := range dict.derived { - dict.derived[i] = derivedInfo{r.Reloc(pkgbits.RelocType), r.Bool()} + dict.derived[i] = derivedInfo{idx: r.Reloc(pkgbits.RelocType)} + if r.Version().Has(pkgbits.DerivedInfoNeeded) { + assert(!r.Bool()) + } } pr.retireReader(r) diff --git a/vendor/golang.org/x/tools/internal/pkgbits/decoder.go b/vendor/golang.org/x/tools/internal/pkgbits/decoder.go index 2acd85851e362..f6cb37c5c3d1f 100644 --- a/vendor/golang.org/x/tools/internal/pkgbits/decoder.go +++ b/vendor/golang.org/x/tools/internal/pkgbits/decoder.go @@ -21,10 +21,7 @@ import ( // export data. type PkgDecoder struct { // version is the file format version. - version uint32 - - // aliases determines whether types.Aliases should be created - aliases bool + version Version // sync indicates whether the file uses sync markers. sync bool @@ -71,12 +68,9 @@ func (pr *PkgDecoder) SyncMarkers() bool { return pr.sync } // NewPkgDecoder returns a PkgDecoder initialized to read the Unified // IR export data from input. pkgPath is the package path for the // compilation unit that produced the export data. -// -// TODO(mdempsky): Remove pkgPath parameter; unneeded since CL 391014. func NewPkgDecoder(pkgPath, input string) PkgDecoder { pr := PkgDecoder{ pkgPath: pkgPath, - //aliases: aliases.Enabled(), } // TODO(mdempsky): Implement direct indexing of input string to @@ -84,14 +78,15 @@ func NewPkgDecoder(pkgPath, input string) PkgDecoder { r := strings.NewReader(input) - assert(binary.Read(r, binary.LittleEndian, &pr.version) == nil) + var ver uint32 + assert(binary.Read(r, binary.LittleEndian, &ver) == nil) + pr.version = Version(ver) - switch pr.version { - default: - panic(fmt.Errorf("unsupported version: %v", pr.version)) - case 0: - // no flags - case 1: + if pr.version >= numVersions { + panic(fmt.Errorf("cannot decode %q, export data version %d is greater than maximum supported version %d", pkgPath, pr.version, numVersions-1)) + } + + if pr.version.Has(Flags) { var flags uint32 assert(binary.Read(r, binary.LittleEndian, &flags) == nil) pr.sync = flags&flagSyncMarkers != 0 @@ -106,7 +101,9 @@ func NewPkgDecoder(pkgPath, input string) PkgDecoder { assert(err == nil) pr.elemData = input[pos:] - assert(len(pr.elemData)-8 == int(pr.elemEnds[len(pr.elemEnds)-1])) + + const fingerprintSize = 8 + assert(len(pr.elemData)-fingerprintSize == int(pr.elemEnds[len(pr.elemEnds)-1])) return pr } @@ -140,7 +137,7 @@ func (pr *PkgDecoder) AbsIdx(k RelocKind, idx Index) int { absIdx += int(pr.elemEndsEnds[k-1]) } if absIdx >= int(pr.elemEndsEnds[k]) { - errorf("%v:%v is out of bounds; %v", k, idx, pr.elemEndsEnds) + panicf("%v:%v is out of bounds; %v", k, idx, pr.elemEndsEnds) } return absIdx } @@ -197,9 +194,7 @@ func (pr *PkgDecoder) NewDecoderRaw(k RelocKind, idx Index) Decoder { Idx: idx, } - // TODO(mdempsky) r.data.Reset(...) after #44505 is resolved. - r.Data = *strings.NewReader(pr.DataIdx(k, idx)) - + r.Data.Reset(pr.DataIdx(k, idx)) r.Sync(SyncRelocs) r.Relocs = make([]RelocEnt, r.Len()) for i := range r.Relocs { @@ -248,7 +243,7 @@ type Decoder struct { func (r *Decoder) checkErr(err error) { if err != nil { - errorf("unexpected decoding error: %w", err) + panicf("unexpected decoding error: %w", err) } } @@ -519,3 +514,6 @@ func (pr *PkgDecoder) PeekObj(idx Index) (string, string, CodeObj) { return path, name, tag } + +// Version reports the version of the bitstream. +func (w *Decoder) Version() Version { return w.common.version } diff --git a/vendor/golang.org/x/tools/internal/pkgbits/encoder.go b/vendor/golang.org/x/tools/internal/pkgbits/encoder.go index 6482617a4fccd..c17a12399d059 100644 --- a/vendor/golang.org/x/tools/internal/pkgbits/encoder.go +++ b/vendor/golang.org/x/tools/internal/pkgbits/encoder.go @@ -12,18 +12,15 @@ import ( "io" "math/big" "runtime" + "strings" ) -// currentVersion is the current version number. -// -// - v0: initial prototype -// -// - v1: adds the flags uint32 word -const currentVersion uint32 = 1 - // A PkgEncoder provides methods for encoding a package's Unified IR // export data. type PkgEncoder struct { + // version of the bitstream. + version Version + // elems holds the bitstream for previously encoded elements. elems [numRelocs][]string @@ -47,8 +44,9 @@ func (pw *PkgEncoder) SyncMarkers() bool { return pw.syncFrames >= 0 } // export data files, but can help diagnosing desync errors in // higher-level Unified IR reader/writer code. If syncFrames is // negative, then sync markers are omitted entirely. -func NewPkgEncoder(syncFrames int) PkgEncoder { +func NewPkgEncoder(version Version, syncFrames int) PkgEncoder { return PkgEncoder{ + version: version, stringsIdx: make(map[string]Index), syncFrames: syncFrames, } @@ -64,13 +62,15 @@ func (pw *PkgEncoder) DumpTo(out0 io.Writer) (fingerprint [8]byte) { assert(binary.Write(out, binary.LittleEndian, x) == nil) } - writeUint32(currentVersion) + writeUint32(uint32(pw.version)) - var flags uint32 - if pw.SyncMarkers() { - flags |= flagSyncMarkers + if pw.version.Has(Flags) { + var flags uint32 + if pw.SyncMarkers() { + flags |= flagSyncMarkers + } + writeUint32(flags) } - writeUint32(flags) // Write elemEndsEnds. var sum uint32 @@ -159,7 +159,7 @@ type Encoder struct { // Flush finalizes the element's bitstream and returns its Index. func (w *Encoder) Flush() Index { - var sb bytes.Buffer // TODO(mdempsky): strings.Builder after #44505 is resolved + var sb strings.Builder // Backup the data so we write the relocations at the front. var tmp bytes.Buffer @@ -189,7 +189,7 @@ func (w *Encoder) Flush() Index { func (w *Encoder) checkErr(err error) { if err != nil { - errorf("unexpected encoding error: %v", err) + panicf("unexpected encoding error: %v", err) } } @@ -320,8 +320,14 @@ func (w *Encoder) Code(c Code) { // section (if not already present), and then writing a relocation // into the element bitstream. func (w *Encoder) String(s string) { + w.StringRef(w.p.StringIdx(s)) +} + +// StringRef writes a reference to the given index, which must be a +// previously encoded string value. +func (w *Encoder) StringRef(idx Index) { w.Sync(SyncString) - w.Reloc(RelocString, w.p.StringIdx(s)) + w.Reloc(RelocString, idx) } // Strings encodes and writes a variable-length slice of strings into @@ -348,7 +354,7 @@ func (w *Encoder) Value(val constant.Value) { func (w *Encoder) scalar(val constant.Value) { switch v := constant.Val(val).(type) { default: - errorf("unhandled %v (%v)", val, val.Kind()) + panicf("unhandled %v (%v)", val, val.Kind()) case bool: w.Code(ValBool) w.Bool(v) @@ -381,3 +387,6 @@ func (w *Encoder) bigFloat(v *big.Float) { b := v.Append(nil, 'p', -1) w.String(string(b)) // TODO: More efficient encoding. } + +// Version reports the version of the bitstream. +func (w *Encoder) Version() Version { return w.p.version } diff --git a/vendor/golang.org/x/tools/internal/pkgbits/frames_go1.go b/vendor/golang.org/x/tools/internal/pkgbits/frames_go1.go deleted file mode 100644 index 5294f6a63edd7..0000000000000 --- a/vendor/golang.org/x/tools/internal/pkgbits/frames_go1.go +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2021 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build !go1.7 -// +build !go1.7 - -// TODO(mdempsky): Remove after #44505 is resolved - -package pkgbits - -import "runtime" - -func walkFrames(pcs []uintptr, visit frameVisitor) { - for _, pc := range pcs { - fn := runtime.FuncForPC(pc) - file, line := fn.FileLine(pc) - - visit(file, line, fn.Name(), pc-fn.Entry()) - } -} diff --git a/vendor/golang.org/x/tools/internal/pkgbits/frames_go17.go b/vendor/golang.org/x/tools/internal/pkgbits/frames_go17.go deleted file mode 100644 index 2324ae7adfe20..0000000000000 --- a/vendor/golang.org/x/tools/internal/pkgbits/frames_go17.go +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright 2021 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build go1.7 -// +build go1.7 - -package pkgbits - -import "runtime" - -// walkFrames calls visit for each call frame represented by pcs. -// -// pcs should be a slice of PCs, as returned by runtime.Callers. -func walkFrames(pcs []uintptr, visit frameVisitor) { - if len(pcs) == 0 { - return - } - - frames := runtime.CallersFrames(pcs) - for { - frame, more := frames.Next() - visit(frame.File, frame.Line, frame.Function, frame.PC-frame.Entry) - if !more { - return - } - } -} diff --git a/vendor/golang.org/x/tools/internal/pkgbits/support.go b/vendor/golang.org/x/tools/internal/pkgbits/support.go index ad26d3b28cae1..50534a2955363 100644 --- a/vendor/golang.org/x/tools/internal/pkgbits/support.go +++ b/vendor/golang.org/x/tools/internal/pkgbits/support.go @@ -12,6 +12,6 @@ func assert(b bool) { } } -func errorf(format string, args ...interface{}) { +func panicf(format string, args ...any) { panic(fmt.Errorf(format, args...)) } diff --git a/vendor/golang.org/x/tools/internal/pkgbits/sync.go b/vendor/golang.org/x/tools/internal/pkgbits/sync.go index 5bd51ef717007..1520b73afb9e9 100644 --- a/vendor/golang.org/x/tools/internal/pkgbits/sync.go +++ b/vendor/golang.org/x/tools/internal/pkgbits/sync.go @@ -6,6 +6,7 @@ package pkgbits import ( "fmt" + "runtime" "strings" ) @@ -23,6 +24,24 @@ func fmtFrames(pcs ...uintptr) []string { type frameVisitor func(file string, line int, name string, offset uintptr) +// walkFrames calls visit for each call frame represented by pcs. +// +// pcs should be a slice of PCs, as returned by runtime.Callers. +func walkFrames(pcs []uintptr, visit frameVisitor) { + if len(pcs) == 0 { + return + } + + frames := runtime.CallersFrames(pcs) + for { + frame, more := frames.Next() + visit(frame.File, frame.Line, frame.Function, frame.PC-frame.Entry) + if !more { + return + } + } +} + // SyncMarker is an enum type that represents markers that may be // written to export data to ensure the reader and writer stay // synchronized. @@ -110,4 +129,8 @@ const ( SyncStmtsEnd SyncLabel SyncOptLabel + + SyncMultiExpr + SyncRType + SyncConvRTTI ) diff --git a/vendor/golang.org/x/tools/internal/pkgbits/syncmarker_string.go b/vendor/golang.org/x/tools/internal/pkgbits/syncmarker_string.go index 4a5b0ca5f2ffc..582ad56d3e09f 100644 --- a/vendor/golang.org/x/tools/internal/pkgbits/syncmarker_string.go +++ b/vendor/golang.org/x/tools/internal/pkgbits/syncmarker_string.go @@ -74,11 +74,14 @@ func _() { _ = x[SyncStmtsEnd-64] _ = x[SyncLabel-65] _ = x[SyncOptLabel-66] + _ = x[SyncMultiExpr-67] + _ = x[SyncRType-68] + _ = x[SyncConvRTTI-69] } -const _SyncMarker_name = "EOFBoolInt64Uint64StringValueValRelocsRelocUseRelocPublicPosPosBaseObjectObject1PkgPkgDefMethodTypeTypeIdxTypeParamNamesSignatureParamsParamCodeObjSymLocalIdentSelectorPrivateFuncExtVarExtTypeExtPragmaExprListExprsExprExprTypeAssignOpFuncLitCompLitDeclFuncBodyOpenScopeCloseScopeCloseAnotherScopeDeclNamesDeclNameStmtsBlockStmtIfStmtForStmtSwitchStmtRangeStmtCaseClauseCommClauseSelectStmtDeclsLabeledStmtUseObjLocalAddLocalLinknameStmt1StmtsEndLabelOptLabel" +const _SyncMarker_name = "EOFBoolInt64Uint64StringValueValRelocsRelocUseRelocPublicPosPosBaseObjectObject1PkgPkgDefMethodTypeTypeIdxTypeParamNamesSignatureParamsParamCodeObjSymLocalIdentSelectorPrivateFuncExtVarExtTypeExtPragmaExprListExprsExprExprTypeAssignOpFuncLitCompLitDeclFuncBodyOpenScopeCloseScopeCloseAnotherScopeDeclNamesDeclNameStmtsBlockStmtIfStmtForStmtSwitchStmtRangeStmtCaseClauseCommClauseSelectStmtDeclsLabeledStmtUseObjLocalAddLocalLinknameStmt1StmtsEndLabelOptLabelMultiExprRTypeConvRTTI" -var _SyncMarker_index = [...]uint16{0, 3, 7, 12, 18, 24, 29, 32, 38, 43, 51, 57, 60, 67, 73, 80, 83, 89, 95, 99, 106, 120, 129, 135, 140, 147, 150, 160, 168, 175, 182, 188, 195, 201, 209, 214, 218, 226, 232, 234, 241, 248, 252, 260, 269, 279, 296, 305, 313, 318, 327, 333, 340, 350, 359, 369, 379, 389, 394, 405, 416, 424, 432, 437, 445, 450, 458} +var _SyncMarker_index = [...]uint16{0, 3, 7, 12, 18, 24, 29, 32, 38, 43, 51, 57, 60, 67, 73, 80, 83, 89, 95, 99, 106, 120, 129, 135, 140, 147, 150, 160, 168, 175, 182, 188, 195, 201, 209, 214, 218, 226, 232, 234, 241, 248, 252, 260, 269, 279, 296, 305, 313, 318, 327, 333, 340, 350, 359, 369, 379, 389, 394, 405, 416, 424, 432, 437, 445, 450, 458, 467, 472, 480} func (i SyncMarker) String() string { i -= 1 diff --git a/vendor/golang.org/x/tools/internal/pkgbits/version.go b/vendor/golang.org/x/tools/internal/pkgbits/version.go new file mode 100644 index 0000000000000..53af9df22b38a --- /dev/null +++ b/vendor/golang.org/x/tools/internal/pkgbits/version.go @@ -0,0 +1,85 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package pkgbits + +// Version indicates a version of a unified IR bitstream. +// Each Version indicates the addition, removal, or change of +// new data in the bitstream. +// +// These are serialized to disk and the interpretation remains fixed. +type Version uint32 + +const ( + // V0: initial prototype. + // + // All data that is not assigned a Field is in version V0 + // and has not been deprecated. + V0 Version = iota + + // V1: adds the Flags uint32 word + V1 + + // V2: removes unused legacy fields and supports type parameters for aliases. + // - remove the legacy "has init" bool from the public root + // - remove obj's "derived func instance" bool + // - add a TypeParamNames field to ObjAlias + // - remove derived info "needed" bool + V2 + + numVersions = iota +) + +// Field denotes a unit of data in the serialized unified IR bitstream. +// It is conceptually a like field in a structure. +// +// We only really need Fields when the data may or may not be present +// in a stream based on the Version of the bitstream. +// +// Unlike much of pkgbits, Fields are not serialized and +// can change values as needed. +type Field int + +const ( + // Flags in a uint32 in the header of a bitstream + // that is used to indicate whether optional features are enabled. + Flags Field = iota + + // Deprecated: HasInit was a bool indicating whether a package + // has any init functions. + HasInit + + // Deprecated: DerivedFuncInstance was a bool indicating + // whether an object was a function instance. + DerivedFuncInstance + + // ObjAlias has a list of TypeParamNames. + AliasTypeParamNames + + // Deprecated: DerivedInfoNeeded was a bool indicating + // whether a type was a derived type. + DerivedInfoNeeded + + numFields = iota +) + +// introduced is the version a field was added. +var introduced = [numFields]Version{ + Flags: V1, + AliasTypeParamNames: V2, +} + +// removed is the version a field was removed in or 0 for fields +// that have not yet been deprecated. +// (So removed[f]-1 is the last version it is included in.) +var removed = [numFields]Version{ + HasInit: V2, + DerivedFuncInstance: V2, + DerivedInfoNeeded: V2, +} + +// Has reports whether field f is present in a bitstream at version v. +func (v Version) Has(f Field) bool { + return introduced[f] <= v && (v < removed[f] || removed[f] == V0) +} diff --git a/vendor/golang.org/x/tools/internal/stdlib/manifest.go b/vendor/golang.org/x/tools/internal/stdlib/manifest.go index a928acf29facf..cdaac9ab34df2 100644 --- a/vendor/golang.org/x/tools/internal/stdlib/manifest.go +++ b/vendor/golang.org/x/tools/internal/stdlib/manifest.go @@ -951,7 +951,7 @@ var PackageSymbols = map[string][]Symbol{ {"ParseSessionState", Func, 21}, {"QUICClient", Func, 21}, {"QUICConfig", Type, 21}, - {"QUICConfig.EnableStoreSessionEvent", Field, 23}, + {"QUICConfig.EnableSessionEvents", Field, 23}, {"QUICConfig.TLSConfig", Field, 21}, {"QUICConn", Type, 21}, {"QUICEncryptionLevel", Type, 21}, diff --git a/vendor/golang.org/x/tools/internal/typesinternal/errorcode.go b/vendor/golang.org/x/tools/internal/typesinternal/errorcode.go index 834e05381ceaa..131caab2847ec 100644 --- a/vendor/golang.org/x/tools/internal/typesinternal/errorcode.go +++ b/vendor/golang.org/x/tools/internal/typesinternal/errorcode.go @@ -838,7 +838,7 @@ const ( // InvalidCap occurs when an argument to the cap built-in function is not of // supported type. // - // See https://golang.org/ref/spec#Lengthand_capacity for information on + // See https://golang.org/ref/spec#Length_and_capacity for information on // which underlying types are supported as arguments to cap and len. // // Example: @@ -859,7 +859,7 @@ const ( // InvalidCopy occurs when the arguments are not of slice type or do not // have compatible type. // - // See https://golang.org/ref/spec#Appendingand_copying_slices for more + // See https://golang.org/ref/spec#Appending_and_copying_slices for more // information on the type requirements for the copy built-in. // // Example: @@ -897,7 +897,7 @@ const ( // InvalidLen occurs when an argument to the len built-in function is not of // supported type. // - // See https://golang.org/ref/spec#Lengthand_capacity for information on + // See https://golang.org/ref/spec#Length_and_capacity for information on // which underlying types are supported as arguments to cap and len. // // Example: @@ -914,7 +914,7 @@ const ( // InvalidMake occurs when make is called with an unsupported type argument. // - // See https://golang.org/ref/spec#Makingslices_maps_and_channels for + // See https://golang.org/ref/spec#Making_slices_maps_and_channels for // information on the types that may be created using make. // // Example: diff --git a/vendor/golang.org/x/tools/internal/versions/constraint.go b/vendor/golang.org/x/tools/internal/versions/constraint.go new file mode 100644 index 0000000000000..179063d484860 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/versions/constraint.go @@ -0,0 +1,13 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package versions + +import "go/build/constraint" + +// ConstraintGoVersion is constraint.GoVersion (if built with go1.21+). +// Otherwise nil. +// +// Deprecate once x/tools is after go1.21. +var ConstraintGoVersion func(x constraint.Expr) string diff --git a/vendor/golang.org/x/tools/internal/versions/constraint_go121.go b/vendor/golang.org/x/tools/internal/versions/constraint_go121.go new file mode 100644 index 0000000000000..38011407d5f06 --- /dev/null +++ b/vendor/golang.org/x/tools/internal/versions/constraint_go121.go @@ -0,0 +1,14 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.21 +// +build go1.21 + +package versions + +import "go/build/constraint" + +func init() { + ConstraintGoVersion = constraint.GoVersion +} diff --git a/vendor/modules.txt b/vendor/modules.txt index af829edd451b1..f869e0047e110 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1035,7 +1035,7 @@ github.com/grafana/gomemcache/memcache ## explicit; go 1.13 github.com/grafana/jsonparser # github.com/grafana/loki/pkg/push v0.0.0-20231124142027-e52380921608 => ./pkg/push -## explicit; go 1.19 +## explicit; go 1.22.0 github.com/grafana/loki/pkg/push # github.com/grafana/pyroscope-go/godeltaprof v0.1.8 ## explicit; go 1.18 @@ -1807,7 +1807,7 @@ go.uber.org/zap/zapgrpc # go4.org/netipx v0.0.0-20230125063823-8449b0a6169f ## explicit; go 1.18 go4.org/netipx -# golang.org/x/crypto v0.26.0 +# golang.org/x/crypto v0.27.0 ## explicit; go 1.20 golang.org/x/crypto/argon2 golang.org/x/crypto/bcrypt @@ -1826,15 +1826,15 @@ golang.org/x/crypto/pkcs12 golang.org/x/crypto/pkcs12/internal/rc2 golang.org/x/crypto/scrypt golang.org/x/crypto/sha3 -# golang.org/x/exp v0.0.0-20240325151524-a685a6edb6d8 -## explicit; go 1.20 +# golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 +## explicit; go 1.22.0 golang.org/x/exp/constraints golang.org/x/exp/maps golang.org/x/exp/slices -# golang.org/x/mod v0.19.0 -## explicit; go 1.18 +# golang.org/x/mod v0.21.0 +## explicit; go 1.22.0 golang.org/x/mod/semver -# golang.org/x/net v0.28.0 +# golang.org/x/net v0.29.0 ## explicit; go 1.18 golang.org/x/net/bpf golang.org/x/net/context @@ -1874,7 +1874,7 @@ golang.org/x/oauth2/jwt golang.org/x/sync/errgroup golang.org/x/sync/semaphore golang.org/x/sync/singleflight -# golang.org/x/sys v0.24.0 +# golang.org/x/sys v0.25.0 ## explicit; go 1.18 golang.org/x/sys/cpu golang.org/x/sys/plan9 @@ -1882,10 +1882,10 @@ golang.org/x/sys/unix golang.org/x/sys/windows golang.org/x/sys/windows/registry golang.org/x/sys/windows/svc/eventlog -# golang.org/x/term v0.23.0 +# golang.org/x/term v0.24.0 ## explicit; go 1.18 golang.org/x/term -# golang.org/x/text v0.17.0 +# golang.org/x/text v0.18.0 ## explicit; go 1.18 golang.org/x/text/cases golang.org/x/text/encoding @@ -1912,8 +1912,8 @@ golang.org/x/text/unicode/norm # golang.org/x/time v0.6.0 ## explicit; go 1.18 golang.org/x/time/rate -# golang.org/x/tools v0.23.0 -## explicit; go 1.19 +# golang.org/x/tools v0.25.0 +## explicit; go 1.22.0 golang.org/x/tools/go/gcexportdata golang.org/x/tools/go/packages golang.org/x/tools/go/types/objectpath