diff --git a/src/benchmark.h b/src/benchmark.h
new file mode 100644
index 000000000..bd5df6f19
--- /dev/null
+++ b/src/benchmark.h
@@ -0,0 +1,30 @@
+#ifndef TINC_BENCHMARK_H
+#define TINC_BENCHMARK_H
+
+#include "system.h"
+
+static struct timespec start;
+static struct timespec end;
+static double elapsed;
+static double rate;
+static unsigned int count;
+
+static void clock_start(void) {
+    count = 0;
+    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
+}
+
+static bool clock_countto(double seconds) {
+    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
+    elapsed = (double) end.tv_sec + (double) end.tv_nsec * 1e-9
+              - (double) start.tv_sec - (double) start.tv_nsec * 1e-9;
+
+    if(elapsed < seconds) {
+        return ++count;
+    }
+
+    rate = count / elapsed;
+    return false;
+}
+
+#endif // TINC_BENCHMARK_H
diff --git a/src/chacha-poly1305/bench_chacha.c b/src/chacha-poly1305/bench_chacha.c
new file mode 100644
index 000000000..a5ee96be3
--- /dev/null
+++ b/src/chacha-poly1305/bench_chacha.c
@@ -0,0 +1,61 @@
+#include "../system.h"
+
+#include "../benchmark.h"
+#include "../random.h"
+#include "../crypto.h"
+#include "../xalloc.h"
+#include "chacha.h"
+
+#define BUFFER_SIZE (1024 * 1024)
+
+static FILE *dev_null;
+
+static void benchmark(chacha_ctx *ctx, const uint8_t *plaintext, uint8_t *ciphertext, size_t len) {
+    for(clock_start(); clock_countto(5);) {
+        chacha_encrypt_bytes(ctx, plaintext, ciphertext, len);
+    }
+
+    // Prevent the compiler from optimizing out encryption
+    fwrite(ciphertext, len, 1, dev_null);
+    fprintf(stderr, "%8zu: %14.2lf op/s\n", len, rate);
+}
+
+const size_t block_sizes[] = {
+    32,
+    256,
+    512,
+    1024,
+    16 * 1024,
+    128 * 1024,
+    BUFFER_SIZE,
+};
+
+int main(void) {
+    dev_null = fopen("/dev/null", "w");
+    random_init();
+    chacha_resolve_functions();
+
+    uint8_t key[256 / 8];
+    uint8_t iv[8];
+    randomize(key, sizeof(key));
+    randomize(iv, sizeof(iv));
+
+    chacha_ctx ctx;
+    chacha_keysetup(&ctx, key, 256);
+    chacha_ivsetup(&ctx, iv, NULL);
+
+    uint8_t *plaintext = xmalloc(BUFFER_SIZE);
+    uint8_t *ciphertext = xmalloc(BUFFER_SIZE);
+    randomize(plaintext, BUFFER_SIZE);
+
+    for(size_t i = 0; i < sizeof(block_sizes) / sizeof(*block_sizes); ++i) {
+        benchmark(&ctx, plaintext, ciphertext, block_sizes[i]);
+    }
+
+    free(ciphertext);
+    free(plaintext);
+    random_exit();
+    fclose(dev_null);
+
+    return 0;
+}
diff --git a/src/chacha-poly1305/chacha.c b/src/chacha-poly1305/chacha.c
index 696f44a52..22165caad 100644
--- a/src/chacha-poly1305/chacha.c
+++ b/src/chacha-poly1305/chacha.c
@@ -7,8 +7,7 @@ Public domain.
#include "../system.h" #include "chacha.h" - -typedef struct chacha_ctx chacha_ctx; +#include "../cpu.h" #define U8C(v) (v##U) #define U32C(v) (v##U) @@ -79,8 +78,7 @@ void chacha_ivsetup(chacha_ctx *x, const uint8_t *iv, const uint8_t *counter) { x->input[15] = U8TO32_LITTLE(iv + 4); } -void -chacha_encrypt_bytes(chacha_ctx *x, const uint8_t *m, uint8_t *c, uint32_t bytes) { +static void chacha_encrypt_bytes_generic(chacha_ctx *x, const uint8_t *m, uint8_t *c, uint32_t bytes) { uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; uint8_t *ctarget = NULL; @@ -222,3 +220,31 @@ chacha_encrypt_bytes(chacha_ctx *x, const uint8_t *m, uint8_t *c, uint32_t bytes m += 64; } } + +static chacha_encrypt_bytes_t *chacha_encrypt_bytes_impl; + +void chacha_encrypt_bytes(struct chacha_ctx *x, const uint8_t *m, uint8_t *c, uint32_t bytes) { + chacha_encrypt_bytes_impl(x, m, c, bytes); +} + +void chacha_resolve_functions(void) { + cpu_detect_features(); + +#ifdef HAVE_CPU_AVX2 + + if(cpu_supports(CPU_AVX2)) { + chacha_encrypt_bytes_impl = chacha_encrypt_bytes_avx2; + return; + } + +#endif +#ifdef HAVE_CPU_SSSE3 + + if(cpu_supports(CPU_SSSE3)) { + chacha_encrypt_bytes_impl = chacha_encrypt_bytes_ssse3; + return; + } + +#endif + chacha_encrypt_bytes_impl = chacha_encrypt_bytes_generic; +} diff --git a/src/chacha-poly1305/chacha.h b/src/chacha-poly1305/chacha.h index 103c3d812..7678538ad 100644 --- a/src/chacha-poly1305/chacha.h +++ b/src/chacha-poly1305/chacha.h @@ -7,9 +7,11 @@ Public domain. #ifndef CHACHA_H #define CHACHA_H -struct chacha_ctx { +typedef struct chacha_ctx { uint32_t input[16]; -}; +} chacha_ctx; + +#define ROUNDS 20 #define CHACHA_MINKEYLEN 16 #define CHACHA_NONCELEN 8 @@ -17,6 +19,16 @@ struct chacha_ctx { #define CHACHA_STATELEN (CHACHA_NONCELEN+CHACHA_CTRLEN) #define CHACHA_BLOCKLEN 64 +typedef void (chacha_encrypt_bytes_t)(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, uint32_t bytes); + +#ifdef HAVE_CPU_AVX2 +extern chacha_encrypt_bytes_t chacha_encrypt_bytes_avx2; +#endif +#ifdef HAVE_CPU_SSSE3 +extern chacha_encrypt_bytes_t chacha_encrypt_bytes_ssse3; +#endif + +void chacha_resolve_functions(void); void chacha_keysetup(struct chacha_ctx *x, const uint8_t *k, uint32_t kbits); void chacha_ivsetup(struct chacha_ctx *x, const uint8_t *iv, const uint8_t *ctr); void chacha_encrypt_bytes(struct chacha_ctx *x, const uint8_t *m, uint8_t *c, uint32_t bytes); diff --git a/src/chacha-poly1305/chacha_avx2.c b/src/chacha-poly1305/chacha_avx2.c new file mode 100644 index 000000000..2eb2530d7 --- /dev/null +++ b/src/chacha-poly1305/chacha_avx2.c @@ -0,0 +1,27 @@ +#include "../system.h" + +#include "chacha.h" +#include "../xalloc.h" + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("sse2,ssse3,sse4.1,avx2"))), apply_to=function) +#elif defined(__GNUC__) +# pragma GCC target("sse2", "ssse3", "sse4.1", "avx2") +#endif + +#include + +void chacha_encrypt_bytes_avx2(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, uint32_t bytes) { + uint32_t *x = &ctx->input[0]; + + if(!bytes) { + return; + } + +#include "chacha_avx2.h" +#include "chacha_ssse3.h" +} + +#ifdef __clang__ +# pragma clang attribute pop +#endif diff --git a/src/chacha-poly1305/chacha_avx2.h b/src/chacha-poly1305/chacha_avx2.h new file mode 100644 index 000000000..ef643e8c4 --- /dev/null +++ b/src/chacha-poly1305/chacha_avx2.h @@ -0,0 +1,329 @@ +// Copyright (C) 2014-2017 D. J. 
Bernstein, Romain Dolbeau, Frank Denis +// Public domain + +#define VEC8_ROT(A, IMM) \ + _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM))) + +/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 & + * 16) (better) */ +#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = _mm256_shuffle_epi8(t_##A, rot16); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 12); \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = _mm256_shuffle_epi8(t_##A, rot8); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 7) + +#define VEC8_QUARTERROUND(A, B, C, D) VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) + +#define VEC8_LINE1(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16) +#define VEC8_LINE2(A, B, C, D) \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12) +#define VEC8_LINE3(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8) +#define VEC8_LINE4(A, B, C, D) \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7) + +#define VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, \ + C4, D4) \ +VEC8_LINE1(A1, B1, C1, D1); \ +VEC8_LINE1(A2, B2, C2, D2); \ +VEC8_LINE1(A3, B3, C3, D3); \ +VEC8_LINE1(A4, B4, C4, D4); \ +VEC8_LINE2(A1, B1, C1, D1); \ +VEC8_LINE2(A2, B2, C2, D2); \ +VEC8_LINE2(A3, B3, C3, D3); \ +VEC8_LINE2(A4, B4, C4, D4); \ +VEC8_LINE3(A1, B1, C1, D1); \ +VEC8_LINE3(A2, B2, C2, D2); \ +VEC8_LINE3(A3, B3, C3, D3); \ +VEC8_LINE3(A4, B4, C4, D4); \ +VEC8_LINE4(A1, B1, C1, D1); \ +VEC8_LINE4(A2, B2, C2, D2); \ +VEC8_LINE4(A3, B3, C3, D3); \ +VEC8_LINE4(A4, B4, C4, D4) + +#define VEC8_ROUND_HALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, \ + B4, C4, D4) \ +VEC8_LINE1(A1, B1, C1, D1); \ +VEC8_LINE1(A2, B2, C2, D2); \ +VEC8_LINE2(A1, B1, C1, D1); \ +VEC8_LINE2(A2, B2, C2, D2); \ +VEC8_LINE3(A1, B1, C1, D1); \ +VEC8_LINE3(A2, B2, C2, D2); \ +VEC8_LINE4(A1, B1, C1, D1); \ +VEC8_LINE4(A2, B2, C2, D2); \ +VEC8_LINE1(A3, B3, C3, D3); \ +VEC8_LINE1(A4, B4, C4, D4); \ +VEC8_LINE2(A3, B3, C3, D3); \ +VEC8_LINE2(A4, B4, C4, D4); \ +VEC8_LINE3(A3, B3, C3, D3); \ +VEC8_LINE3(A4, B4, C4, D4); \ +VEC8_LINE4(A3, B3, C3, D3); \ +VEC8_LINE4(A4, B4, C4, D4) + +#define VEC8_ROUND_HALFANDHALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, \ + A4, B4, C4, D4) \ +VEC8_LINE1(A1, B1, C1, D1); \ +VEC8_LINE1(A2, B2, C2, D2); \ +VEC8_LINE2(A1, B1, C1, D1); \ +VEC8_LINE2(A2, B2, C2, D2); \ +VEC8_LINE1(A3, B3, C3, D3); \ +VEC8_LINE1(A4, B4, C4, D4); \ +VEC8_LINE2(A3, B3, C3, D3); \ +VEC8_LINE2(A4, B4, C4, D4); \ +VEC8_LINE3(A1, B1, C1, D1); \ +VEC8_LINE3(A2, B2, C2, D2); \ +VEC8_LINE4(A1, B1, C1, D1); \ +VEC8_LINE4(A2, B2, C2, D2); \ +VEC8_LINE3(A3, B3, C3, D3); \ +VEC8_LINE3(A4, B4, C4, D4); \ +VEC8_LINE4(A3, B3, C3, D3); \ +VEC8_LINE4(A4, B4, C4, D4) + +#define VEC8_ROUND(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \ + D4) \ +VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \ + D4) + +if(bytes >= 512) { + /* constant for shuffling bytes (replacing multiple-of-8 rotates) */ + __m256i rot16 = + _mm256_set_epi8(13, 12, 15, 
14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, + 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + __m256i rot8 = + _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, + 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + uint32_t in12, in13; + + /* the naive way seems as fast (if not a bit faster) than the vector way */ + __m256i x_0 = _mm256_set1_epi32(x[0]); + __m256i x_1 = _mm256_set1_epi32(x[1]); + __m256i x_2 = _mm256_set1_epi32(x[2]); + __m256i x_3 = _mm256_set1_epi32(x[3]); + __m256i x_4 = _mm256_set1_epi32(x[4]); + __m256i x_5 = _mm256_set1_epi32(x[5]); + __m256i x_6 = _mm256_set1_epi32(x[6]); + __m256i x_7 = _mm256_set1_epi32(x[7]); + __m256i x_8 = _mm256_set1_epi32(x[8]); + __m256i x_9 = _mm256_set1_epi32(x[9]); + __m256i x_10 = _mm256_set1_epi32(x[10]); + __m256i x_11 = _mm256_set1_epi32(x[11]); + __m256i x_12; + __m256i x_13; + __m256i x_14 = _mm256_set1_epi32(x[14]); + __m256i x_15 = _mm256_set1_epi32(x[15]); + + __m256i orig0 = x_0; + __m256i orig1 = x_1; + __m256i orig2 = x_2; + __m256i orig3 = x_3; + __m256i orig4 = x_4; + __m256i orig5 = x_5; + __m256i orig6 = x_6; + __m256i orig7 = x_7; + __m256i orig8 = x_8; + __m256i orig9 = x_9; + __m256i orig10 = x_10; + __m256i orig11 = x_11; + __m256i orig12; + __m256i orig13; + __m256i orig14 = x_14; + __m256i orig15 = x_15; + __m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, + t_13, t_14, t_15; + + while(bytes >= 512) { + const __m256i addv12 = _mm256_set_epi64x(3, 2, 1, 0); + const __m256i addv13 = _mm256_set_epi64x(7, 6, 5, 4); + const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + __m256i t12, t13; + + uint64_t in1213; + int i; + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + + in12 = x[12]; + in13 = x[13]; + in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32); + x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213)); + + t12 = _mm256_add_epi64(addv12, x_12); + t13 = _mm256_add_epi64(addv13, x_13); + + x_12 = _mm256_unpacklo_epi32(t12, t13); + x_13 = _mm256_unpackhi_epi32(t12, t13); + + t12 = _mm256_unpacklo_epi32(x_12, x_13); + t13 = _mm256_unpackhi_epi32(x_12, x_13); + + /* required because unpack* are intra-lane */ + x_12 = _mm256_permutevar8x32_epi32(t12, permute); + x_13 = _mm256_permutevar8x32_epi32(t13, permute); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 8; + + x[12] = in1213 & 0xFFFFFFFF; + x[13] = (in1213 >> 32) & 0xFFFFFFFF; + + for(i = 0; i < ROUNDS; i += 2) { + VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14); + } + +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + { \ + __m128i t0, t1, t2, t3; \ + x_##A = _mm256_add_epi32(x_##A, orig##A); \ + x_##B = _mm256_add_epi32(x_##B, orig##B); \ + x_##C = _mm256_add_epi32(x_##C, orig##C); \ + x_##D = _mm256_add_epi32(x_##D, orig##D); \ + t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \ + t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \ + t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \ + t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \ + x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \ + x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \ + x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \ + x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0), \ + _mm_loadu_si128((const __m128i*) (m + 0))); \ + 
_mm_storeu_si128((__m128i*) (c + 0), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0), \ + _mm_loadu_si128((const __m128i*) (m + 64))); \ + _mm_storeu_si128((__m128i*) (c + 64), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0), \ + _mm_loadu_si128((const __m128i*) (m + 128))); \ + _mm_storeu_si128((__m128i*) (c + 128), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0), \ + _mm_loadu_si128((const __m128i*) (m + 192))); \ + _mm_storeu_si128((__m128i*) (c + 192), t3); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1), \ + _mm_loadu_si128((const __m128i*) (m + 256))); \ + _mm_storeu_si128((__m128i*) (c + 256), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1), \ + _mm_loadu_si128((const __m128i*) (m + 320))); \ + _mm_storeu_si128((__m128i*) (c + 320), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1), \ + _mm_loadu_si128((const __m128i*) (m + 384))); \ + _mm_storeu_si128((__m128i*) (c + 384), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1), \ + _mm_loadu_si128((const __m128i*) (m + 448))); \ + _mm_storeu_si128((__m128i*) (c + 448), t3); \ + } + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + +#define ONEQUAD_UNPCK(A, B, C, D) \ + { \ + x_##A = _mm256_add_epi32(x_##A, orig##A); \ + x_##B = _mm256_add_epi32(x_##B, orig##B); \ + x_##C = _mm256_add_epi32(x_##C, orig##C); \ + x_##D = _mm256_add_epi32(x_##D, orig##D); \ + t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \ + t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \ + t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \ + t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \ + x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \ + x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \ + x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \ + x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \ + } + +#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \ + { \ + ONEQUAD_UNPCK(A, B, C, D); \ + ONEQUAD_UNPCK(A2, B2, C2, D2); \ + t_##A = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20); \ + t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31); \ + t_##B = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20); \ + t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31); \ + t_##C = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20); \ + t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31); \ + t_##D = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20); \ + t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31); \ + t_##A = _mm256_xor_si256( \ + t_##A, _mm256_loadu_si256((const __m256i*) (m + 0))); \ + t_##B = _mm256_xor_si256( \ + t_##B, _mm256_loadu_si256((const __m256i*) (m + 64))); \ + t_##C = _mm256_xor_si256( \ + t_##C, _mm256_loadu_si256((const __m256i*) (m + 128))); \ + t_##D = _mm256_xor_si256( \ + t_##D, _mm256_loadu_si256((const __m256i*) (m + 192))); \ + t_##A2 = _mm256_xor_si256( \ + t_##A2, _mm256_loadu_si256((const __m256i*) (m + 256))); \ + t_##B2 = _mm256_xor_si256( \ + t_##B2, _mm256_loadu_si256((const __m256i*) (m + 320))); \ + t_##C2 = _mm256_xor_si256( \ + t_##C2, _mm256_loadu_si256((const __m256i*) (m + 384))); \ + t_##D2 = _mm256_xor_si256( \ + t_##D2, _mm256_loadu_si256((const __m256i*) (m + 448))); \ + _mm256_storeu_si256((__m256i*) (c + 0), t_##A); \ + _mm256_storeu_si256((__m256i*) (c + 64), t_##B); \ + _mm256_storeu_si256((__m256i*) (c + 128), t_##C); \ + _mm256_storeu_si256((__m256i*) (c + 192), t_##D); \ + _mm256_storeu_si256((__m256i*) (c + 256), t_##A2); \ + _mm256_storeu_si256((__m256i*) (c + 320), t_##B2); \ + _mm256_storeu_si256((__m256i*) (c + 384), t_##C2); \ + 
_mm256_storeu_si256((__m256i*) (c + 448), t_##D2); \
+    }
+
+        ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
+        m += 32;
+        c += 32;
+        ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
+        m -= 32;
+        c -= 32;
+
+#undef ONEQUAD
+#undef ONEQUAD_TRANSPOSE
+#undef ONEQUAD_UNPCK
+#undef ONEOCTO
+
+        bytes -= 512;
+        c += 512;
+        m += 512;
+    }
+}
+
+#undef VEC8_ROT
+#undef VEC8_QUARTERROUND
+#undef VEC8_QUARTERROUND_NAIVE
+#undef VEC8_QUARTERROUND_SHUFFLE
+#undef VEC8_QUARTERROUND_SHUFFLE2
+#undef VEC8_LINE1
+#undef VEC8_LINE2
+#undef VEC8_LINE3
+#undef VEC8_LINE4
+#undef VEC8_ROUND
+#undef VEC8_ROUND_SEQ
+#undef VEC8_ROUND_HALF
+#undef VEC8_ROUND_HALFANDHALF
diff --git a/src/chacha-poly1305/chacha_ssse3.c b/src/chacha-poly1305/chacha_ssse3.c
new file mode 100644
index 000000000..be52884c3
--- /dev/null
+++ b/src/chacha-poly1305/chacha_ssse3.c
@@ -0,0 +1,26 @@
+#include "../system.h"
+
+#include "chacha.h"
+#include "../xalloc.h"
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("sse2,ssse3"))), apply_to=function)
+#elif defined(__GNUC__)
+# pragma GCC target("sse2", "ssse3")
+#endif
+
+#include <immintrin.h>
+
+void chacha_encrypt_bytes_ssse3(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, uint32_t bytes) {
+    uint32_t *x = &ctx->input[0];
+
+    if(!bytes) {
+        return;
+    }
+
+#include "chacha_ssse3.h"
+}
+
+#ifdef __clang__
+# pragma clang attribute pop
+#endif
diff --git a/src/chacha-poly1305/chacha_ssse3.h b/src/chacha-poly1305/chacha_ssse3.h
new file mode 100644
index 000000000..a942837fc
--- /dev/null
+++ b/src/chacha-poly1305/chacha_ssse3.h
@@ -0,0 +1,370 @@
+// Copyright (C) 2014-2017 D. J. Bernstein, Romain Dolbeau, Frank Denis
+// Public domain
+
+#define VEC4_ROT(A, IMM) \
+    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
+
+/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 &
+ * 16) (better) */
+#define VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) \
+    x_##A = _mm_add_epi32(x_##A, x_##B); \
+    t_##A = _mm_xor_si128(x_##D, x_##A); \
+    x_##D = _mm_shuffle_epi8(t_##A, rot16); \
+    x_##C = _mm_add_epi32(x_##C, x_##D); \
+    t_##C = _mm_xor_si128(x_##B, x_##C); \
+    x_##B = VEC4_ROT(t_##C, 12); \
+    x_##A = _mm_add_epi32(x_##A, x_##B); \
+    t_##A = _mm_xor_si128(x_##D, x_##A); \
+    x_##D = _mm_shuffle_epi8(t_##A, rot8); \
+    x_##C = _mm_add_epi32(x_##C, x_##D); \
+    t_##C = _mm_xor_si128(x_##B, x_##C); \
+    x_##B = VEC4_ROT(t_##C, 7)
+
+#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)
+
+if(bytes >= 256) {
+    /* constant for shuffling bytes (replacing multiple-of-8 rotates) */
+    __m128i rot16 =
+        _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
+    __m128i rot8 =
+        _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
+
+    __m128i x_0 = _mm_set1_epi32(x[0]);
+    __m128i x_1 = _mm_set1_epi32(x[1]);
+    __m128i x_2 = _mm_set1_epi32(x[2]);
+    __m128i x_3 = _mm_set1_epi32(x[3]);
+    __m128i x_4 = _mm_set1_epi32(x[4]);
+    __m128i x_5 = _mm_set1_epi32(x[5]);
+    __m128i x_6 = _mm_set1_epi32(x[6]);
+    __m128i x_7 = _mm_set1_epi32(x[7]);
+    __m128i x_8 = _mm_set1_epi32(x[8]);
+    __m128i x_9 = _mm_set1_epi32(x[9]);
+    __m128i x_10 = _mm_set1_epi32(x[10]);
+    __m128i x_11 = _mm_set1_epi32(x[11]);
+    __m128i x_12;
+    __m128i x_13;
+    __m128i x_14 = _mm_set1_epi32(x[14]);
+    __m128i x_15 = _mm_set1_epi32(x[15]);
+    __m128i orig0 = x_0;
+    __m128i orig1 = x_1;
+    __m128i orig2 = x_2;
+    __m128i orig3 = x_3;
+    __m128i orig4 = x_4;
+    __m128i orig5 = x_5;
+    __m128i orig6 = x_6;
+    __m128i orig7 = x_7;
+    __m128i orig8 = x_8;
+    __m128i orig9 = x_9;
+    __m128i orig10 =
x_10; + __m128i orig11 = x_11; + __m128i orig12; + __m128i orig13; + __m128i orig14 = x_14; + __m128i orig15 = x_15; + __m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, + t_13, t_14, t_15; + + uint32_t in12, in13; + int i; + + while(bytes >= 256) { + const __m128i addv12 = _mm_set_epi64x(1, 0); + const __m128i addv13 = _mm_set_epi64x(3, 2); + __m128i t12, t13; + uint64_t in1213; + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + + in12 = x[12]; + in13 = x[13]; + in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32); + t12 = _mm_set1_epi64x(in1213); + t13 = _mm_set1_epi64x(in1213); + + x_12 = _mm_add_epi64(addv12, t12); + x_13 = _mm_add_epi64(addv13, t13); + + t12 = _mm_unpacklo_epi32(x_12, x_13); + t13 = _mm_unpackhi_epi32(x_12, x_13); + + x_12 = _mm_unpacklo_epi32(t12, t13); + x_13 = _mm_unpackhi_epi32(t12, t13); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 4; + + x[12] = in1213 & 0xFFFFFFFF; + x[13] = (in1213 >> 32) & 0xFFFFFFFF; + + for(i = 0; i < ROUNDS; i += 2) { + VEC4_QUARTERROUND(0, 4, 8, 12); + VEC4_QUARTERROUND(1, 5, 9, 13); + VEC4_QUARTERROUND(2, 6, 10, 14); + VEC4_QUARTERROUND(3, 7, 11, 15); + VEC4_QUARTERROUND(0, 5, 10, 15); + VEC4_QUARTERROUND(1, 6, 11, 12); + VEC4_QUARTERROUND(2, 7, 8, 13); + VEC4_QUARTERROUND(3, 4, 9, 14); + } + +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + { \ + __m128i t0, t1, t2, t3; \ + \ + x_##A = _mm_add_epi32(x_##A, orig##A); \ + x_##B = _mm_add_epi32(x_##B, orig##B); \ + x_##C = _mm_add_epi32(x_##C, orig##C); \ + x_##D = _mm_add_epi32(x_##D, orig##D); \ + t_##A = _mm_unpacklo_epi32(x_##A, x_##B); \ + t_##B = _mm_unpacklo_epi32(x_##C, x_##D); \ + t_##C = _mm_unpackhi_epi32(x_##A, x_##B); \ + t_##D = _mm_unpackhi_epi32(x_##C, x_##D); \ + x_##A = _mm_unpacklo_epi64(t_##A, t_##B); \ + x_##B = _mm_unpackhi_epi64(t_##A, t_##B); \ + x_##C = _mm_unpacklo_epi64(t_##C, t_##D); \ + x_##D = _mm_unpackhi_epi64(t_##C, t_##D); \ + \ + t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((const __m128i*) (m + 0))); \ + _mm_storeu_si128((__m128i*) (c + 0), t0); \ + t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((const __m128i*) (m + 64))); \ + _mm_storeu_si128((__m128i*) (c + 64), t1); \ + t2 = \ + _mm_xor_si128(x_##C, _mm_loadu_si128((const __m128i*) (m + 128))); \ + _mm_storeu_si128((__m128i*) (c + 128), t2); \ + t3 = \ + _mm_xor_si128(x_##D, _mm_loadu_si128((const __m128i*) (m + 192))); \ + _mm_storeu_si128((__m128i*) (c + 192), t3); \ + } + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + + ONEQUAD(0, 1, 2, 3); + m += 16; + c += 16; + ONEQUAD(4, 5, 6, 7); + m += 16; + c += 16; + ONEQUAD(8, 9, 10, 11); + m += 16; + c += 16; + ONEQUAD(12, 13, 14, 15); + m -= 48; + c -= 48; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE + + bytes -= 256; + c += 256; + m += 256; + } +} + +#undef VEC4_ROT +#undef VEC4_QUARTERROUND +#undef VEC4_QUARTERROUND_SHUFFLE + +while(bytes >= 64) { + __m128i x_0, x_1, x_2, x_3; + __m128i t_1; + const __m128i rot16 = + _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + const __m128i rot8 = + _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + + uint32_t in12; + uint32_t in13; + int i; + + x_0 = _mm_loadu_si128((const __m128i *)(x + 0)); + x_1 = _mm_loadu_si128((const __m128i *)(x + 4)); + x_2 = _mm_loadu_si128((const __m128i *)(x + 8)); + x_3 = _mm_loadu_si128((const __m128i *)(x + 12)); + + for(i = 0; i < ROUNDS; i += 
2) { + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x93); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x39); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x39); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x93); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + } + + x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((const __m128i *)(x + 0))); + x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((const __m128i *)(x + 4))); + x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((const __m128i *)(x + 8))); + x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((const __m128i *)(x + 12))); + x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((const __m128i *)(m + 0))); + x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((const __m128i *)(m + 16))); + x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((const __m128i *)(m + 32))); + x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((const __m128i *)(m + 48))); + _mm_storeu_si128((__m128i *)(c + 0), x_0); + _mm_storeu_si128((__m128i *)(c + 16), x_1); + _mm_storeu_si128((__m128i *)(c + 32), x_2); + _mm_storeu_si128((__m128i *)(c + 48), x_3); + + in12 = x[12]; + in13 = x[13]; + in12++; + + if(in12 == 0) { + in13++; + } + + x[12] = in12; + x[13] = in13; + + bytes -= 64; + c += 64; + m += 64; +} + +if(bytes > 0) { + __m128i x_0, x_1, x_2, x_3; + __m128i t_1; + const __m128i rot16 = + _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + const __m128i rot8 = + _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + uint8_t partialblock[64]; + + unsigned int i; + + x_0 = _mm_loadu_si128((const __m128i *)(x + 0)); + x_1 = _mm_loadu_si128((const __m128i *)(x + 4)); + x_2 = _mm_loadu_si128((const __m128i *)(x + 8)); + x_3 = _mm_loadu_si128((const __m128i *)(x + 12)); + + for(i = 0; i < ROUNDS; i += 2) { + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x93); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x39); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, 
x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x39); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x93); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + } + + x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((const __m128i *)(x + 0))); + x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((const __m128i *)(x + 4))); + x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((const __m128i *)(x + 8))); + x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((const __m128i *)(x + 12))); + _mm_storeu_si128((__m128i *)(partialblock + 0), x_0); + _mm_storeu_si128((__m128i *)(partialblock + 16), x_1); + _mm_storeu_si128((__m128i *)(partialblock + 32), x_2); + _mm_storeu_si128((__m128i *)(partialblock + 48), x_3); + + for(i = 0; i < bytes; i++) { + c[i] = m[i] ^ partialblock[i]; + } + + memzero(partialblock, sizeof partialblock); +} diff --git a/src/chacha-poly1305/meson.build b/src/chacha-poly1305/meson.build index d8fd74cc1..8fc3cd534 100644 --- a/src/chacha-poly1305/meson.build +++ b/src/chacha-poly1305/meson.build @@ -4,11 +4,38 @@ src_chacha_poly = files( 'poly1305.c', ) +chacha_impl = 'generic' +if cdata.has('HAVE_CPU_SSSE3') + src_chacha_poly += files('chacha_ssse3.c') + chacha_impl = 'SSSE3' + if cdata.has('HAVE_CPU_AVX2') + src_chacha_poly += files('chacha_avx2.c') + chacha_impl = 'AVX2' + endif +endif + +if meson_version.version_compare('>=0.53') + summary({ 'ChaCha20': chacha_impl }, section: 'Cryptography') +endif + lib_chacha_poly = static_library( 'chacha_poly', sources: src_chacha_poly, implicit_include_directories: false, + link_with: lib_cpu_features, include_directories: inc_conf, build_by_default: false, ) +if os_name != 'windows' + exe_bench_chacha = executable( + 'bench_chacha', + sources: files('../random.c', 'bench_chacha.c'), + link_with: lib_chacha_poly, + implicit_include_directories: false, + include_directories: inc_conf, + build_by_default: false, + ) + + benchmark('bench_chacha', exe_bench_chacha) +endif diff --git a/src/cpu.c b/src/cpu.c new file mode 100644 index 000000000..67e26503e --- /dev/null +++ b/src/cpu.c @@ -0,0 +1,89 @@ +#include "system.h" + +#include + +#include "cpu.h" + +#define CPUID_ECX_SSSE3 0x00000200 +#define CPUID_EBX_AVX2 0x00000020 + +#define CPU_INFO_LEN 4 + +static uint16_t features; + +// Copyright (c) 2014-2021 Frank Denis +static void tinc_cpuid(unsigned int cpu_info[CPU_INFO_LEN], const unsigned int cpu_info_type) { + memset(cpu_info, 0, CPU_INFO_LEN * sizeof(*cpu_info)); + +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) + __cpuid((int *) cpu_info, cpu_info_type); +#elif defined(HAVE_CPUID) +# if defined(__x86_64__) + __asm__ __volatile__("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" + : "=a"(cpu_info[0]), "=&r"(cpu_info[1]), + "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "0"(cpu_info_type), "2"(0U)); +# elif defined(__i386__) + __asm__ __volatile__( + "pushfl; pushfl; " + "popl %0; " + "movl %0, %1; xorl %2, %0; " + "pushl %0; " + "popfl; pushfl; popl %0; popfl" + : "=&r"(cpu_info[0]), "=&r"(cpu_info[1]) + : "i"(0x200000)); + + if(((cpu_info[0] ^ 
cpu_info[1]) & 0x200000) == 0x0) { + return; + } + + __asm__ __volatile__("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" + : "=a"(cpu_info[0]), "=&r"(cpu_info[1]), + "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "0"(cpu_info_type), "2"(0U)); +# else + __asm__ __volatile__("cpuid" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), + "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "0"(cpu_info_type), "2"(0U)); +# endif +#else + (void)cpu_info_type; +#endif +} + +static bool initialized; + +void cpu_detect_features(void) { + initialized = true; + + unsigned int cpu_info[CPU_INFO_LEN]; + tinc_cpuid(cpu_info, 0x00); + + if(!cpu_info[0]) { + return; + } + +#ifdef HAVE_CPU_SSSE3 + tinc_cpuid(cpu_info, 0x01); + + if(cpu_info[2] & CPUID_ECX_SSSE3) { + features |= CPU_SSSE3; + } + +#endif + +#ifdef HAVE_CPU_AVX2 + tinc_cpuid(cpu_info, 0x07); + + if(cpu_info[1] & CPUID_EBX_AVX2) { + features |= CPU_AVX2; + } + +#endif +} + +bool cpu_supports(cpu_feature_t feat) { + assert(initialized); + return features & feat; +} diff --git a/src/cpu.h b/src/cpu.h new file mode 100644 index 000000000..c95407325 --- /dev/null +++ b/src/cpu.h @@ -0,0 +1,17 @@ +#ifndef TINC_CPU_H +#define TINC_CPU_H + +#include "system.h" + +typedef enum { + CPU_AVX2 = 1 << 0, + CPU_SSSE3 = 1 << 1, +} cpu_feature_t; + +// Detect supported features. Should be called once at application startup. +void cpu_detect_features(void); + +// Check if current CPU supports feature +bool cpu_supports(cpu_feature_t feat); + +#endif // TINC_CPU_H diff --git a/src/gcrypt/crypto.c b/src/gcrypt/crypto.c index 815bedf1b..627ac7ef1 100644 --- a/src/gcrypt/crypto.c +++ b/src/gcrypt/crypto.c @@ -3,8 +3,10 @@ #include #include "../crypto.h" +#include "../chacha-poly1305/chacha.h" void crypto_init(void) { + chacha_resolve_functions(); gcry_control(GCRYCTL_INIT_SECMEM, 32 * 1024, 0); gcry_control(GCRYCTL_INITIALIZATION_FINISHED, 0); } diff --git a/src/meson.build b/src/meson.build index d9f7b14bd..eda335fe5 100644 --- a/src/meson.build +++ b/src/meson.build @@ -18,6 +18,27 @@ foreach attr : ['malloc', 'nonnull', 'warn_unused_result', 'packed', 'format'] endif endforeach +src_cpuid = ''' + int main(void) { + unsigned int cpu_info[4]; + __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : + "=a" (cpu_info[0]), "=&r" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (0U), "2" (0U)); + return 0; + } +''' + +if cpu_family in ['x86', 'x86_64'] and cc.compiles(src_cpuid) + cdata.set('HAVE_CPUID', 1, description: 'have cpuid instruction support') + if cc.has_header_symbol('immintrin.h', '_mm_shuffle_epi8') + cdata.set('HAVE_CPU_SSSE3', 1, description: 'have SSSE3 headers') + if cc.has_header_symbol('immintrin.h', '_mm256_set1_epi32') + cdata.set('HAVE_CPU_AVX2', 1, description: 'have AVX2 headers') + endif + endif +endif + if cc.compiles(''' #include extern void *make() __attribute__((malloc(free))); @@ -112,6 +133,14 @@ check_types = [ 'struct nd_opt_hdr', ] +lib_cpu_features = static_library( + 'cpu_features', + sources: 'cpu.c', + implicit_include_directories: false, + include_directories: inc_conf, + build_by_default: false, +) + subdir('ed25519') subdir('chacha-poly1305') @@ -362,6 +391,10 @@ if opt_crypto != 'nolegacy' src_lib_crypto += ['cipher.c', 'digest.c'] endif +if meson_version.version_compare('>=0.53') + summary({ 'library': opt_crypto }, section: 'Cryptography') +endif + subdir('include') have_sandbox = cdata.has('HAVE_SANDBOX') diff --git a/src/nolegacy/crypto.c b/src/nolegacy/crypto.c index 4e6f427ad..f235cd514 100644 --- a/src/nolegacy/crypto.c 
+++ b/src/nolegacy/crypto.c @@ -18,7 +18,8 @@ */ #include "../crypto.h" +#include "../chacha-poly1305/chacha.h" -// No-op for those cryptographic libraries that -// do not require any additional initialization. -void crypto_init(void) {} +void crypto_init(void) { + chacha_resolve_functions(); +} diff --git a/src/openssl/crypto.c b/src/openssl/crypto.c index 3960c3e84..871c5a4e2 100644 --- a/src/openssl/crypto.c +++ b/src/openssl/crypto.c @@ -23,8 +23,11 @@ #include #include "../crypto.h" +#include "../chacha-poly1305/chacha.h" void crypto_init(void) { + chacha_resolve_functions(); + #if OPENSSL_VERSION_MAJOR < 3 ENGINE_load_builtin_engines(); #endif diff --git a/src/sptps_speed.c b/src/sptps_speed.c index c7c6e5463..9d78f242e 100644 --- a/src/sptps_speed.c +++ b/src/sptps_speed.c @@ -22,6 +22,7 @@ #include +#include "benchmark.h" #include "crypto.h" #include "ecdh.h" #include "ecdsa.h" @@ -81,30 +82,6 @@ static void receive_data(sptps_t *sptps) { } } -struct timespec start; -struct timespec end; -double elapsed; -double rate; -unsigned int count; - -static void clock_start(void) { - count = 0; - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start); -} - -static bool clock_countto(double seconds) { - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end); - elapsed = (double) end.tv_sec + (double) end.tv_nsec * 1e-9 - - (double) start.tv_sec - (double) start.tv_nsec * 1e-9; - - if(elapsed < seconds) { - return ++count; - } - - rate = count / elapsed; - return false; -} - static int run_benchmark(int argc, char *argv[]) { ecdsa_t *key1, *key2; ecdh_t *ecdh1, *ecdh2;
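Usage note (illustration only, not part of the patch): the dispatch added above is resolved once at startup, either through crypto_init() inside tinc or by calling chacha_resolve_functions() directly as bench_chacha.c does; after that, chacha_encrypt_bytes() transparently uses the AVX2, SSSE3 or generic code path. Below is a minimal sketch of a standalone caller, assuming only the declarations in chacha.h; the function name encrypt_example and the key/IV buffers are hypothetical placeholders.

#include <stdint.h>

#include "chacha-poly1305/chacha.h"

static void encrypt_example(const uint8_t key[32], const uint8_t iv[8],
                            const uint8_t *plaintext, uint8_t *ciphertext,
                            uint32_t len) {
    // One-time setup: detect CPU features and bind the internal
    // chacha_encrypt_bytes_impl pointer to the best available implementation.
    chacha_resolve_functions();

    chacha_ctx ctx;
    chacha_keysetup(&ctx, key, 256);   // key size is passed in bits
    chacha_ivsetup(&ctx, iv, NULL);    // NULL counter, as in bench_chacha.c
    chacha_encrypt_bytes(&ctx, plaintext, ciphertext, len);  // dispatches to the resolved implementation
}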