diff --git a/src/benchmark.h b/src/benchmark.h
new file mode 100644
index 000000000..bd5df6f19
--- /dev/null
+++ b/src/benchmark.h
@@ -0,0 +1,30 @@
+#ifndef TINC_BENCHMARK_H
+#define TINC_BENCHMARK_H
+
+#include "system.h"
+
+static struct timespec start;
+static struct timespec end;
+static double elapsed;
+static double rate;
+static unsigned int count;
+
+static void clock_start(void) {
+    count = 0;
+    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
+}
+
+static bool clock_countto(double seconds) {
+    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
+    elapsed = (double) end.tv_sec + (double) end.tv_nsec * 1e-9
+              - (double) start.tv_sec - (double) start.tv_nsec * 1e-9;
+
+    if(elapsed < seconds) {
+        return ++count;
+    }
+
+    rate = count / elapsed;
+    return false;
+}
+
+#endif // TINC_BENCHMARK_H
diff --git a/src/chacha-poly1305/bench_chacha.c b/src/chacha-poly1305/bench_chacha.c
new file mode 100644
index 000000000..a5ee96be3
--- /dev/null
+++ b/src/chacha-poly1305/bench_chacha.c
@@ -0,0 +1,61 @@
+#include "../system.h"
+
+#include "../benchmark.h"
+#include "../random.h"
+#include "../crypto.h"
+#include "../xalloc.h"
+#include "chacha.h"
+
+#define BUFFER_SIZE (1024 * 1024)
+
+static FILE *dev_null;
+
+static void benchmark(chacha_ctx *ctx, const uint8_t *plaintext, uint8_t *ciphertext, size_t len) {
+    for(clock_start(); clock_countto(5);) {
+        chacha_encrypt_bytes(ctx, plaintext, ciphertext, len);
+    }
+
+    // Prevent the compiler from optimizing out encryption
+    fwrite(ciphertext, len, 1, dev_null);
+    fprintf(stderr, "%8zu: %14.2lf op/s\n", len, rate);
+}
+
+const size_t block_sizes[] = {
+    32,
+    256,
+    512,
+    1024,
+    16 * 1024,
+    128 * 1024,
+    BUFFER_SIZE,
+};
+
+int main(void) {
+    dev_null = fopen("/dev/null", "w");
+    random_init();
+    chacha_resolve_functions();
+
+    uint8_t key[256 / 8];
+    uint8_t iv[8];
+    randomize(key, sizeof(key));
+    randomize(iv, sizeof(iv));
+
+    chacha_ctx ctx;
+    chacha_keysetup(&ctx, key, 256);
+    chacha_ivsetup(&ctx, iv, NULL);
+
+    uint8_t *plaintext = xmalloc(BUFFER_SIZE);
+    uint8_t *ciphertext = xmalloc(BUFFER_SIZE);
+    randomize(plaintext, BUFFER_SIZE);
+
+    for(size_t i = 0; i < sizeof(block_sizes) / sizeof(*block_sizes); ++i) {
+        benchmark(&ctx, plaintext, ciphertext, block_sizes[i]);
+    }
+
+    free(ciphertext);
+    free(plaintext);
+    random_exit();
+    fclose(dev_null);
+
+    return 0;
+}
diff --git a/src/chacha-poly1305/chacha.c b/src/chacha-poly1305/chacha.c
index 696f44a52..22165caad 100644
--- a/src/chacha-poly1305/chacha.c
+++ b/src/chacha-poly1305/chacha.c
@@ -7,8 +7,7 @@ Public domain.
#include "../system.h" #include "chacha.h" - -typedef struct chacha_ctx chacha_ctx; +#include "../cpu.h" #define U8C(v) (v##U) #define U32C(v) (v##U) @@ -79,8 +78,7 @@ void chacha_ivsetup(chacha_ctx *x, const uint8_t *iv, const uint8_t *counter) { x->input[15] = U8TO32_LITTLE(iv + 4); } -void -chacha_encrypt_bytes(chacha_ctx *x, const uint8_t *m, uint8_t *c, uint32_t bytes) { +static void chacha_encrypt_bytes_generic(chacha_ctx *x, const uint8_t *m, uint8_t *c, uint32_t bytes) { uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; uint8_t *ctarget = NULL; @@ -222,3 +220,31 @@ chacha_encrypt_bytes(chacha_ctx *x, const uint8_t *m, uint8_t *c, uint32_t bytes m += 64; } } + +static chacha_encrypt_bytes_t *chacha_encrypt_bytes_impl; + +void chacha_encrypt_bytes(struct chacha_ctx *x, const uint8_t *m, uint8_t *c, uint32_t bytes) { + chacha_encrypt_bytes_impl(x, m, c, bytes); +} + +void chacha_resolve_functions(void) { + cpu_detect_features(); + +#ifdef HAVE_CPU_AVX2 + + if(cpu_supports(CPU_AVX2)) { + chacha_encrypt_bytes_impl = chacha_encrypt_bytes_avx2; + return; + } + +#endif +#ifdef HAVE_CPU_SSSE3 + + if(cpu_supports(CPU_SSSE3)) { + chacha_encrypt_bytes_impl = chacha_encrypt_bytes_ssse3; + return; + } + +#endif + chacha_encrypt_bytes_impl = chacha_encrypt_bytes_generic; +} diff --git a/src/chacha-poly1305/chacha.h b/src/chacha-poly1305/chacha.h index 103c3d812..7678538ad 100644 --- a/src/chacha-poly1305/chacha.h +++ b/src/chacha-poly1305/chacha.h @@ -7,9 +7,11 @@ Public domain. #ifndef CHACHA_H #define CHACHA_H -struct chacha_ctx { +typedef struct chacha_ctx { uint32_t input[16]; -}; +} chacha_ctx; + +#define ROUNDS 20 #define CHACHA_MINKEYLEN 16 #define CHACHA_NONCELEN 8 @@ -17,6 +19,16 @@ struct chacha_ctx { #define CHACHA_STATELEN (CHACHA_NONCELEN+CHACHA_CTRLEN) #define CHACHA_BLOCKLEN 64 +typedef void (chacha_encrypt_bytes_t)(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, uint32_t bytes); + +#ifdef HAVE_CPU_AVX2 +extern chacha_encrypt_bytes_t chacha_encrypt_bytes_avx2; +#endif +#ifdef HAVE_CPU_SSSE3 +extern chacha_encrypt_bytes_t chacha_encrypt_bytes_ssse3; +#endif + +void chacha_resolve_functions(void); void chacha_keysetup(struct chacha_ctx *x, const uint8_t *k, uint32_t kbits); void chacha_ivsetup(struct chacha_ctx *x, const uint8_t *iv, const uint8_t *ctr); void chacha_encrypt_bytes(struct chacha_ctx *x, const uint8_t *m, uint8_t *c, uint32_t bytes); diff --git a/src/chacha-poly1305/chacha_avx2.c b/src/chacha-poly1305/chacha_avx2.c new file mode 100644 index 000000000..2eb2530d7 --- /dev/null +++ b/src/chacha-poly1305/chacha_avx2.c @@ -0,0 +1,27 @@ +#include "../system.h" + +#include "chacha.h" +#include "../xalloc.h" + +#if defined(__clang__) +# pragma clang attribute push (__attribute__((target("sse2,ssse3,sse4.1,avx2"))), apply_to=function) +#elif defined(__GNUC__) +# pragma GCC target("sse2", "ssse3", "sse4.1", "avx2") +#endif + +#include + +void chacha_encrypt_bytes_avx2(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, uint32_t bytes) { + uint32_t *x = &ctx->input[0]; + + if(!bytes) { + return; + } + +#include "chacha_avx2.h" +#include "chacha_ssse3.h" +} + +#ifdef __clang__ +# pragma clang attribute pop +#endif diff --git a/src/chacha-poly1305/chacha_avx2.h b/src/chacha-poly1305/chacha_avx2.h new file mode 100644 index 000000000..ef643e8c4 --- /dev/null +++ b/src/chacha-poly1305/chacha_avx2.h @@ -0,0 +1,329 @@ +// Copyright (C) 2014-2017 D. J. 
Bernstein, Romain Dolbeau, Frank Denis +// Public domain + +#define VEC8_ROT(A, IMM) \ + _mm256_or_si256(_mm256_slli_epi32(A, IMM), _mm256_srli_epi32(A, (32 - IMM))) + +/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 & + * 16) (better) */ +#define VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = _mm256_shuffle_epi8(t_##A, rot16); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 12); \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + t_##A = _mm256_xor_si256(x_##D, x_##A); \ + x_##D = _mm256_shuffle_epi8(t_##A, rot8); \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + t_##C = _mm256_xor_si256(x_##B, x_##C); \ + x_##B = VEC8_ROT(t_##C, 7) + +#define VEC8_QUARTERROUND(A, B, C, D) VEC8_QUARTERROUND_SHUFFLE(A, B, C, D) + +#define VEC8_LINE1(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot16) +#define VEC8_LINE2(A, B, C, D) \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 12) +#define VEC8_LINE3(A, B, C, D) \ + x_##A = _mm256_add_epi32(x_##A, x_##B); \ + x_##D = _mm256_shuffle_epi8(_mm256_xor_si256(x_##D, x_##A), rot8) +#define VEC8_LINE4(A, B, C, D) \ + x_##C = _mm256_add_epi32(x_##C, x_##D); \ + x_##B = VEC8_ROT(_mm256_xor_si256(x_##B, x_##C), 7) + +#define VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, \ + C4, D4) \ +VEC8_LINE1(A1, B1, C1, D1); \ +VEC8_LINE1(A2, B2, C2, D2); \ +VEC8_LINE1(A3, B3, C3, D3); \ +VEC8_LINE1(A4, B4, C4, D4); \ +VEC8_LINE2(A1, B1, C1, D1); \ +VEC8_LINE2(A2, B2, C2, D2); \ +VEC8_LINE2(A3, B3, C3, D3); \ +VEC8_LINE2(A4, B4, C4, D4); \ +VEC8_LINE3(A1, B1, C1, D1); \ +VEC8_LINE3(A2, B2, C2, D2); \ +VEC8_LINE3(A3, B3, C3, D3); \ +VEC8_LINE3(A4, B4, C4, D4); \ +VEC8_LINE4(A1, B1, C1, D1); \ +VEC8_LINE4(A2, B2, C2, D2); \ +VEC8_LINE4(A3, B3, C3, D3); \ +VEC8_LINE4(A4, B4, C4, D4) + +#define VEC8_ROUND_HALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, \ + B4, C4, D4) \ +VEC8_LINE1(A1, B1, C1, D1); \ +VEC8_LINE1(A2, B2, C2, D2); \ +VEC8_LINE2(A1, B1, C1, D1); \ +VEC8_LINE2(A2, B2, C2, D2); \ +VEC8_LINE3(A1, B1, C1, D1); \ +VEC8_LINE3(A2, B2, C2, D2); \ +VEC8_LINE4(A1, B1, C1, D1); \ +VEC8_LINE4(A2, B2, C2, D2); \ +VEC8_LINE1(A3, B3, C3, D3); \ +VEC8_LINE1(A4, B4, C4, D4); \ +VEC8_LINE2(A3, B3, C3, D3); \ +VEC8_LINE2(A4, B4, C4, D4); \ +VEC8_LINE3(A3, B3, C3, D3); \ +VEC8_LINE3(A4, B4, C4, D4); \ +VEC8_LINE4(A3, B3, C3, D3); \ +VEC8_LINE4(A4, B4, C4, D4) + +#define VEC8_ROUND_HALFANDHALF(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, \ + A4, B4, C4, D4) \ +VEC8_LINE1(A1, B1, C1, D1); \ +VEC8_LINE1(A2, B2, C2, D2); \ +VEC8_LINE2(A1, B1, C1, D1); \ +VEC8_LINE2(A2, B2, C2, D2); \ +VEC8_LINE1(A3, B3, C3, D3); \ +VEC8_LINE1(A4, B4, C4, D4); \ +VEC8_LINE2(A3, B3, C3, D3); \ +VEC8_LINE2(A4, B4, C4, D4); \ +VEC8_LINE3(A1, B1, C1, D1); \ +VEC8_LINE3(A2, B2, C2, D2); \ +VEC8_LINE4(A1, B1, C1, D1); \ +VEC8_LINE4(A2, B2, C2, D2); \ +VEC8_LINE3(A3, B3, C3, D3); \ +VEC8_LINE3(A4, B4, C4, D4); \ +VEC8_LINE4(A3, B3, C3, D3); \ +VEC8_LINE4(A4, B4, C4, D4) + +#define VEC8_ROUND(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \ + D4) \ +VEC8_ROUND_SEQ(A1, B1, C1, D1, A2, B2, C2, D2, A3, B3, C3, D3, A4, B4, C4, \ + D4) + +if(bytes >= 512) { + /* constant for shuffling bytes (replacing multiple-of-8 rotates) */ + __m256i rot16 = + _mm256_set_epi8(13, 12, 15, 
14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, + 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + __m256i rot8 = + _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, + 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + uint32_t in12, in13; + + /* the naive way seems as fast (if not a bit faster) than the vector way */ + __m256i x_0 = _mm256_set1_epi32(x[0]); + __m256i x_1 = _mm256_set1_epi32(x[1]); + __m256i x_2 = _mm256_set1_epi32(x[2]); + __m256i x_3 = _mm256_set1_epi32(x[3]); + __m256i x_4 = _mm256_set1_epi32(x[4]); + __m256i x_5 = _mm256_set1_epi32(x[5]); + __m256i x_6 = _mm256_set1_epi32(x[6]); + __m256i x_7 = _mm256_set1_epi32(x[7]); + __m256i x_8 = _mm256_set1_epi32(x[8]); + __m256i x_9 = _mm256_set1_epi32(x[9]); + __m256i x_10 = _mm256_set1_epi32(x[10]); + __m256i x_11 = _mm256_set1_epi32(x[11]); + __m256i x_12; + __m256i x_13; + __m256i x_14 = _mm256_set1_epi32(x[14]); + __m256i x_15 = _mm256_set1_epi32(x[15]); + + __m256i orig0 = x_0; + __m256i orig1 = x_1; + __m256i orig2 = x_2; + __m256i orig3 = x_3; + __m256i orig4 = x_4; + __m256i orig5 = x_5; + __m256i orig6 = x_6; + __m256i orig7 = x_7; + __m256i orig8 = x_8; + __m256i orig9 = x_9; + __m256i orig10 = x_10; + __m256i orig11 = x_11; + __m256i orig12; + __m256i orig13; + __m256i orig14 = x_14; + __m256i orig15 = x_15; + __m256i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, + t_13, t_14, t_15; + + while(bytes >= 512) { + const __m256i addv12 = _mm256_set_epi64x(3, 2, 1, 0); + const __m256i addv13 = _mm256_set_epi64x(7, 6, 5, 4); + const __m256i permute = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); + __m256i t12, t13; + + uint64_t in1213; + int i; + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + + in12 = x[12]; + in13 = x[13]; + in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32); + x_12 = x_13 = _mm256_broadcastq_epi64(_mm_cvtsi64_si128(in1213)); + + t12 = _mm256_add_epi64(addv12, x_12); + t13 = _mm256_add_epi64(addv13, x_13); + + x_12 = _mm256_unpacklo_epi32(t12, t13); + x_13 = _mm256_unpackhi_epi32(t12, t13); + + t12 = _mm256_unpacklo_epi32(x_12, x_13); + t13 = _mm256_unpackhi_epi32(x_12, x_13); + + /* required because unpack* are intra-lane */ + x_12 = _mm256_permutevar8x32_epi32(t12, permute); + x_13 = _mm256_permutevar8x32_epi32(t13, permute); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 8; + + x[12] = in1213 & 0xFFFFFFFF; + x[13] = (in1213 >> 32) & 0xFFFFFFFF; + + for(i = 0; i < ROUNDS; i += 2) { + VEC8_ROUND(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + VEC8_ROUND(0, 5, 10, 15, 1, 6, 11, 12, 2, 7, 8, 13, 3, 4, 9, 14); + } + +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + { \ + __m128i t0, t1, t2, t3; \ + x_##A = _mm256_add_epi32(x_##A, orig##A); \ + x_##B = _mm256_add_epi32(x_##B, orig##B); \ + x_##C = _mm256_add_epi32(x_##C, orig##C); \ + x_##D = _mm256_add_epi32(x_##D, orig##D); \ + t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \ + t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \ + t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \ + t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \ + x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \ + x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \ + x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \ + x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 0), \ + _mm_loadu_si128((const __m128i*) (m + 0))); \ + 
_mm_storeu_si128((__m128i*) (c + 0), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 0), \ + _mm_loadu_si128((const __m128i*) (m + 64))); \ + _mm_storeu_si128((__m128i*) (c + 64), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 0), \ + _mm_loadu_si128((const __m128i*) (m + 128))); \ + _mm_storeu_si128((__m128i*) (c + 128), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 0), \ + _mm_loadu_si128((const __m128i*) (m + 192))); \ + _mm_storeu_si128((__m128i*) (c + 192), t3); \ + t0 = _mm_xor_si128(_mm256_extracti128_si256(x_##A, 1), \ + _mm_loadu_si128((const __m128i*) (m + 256))); \ + _mm_storeu_si128((__m128i*) (c + 256), t0); \ + t1 = _mm_xor_si128(_mm256_extracti128_si256(x_##B, 1), \ + _mm_loadu_si128((const __m128i*) (m + 320))); \ + _mm_storeu_si128((__m128i*) (c + 320), t1); \ + t2 = _mm_xor_si128(_mm256_extracti128_si256(x_##C, 1), \ + _mm_loadu_si128((const __m128i*) (m + 384))); \ + _mm_storeu_si128((__m128i*) (c + 384), t2); \ + t3 = _mm_xor_si128(_mm256_extracti128_si256(x_##D, 1), \ + _mm_loadu_si128((const __m128i*) (m + 448))); \ + _mm_storeu_si128((__m128i*) (c + 448), t3); \ + } + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + +#define ONEQUAD_UNPCK(A, B, C, D) \ + { \ + x_##A = _mm256_add_epi32(x_##A, orig##A); \ + x_##B = _mm256_add_epi32(x_##B, orig##B); \ + x_##C = _mm256_add_epi32(x_##C, orig##C); \ + x_##D = _mm256_add_epi32(x_##D, orig##D); \ + t_##A = _mm256_unpacklo_epi32(x_##A, x_##B); \ + t_##B = _mm256_unpacklo_epi32(x_##C, x_##D); \ + t_##C = _mm256_unpackhi_epi32(x_##A, x_##B); \ + t_##D = _mm256_unpackhi_epi32(x_##C, x_##D); \ + x_##A = _mm256_unpacklo_epi64(t_##A, t_##B); \ + x_##B = _mm256_unpackhi_epi64(t_##A, t_##B); \ + x_##C = _mm256_unpacklo_epi64(t_##C, t_##D); \ + x_##D = _mm256_unpackhi_epi64(t_##C, t_##D); \ + } + +#define ONEOCTO(A, B, C, D, A2, B2, C2, D2) \ + { \ + ONEQUAD_UNPCK(A, B, C, D); \ + ONEQUAD_UNPCK(A2, B2, C2, D2); \ + t_##A = _mm256_permute2x128_si256(x_##A, x_##A2, 0x20); \ + t_##A2 = _mm256_permute2x128_si256(x_##A, x_##A2, 0x31); \ + t_##B = _mm256_permute2x128_si256(x_##B, x_##B2, 0x20); \ + t_##B2 = _mm256_permute2x128_si256(x_##B, x_##B2, 0x31); \ + t_##C = _mm256_permute2x128_si256(x_##C, x_##C2, 0x20); \ + t_##C2 = _mm256_permute2x128_si256(x_##C, x_##C2, 0x31); \ + t_##D = _mm256_permute2x128_si256(x_##D, x_##D2, 0x20); \ + t_##D2 = _mm256_permute2x128_si256(x_##D, x_##D2, 0x31); \ + t_##A = _mm256_xor_si256( \ + t_##A, _mm256_loadu_si256((const __m256i*) (m + 0))); \ + t_##B = _mm256_xor_si256( \ + t_##B, _mm256_loadu_si256((const __m256i*) (m + 64))); \ + t_##C = _mm256_xor_si256( \ + t_##C, _mm256_loadu_si256((const __m256i*) (m + 128))); \ + t_##D = _mm256_xor_si256( \ + t_##D, _mm256_loadu_si256((const __m256i*) (m + 192))); \ + t_##A2 = _mm256_xor_si256( \ + t_##A2, _mm256_loadu_si256((const __m256i*) (m + 256))); \ + t_##B2 = _mm256_xor_si256( \ + t_##B2, _mm256_loadu_si256((const __m256i*) (m + 320))); \ + t_##C2 = _mm256_xor_si256( \ + t_##C2, _mm256_loadu_si256((const __m256i*) (m + 384))); \ + t_##D2 = _mm256_xor_si256( \ + t_##D2, _mm256_loadu_si256((const __m256i*) (m + 448))); \ + _mm256_storeu_si256((__m256i*) (c + 0), t_##A); \ + _mm256_storeu_si256((__m256i*) (c + 64), t_##B); \ + _mm256_storeu_si256((__m256i*) (c + 128), t_##C); \ + _mm256_storeu_si256((__m256i*) (c + 192), t_##D); \ + _mm256_storeu_si256((__m256i*) (c + 256), t_##A2); \ + _mm256_storeu_si256((__m256i*) (c + 320), t_##B2); \ + _mm256_storeu_si256((__m256i*) (c + 384), t_##C2); \ + 
_mm256_storeu_si256((__m256i*) (c + 448), t_##D2); \
+    }
+
+        ONEOCTO(0, 1, 2, 3, 4, 5, 6, 7);
+        m += 32;
+        c += 32;
+        ONEOCTO(8, 9, 10, 11, 12, 13, 14, 15);
+        m -= 32;
+        c -= 32;
+
+#undef ONEQUAD
+#undef ONEQUAD_TRANSPOSE
+#undef ONEQUAD_UNPCK
+#undef ONEOCTO
+
+        bytes -= 512;
+        c += 512;
+        m += 512;
+    }
+}
+
+#undef VEC8_ROT
+#undef VEC8_QUARTERROUND
+#undef VEC8_QUARTERROUND_NAIVE
+#undef VEC8_QUARTERROUND_SHUFFLE
+#undef VEC8_QUARTERROUND_SHUFFLE2
+#undef VEC8_LINE1
+#undef VEC8_LINE2
+#undef VEC8_LINE3
+#undef VEC8_LINE4
+#undef VEC8_ROUND
+#undef VEC8_ROUND_SEQ
+#undef VEC8_ROUND_HALF
+#undef VEC8_ROUND_HALFANDHALF
diff --git a/src/chacha-poly1305/chacha_ssse3.c b/src/chacha-poly1305/chacha_ssse3.c
new file mode 100644
index 000000000..be52884c3
--- /dev/null
+++ b/src/chacha-poly1305/chacha_ssse3.c
@@ -0,0 +1,26 @@
+#include "../system.h"
+
+#include "chacha.h"
+#include "../xalloc.h"
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("sse2,ssse3"))), apply_to=function)
+#elif defined(__GNUC__)
+# pragma GCC target("sse2", "ssse3")
+#endif
+
+#include <immintrin.h>
+
+void chacha_encrypt_bytes_ssse3(chacha_ctx *ctx, const uint8_t *m, uint8_t *c, uint32_t bytes) {
+    uint32_t *x = &ctx->input[0];
+
+    if(!bytes) {
+        return;
+    }
+
+#include "chacha_ssse3.h"
+}
+
+#ifdef __clang__
+# pragma clang attribute pop
+#endif
diff --git a/src/chacha-poly1305/chacha_ssse3.h b/src/chacha-poly1305/chacha_ssse3.h
new file mode 100644
index 000000000..a942837fc
--- /dev/null
+++ b/src/chacha-poly1305/chacha_ssse3.h
@@ -0,0 +1,370 @@
+// Copyright (C) 2014-2017 D. J. Bernstein, Romain Dolbeau, Frank Denis
+// Public domain
+
+#define VEC4_ROT(A, IMM) \
+    _mm_or_si128(_mm_slli_epi32(A, IMM), _mm_srli_epi32(A, (32 - IMM)))
+
+/* same, but replace 2 of the shift/shift/or "rotation" by byte shuffles (8 &
+ * 16) (better) */
+#define VEC4_QUARTERROUND_SHUFFLE(A, B, C, D) \
+    x_##A = _mm_add_epi32(x_##A, x_##B); \
+    t_##A = _mm_xor_si128(x_##D, x_##A); \
+    x_##D = _mm_shuffle_epi8(t_##A, rot16); \
+    x_##C = _mm_add_epi32(x_##C, x_##D); \
+    t_##C = _mm_xor_si128(x_##B, x_##C); \
+    x_##B = VEC4_ROT(t_##C, 12); \
+    x_##A = _mm_add_epi32(x_##A, x_##B); \
+    t_##A = _mm_xor_si128(x_##D, x_##A); \
+    x_##D = _mm_shuffle_epi8(t_##A, rot8); \
+    x_##C = _mm_add_epi32(x_##C, x_##D); \
+    t_##C = _mm_xor_si128(x_##B, x_##C); \
+    x_##B = VEC4_ROT(t_##C, 7)
+
+#define VEC4_QUARTERROUND(A, B, C, D) VEC4_QUARTERROUND_SHUFFLE(A, B, C, D)
+
+if(bytes >= 256) {
+    /* constant for shuffling bytes (replacing multiple-of-8 rotates) */
+    __m128i rot16 =
+        _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
+    __m128i rot8 =
+        _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
+
+    __m128i x_0 = _mm_set1_epi32(x[0]);
+    __m128i x_1 = _mm_set1_epi32(x[1]);
+    __m128i x_2 = _mm_set1_epi32(x[2]);
+    __m128i x_3 = _mm_set1_epi32(x[3]);
+    __m128i x_4 = _mm_set1_epi32(x[4]);
+    __m128i x_5 = _mm_set1_epi32(x[5]);
+    __m128i x_6 = _mm_set1_epi32(x[6]);
+    __m128i x_7 = _mm_set1_epi32(x[7]);
+    __m128i x_8 = _mm_set1_epi32(x[8]);
+    __m128i x_9 = _mm_set1_epi32(x[9]);
+    __m128i x_10 = _mm_set1_epi32(x[10]);
+    __m128i x_11 = _mm_set1_epi32(x[11]);
+    __m128i x_12;
+    __m128i x_13;
+    __m128i x_14 = _mm_set1_epi32(x[14]);
+    __m128i x_15 = _mm_set1_epi32(x[15]);
+    __m128i orig0 = x_0;
+    __m128i orig1 = x_1;
+    __m128i orig2 = x_2;
+    __m128i orig3 = x_3;
+    __m128i orig4 = x_4;
+    __m128i orig5 = x_5;
+    __m128i orig6 = x_6;
+    __m128i orig7 = x_7;
+    __m128i orig8 = x_8;
+    __m128i orig9 = x_9;
+    __m128i orig10 =
x_10; + __m128i orig11 = x_11; + __m128i orig12; + __m128i orig13; + __m128i orig14 = x_14; + __m128i orig15 = x_15; + __m128i t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9, t_10, t_11, t_12, + t_13, t_14, t_15; + + uint32_t in12, in13; + int i; + + while(bytes >= 256) { + const __m128i addv12 = _mm_set_epi64x(1, 0); + const __m128i addv13 = _mm_set_epi64x(3, 2); + __m128i t12, t13; + uint64_t in1213; + + x_0 = orig0; + x_1 = orig1; + x_2 = orig2; + x_3 = orig3; + x_4 = orig4; + x_5 = orig5; + x_6 = orig6; + x_7 = orig7; + x_8 = orig8; + x_9 = orig9; + x_10 = orig10; + x_11 = orig11; + x_14 = orig14; + x_15 = orig15; + + in12 = x[12]; + in13 = x[13]; + in1213 = ((uint64_t) in12) | (((uint64_t) in13) << 32); + t12 = _mm_set1_epi64x(in1213); + t13 = _mm_set1_epi64x(in1213); + + x_12 = _mm_add_epi64(addv12, t12); + x_13 = _mm_add_epi64(addv13, t13); + + t12 = _mm_unpacklo_epi32(x_12, x_13); + t13 = _mm_unpackhi_epi32(x_12, x_13); + + x_12 = _mm_unpacklo_epi32(t12, t13); + x_13 = _mm_unpackhi_epi32(t12, t13); + + orig12 = x_12; + orig13 = x_13; + + in1213 += 4; + + x[12] = in1213 & 0xFFFFFFFF; + x[13] = (in1213 >> 32) & 0xFFFFFFFF; + + for(i = 0; i < ROUNDS; i += 2) { + VEC4_QUARTERROUND(0, 4, 8, 12); + VEC4_QUARTERROUND(1, 5, 9, 13); + VEC4_QUARTERROUND(2, 6, 10, 14); + VEC4_QUARTERROUND(3, 7, 11, 15); + VEC4_QUARTERROUND(0, 5, 10, 15); + VEC4_QUARTERROUND(1, 6, 11, 12); + VEC4_QUARTERROUND(2, 7, 8, 13); + VEC4_QUARTERROUND(3, 4, 9, 14); + } + +#define ONEQUAD_TRANSPOSE(A, B, C, D) \ + { \ + __m128i t0, t1, t2, t3; \ + \ + x_##A = _mm_add_epi32(x_##A, orig##A); \ + x_##B = _mm_add_epi32(x_##B, orig##B); \ + x_##C = _mm_add_epi32(x_##C, orig##C); \ + x_##D = _mm_add_epi32(x_##D, orig##D); \ + t_##A = _mm_unpacklo_epi32(x_##A, x_##B); \ + t_##B = _mm_unpacklo_epi32(x_##C, x_##D); \ + t_##C = _mm_unpackhi_epi32(x_##A, x_##B); \ + t_##D = _mm_unpackhi_epi32(x_##C, x_##D); \ + x_##A = _mm_unpacklo_epi64(t_##A, t_##B); \ + x_##B = _mm_unpackhi_epi64(t_##A, t_##B); \ + x_##C = _mm_unpacklo_epi64(t_##C, t_##D); \ + x_##D = _mm_unpackhi_epi64(t_##C, t_##D); \ + \ + t0 = _mm_xor_si128(x_##A, _mm_loadu_si128((const __m128i*) (m + 0))); \ + _mm_storeu_si128((__m128i*) (c + 0), t0); \ + t1 = _mm_xor_si128(x_##B, _mm_loadu_si128((const __m128i*) (m + 64))); \ + _mm_storeu_si128((__m128i*) (c + 64), t1); \ + t2 = \ + _mm_xor_si128(x_##C, _mm_loadu_si128((const __m128i*) (m + 128))); \ + _mm_storeu_si128((__m128i*) (c + 128), t2); \ + t3 = \ + _mm_xor_si128(x_##D, _mm_loadu_si128((const __m128i*) (m + 192))); \ + _mm_storeu_si128((__m128i*) (c + 192), t3); \ + } + +#define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D) + + ONEQUAD(0, 1, 2, 3); + m += 16; + c += 16; + ONEQUAD(4, 5, 6, 7); + m += 16; + c += 16; + ONEQUAD(8, 9, 10, 11); + m += 16; + c += 16; + ONEQUAD(12, 13, 14, 15); + m -= 48; + c -= 48; + +#undef ONEQUAD +#undef ONEQUAD_TRANSPOSE + + bytes -= 256; + c += 256; + m += 256; + } +} + +#undef VEC4_ROT +#undef VEC4_QUARTERROUND +#undef VEC4_QUARTERROUND_SHUFFLE + +while(bytes >= 64) { + __m128i x_0, x_1, x_2, x_3; + __m128i t_1; + const __m128i rot16 = + _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + const __m128i rot8 = + _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + + uint32_t in12; + uint32_t in13; + int i; + + x_0 = _mm_loadu_si128((const __m128i *)(x + 0)); + x_1 = _mm_loadu_si128((const __m128i *)(x + 4)); + x_2 = _mm_loadu_si128((const __m128i *)(x + 8)); + x_3 = _mm_loadu_si128((const __m128i *)(x + 12)); + + for(i = 0; i < ROUNDS; i += 
2) { + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x93); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x39); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x39); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x93); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + } + + x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((const __m128i *)(x + 0))); + x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((const __m128i *)(x + 4))); + x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((const __m128i *)(x + 8))); + x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((const __m128i *)(x + 12))); + x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((const __m128i *)(m + 0))); + x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((const __m128i *)(m + 16))); + x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((const __m128i *)(m + 32))); + x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((const __m128i *)(m + 48))); + _mm_storeu_si128((__m128i *)(c + 0), x_0); + _mm_storeu_si128((__m128i *)(c + 16), x_1); + _mm_storeu_si128((__m128i *)(c + 32), x_2); + _mm_storeu_si128((__m128i *)(c + 48), x_3); + + in12 = x[12]; + in13 = x[13]; + in12++; + + if(in12 == 0) { + in13++; + } + + x[12] = in12; + x[13] = in13; + + bytes -= 64; + c += 64; + m += 64; +} + +if(bytes > 0) { + __m128i x_0, x_1, x_2, x_3; + __m128i t_1; + const __m128i rot16 = + _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); + const __m128i rot8 = + _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3); + uint8_t partialblock[64]; + + unsigned int i; + + x_0 = _mm_loadu_si128((const __m128i *)(x + 0)); + x_1 = _mm_loadu_si128((const __m128i *)(x + 4)); + x_2 = _mm_loadu_si128((const __m128i *)(x + 8)); + x_3 = _mm_loadu_si128((const __m128i *)(x + 12)); + + for(i = 0; i < ROUNDS; i += 2) { + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x93); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x39); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, 
x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_3 = _mm_shuffle_epi8(x_3, rot16); + + x_2 = _mm_add_epi32(x_2, x_3); + x_1 = _mm_xor_si128(x_1, x_2); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 12); + t_1 = _mm_srli_epi32(t_1, 20); + x_1 = _mm_xor_si128(x_1, t_1); + + x_0 = _mm_add_epi32(x_0, x_1); + x_3 = _mm_xor_si128(x_3, x_0); + x_0 = _mm_shuffle_epi32(x_0, 0x39); + x_3 = _mm_shuffle_epi8(x_3, rot8); + + x_2 = _mm_add_epi32(x_2, x_3); + x_3 = _mm_shuffle_epi32(x_3, 0x4e); + x_1 = _mm_xor_si128(x_1, x_2); + x_2 = _mm_shuffle_epi32(x_2, 0x93); + + t_1 = x_1; + x_1 = _mm_slli_epi32(x_1, 7); + t_1 = _mm_srli_epi32(t_1, 25); + x_1 = _mm_xor_si128(x_1, t_1); + } + + x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((const __m128i *)(x + 0))); + x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((const __m128i *)(x + 4))); + x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((const __m128i *)(x + 8))); + x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((const __m128i *)(x + 12))); + _mm_storeu_si128((__m128i *)(partialblock + 0), x_0); + _mm_storeu_si128((__m128i *)(partialblock + 16), x_1); + _mm_storeu_si128((__m128i *)(partialblock + 32), x_2); + _mm_storeu_si128((__m128i *)(partialblock + 48), x_3); + + for(i = 0; i < bytes; i++) { + c[i] = m[i] ^ partialblock[i]; + } + + memzero(partialblock, sizeof partialblock); +} diff --git a/src/chacha-poly1305/meson.build b/src/chacha-poly1305/meson.build index d8fd74cc1..8fc3cd534 100644 --- a/src/chacha-poly1305/meson.build +++ b/src/chacha-poly1305/meson.build @@ -4,11 +4,38 @@ src_chacha_poly = files( 'poly1305.c', ) +chacha_impl = 'generic' +if cdata.has('HAVE_CPU_SSSE3') + src_chacha_poly += files('chacha_ssse3.c') + chacha_impl = 'SSSE3' + if cdata.has('HAVE_CPU_AVX2') + src_chacha_poly += files('chacha_avx2.c') + chacha_impl = 'AVX2' + endif +endif + +if meson_version.version_compare('>=0.53') + summary({ 'ChaCha20': chacha_impl }, section: 'Cryptography') +endif + lib_chacha_poly = static_library( 'chacha_poly', sources: src_chacha_poly, implicit_include_directories: false, + link_with: lib_cpu_features, include_directories: inc_conf, build_by_default: false, ) +if os_name != 'windows' + exe_bench_chacha = executable( + 'bench_chacha', + sources: files('../random.c', 'bench_chacha.c'), + link_with: lib_chacha_poly, + implicit_include_directories: false, + include_directories: inc_conf, + build_by_default: false, + ) + + benchmark('bench_chacha', exe_bench_chacha) +endif diff --git a/src/cpu.c b/src/cpu.c new file mode 100644 index 000000000..67e26503e --- /dev/null +++ b/src/cpu.c @@ -0,0 +1,89 @@ +#include "system.h" + +#include + +#include "cpu.h" + +#define CPUID_ECX_SSSE3 0x00000200 +#define CPUID_EBX_AVX2 0x00000020 + +#define CPU_INFO_LEN 4 + +static uint16_t features; + +// Copyright (c) 2014-2021 Frank Denis +static void tinc_cpuid(unsigned int cpu_info[CPU_INFO_LEN], const unsigned int cpu_info_type) { + memset(cpu_info, 0, CPU_INFO_LEN * sizeof(*cpu_info)); + +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)) + __cpuid((int *) cpu_info, cpu_info_type); +#elif defined(HAVE_CPUID) +# if defined(__x86_64__) + __asm__ __volatile__("xchgq %%rbx, %q1; cpuid; xchgq %%rbx, %q1" + : "=a"(cpu_info[0]), "=&r"(cpu_info[1]), + "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "0"(cpu_info_type), "2"(0U)); +# elif defined(__i386__) + __asm__ __volatile__( + "pushfl; pushfl; " + "popl %0; " + "movl %0, %1; xorl %2, %0; " + "pushl %0; " + "popfl; pushfl; popl %0; popfl" + : "=&r"(cpu_info[0]), "=&r"(cpu_info[1]) + : "i"(0x200000)); + + if(((cpu_info[0] ^ 
cpu_info[1]) & 0x200000) == 0x0) { + return; + } + + __asm__ __volatile__("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" + : "=a"(cpu_info[0]), "=&r"(cpu_info[1]), + "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "0"(cpu_info_type), "2"(0U)); +# else + __asm__ __volatile__("cpuid" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), + "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "0"(cpu_info_type), "2"(0U)); +# endif +#else + (void)cpu_info_type; +#endif +} + +static bool initialized; + +void cpu_detect_features(void) { + initialized = true; + + unsigned int cpu_info[CPU_INFO_LEN]; + tinc_cpuid(cpu_info, 0x00); + + if(!cpu_info[0]) { + return; + } + +#ifdef HAVE_CPU_SSSE3 + tinc_cpuid(cpu_info, 0x01); + + if(cpu_info[2] & CPUID_ECX_SSSE3) { + features |= CPU_SSSE3; + } + +#endif + +#ifdef HAVE_CPU_AVX2 + tinc_cpuid(cpu_info, 0x07); + + if(cpu_info[1] & CPUID_EBX_AVX2) { + features |= CPU_AVX2; + } + +#endif +} + +bool cpu_supports(cpu_feature_t feat) { + assert(initialized); + return features & feat; +} diff --git a/src/cpu.h b/src/cpu.h new file mode 100644 index 000000000..c95407325 --- /dev/null +++ b/src/cpu.h @@ -0,0 +1,17 @@ +#ifndef TINC_CPU_H +#define TINC_CPU_H + +#include "system.h" + +typedef enum { + CPU_AVX2 = 1 << 0, + CPU_SSSE3 = 1 << 1, +} cpu_feature_t; + +// Detect supported features. Should be called once at application startup. +void cpu_detect_features(void); + +// Check if current CPU supports feature +bool cpu_supports(cpu_feature_t feat); + +#endif // TINC_CPU_H diff --git a/src/gcrypt/crypto.c b/src/gcrypt/crypto.c index 815bedf1b..627ac7ef1 100644 --- a/src/gcrypt/crypto.c +++ b/src/gcrypt/crypto.c @@ -3,8 +3,10 @@ #include #include "../crypto.h" +#include "../chacha-poly1305/chacha.h" void crypto_init(void) { + chacha_resolve_functions(); gcry_control(GCRYCTL_INIT_SECMEM, 32 * 1024, 0); gcry_control(GCRYCTL_INITIALIZATION_FINISHED, 0); } diff --git a/src/meson.build b/src/meson.build index d9f7b14bd..eda335fe5 100644 --- a/src/meson.build +++ b/src/meson.build @@ -18,6 +18,27 @@ foreach attr : ['malloc', 'nonnull', 'warn_unused_result', 'packed', 'format'] endif endforeach +src_cpuid = ''' + int main(void) { + unsigned int cpu_info[4]; + __asm__ __volatile__ ("xchgl %%ebx, %k1; cpuid; xchgl %%ebx, %k1" : + "=a" (cpu_info[0]), "=&r" (cpu_info[1]), + "=c" (cpu_info[2]), "=d" (cpu_info[3]) : + "0" (0U), "2" (0U)); + return 0; + } +''' + +if cpu_family in ['x86', 'x86_64'] and cc.compiles(src_cpuid) + cdata.set('HAVE_CPUID', 1, description: 'have cpuid instruction support') + if cc.has_header_symbol('immintrin.h', '_mm_shuffle_epi8') + cdata.set('HAVE_CPU_SSSE3', 1, description: 'have SSSE3 headers') + if cc.has_header_symbol('immintrin.h', '_mm256_set1_epi32') + cdata.set('HAVE_CPU_AVX2', 1, description: 'have AVX2 headers') + endif + endif +endif + if cc.compiles(''' #include extern void *make() __attribute__((malloc(free))); @@ -112,6 +133,14 @@ check_types = [ 'struct nd_opt_hdr', ] +lib_cpu_features = static_library( + 'cpu_features', + sources: 'cpu.c', + implicit_include_directories: false, + include_directories: inc_conf, + build_by_default: false, +) + subdir('ed25519') subdir('chacha-poly1305') @@ -362,6 +391,10 @@ if opt_crypto != 'nolegacy' src_lib_crypto += ['cipher.c', 'digest.c'] endif +if meson_version.version_compare('>=0.53') + summary({ 'library': opt_crypto }, section: 'Cryptography') +endif + subdir('include') have_sandbox = cdata.has('HAVE_SANDBOX') diff --git a/src/nolegacy/crypto.c b/src/nolegacy/crypto.c index 4e6f427ad..f235cd514 100644 --- a/src/nolegacy/crypto.c 
+++ b/src/nolegacy/crypto.c @@ -18,7 +18,8 @@ */ #include "../crypto.h" +#include "../chacha-poly1305/chacha.h" -// No-op for those cryptographic libraries that -// do not require any additional initialization. -void crypto_init(void) {} +void crypto_init(void) { + chacha_resolve_functions(); +} diff --git a/src/openssl/crypto.c b/src/openssl/crypto.c index 3960c3e84..871c5a4e2 100644 --- a/src/openssl/crypto.c +++ b/src/openssl/crypto.c @@ -23,8 +23,11 @@ #include #include "../crypto.h" +#include "../chacha-poly1305/chacha.h" void crypto_init(void) { + chacha_resolve_functions(); + #if OPENSSL_VERSION_MAJOR < 3 ENGINE_load_builtin_engines(); #endif diff --git a/src/sptps_speed.c b/src/sptps_speed.c index c7c6e5463..9d78f242e 100644 --- a/src/sptps_speed.c +++ b/src/sptps_speed.c @@ -22,6 +22,7 @@ #include +#include "benchmark.h" #include "crypto.h" #include "ecdh.h" #include "ecdsa.h" @@ -81,30 +82,6 @@ static void receive_data(sptps_t *sptps) { } } -struct timespec start; -struct timespec end; -double elapsed; -double rate; -unsigned int count; - -static void clock_start(void) { - count = 0; - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start); -} - -static bool clock_countto(double seconds) { - clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end); - elapsed = (double) end.tv_sec + (double) end.tv_nsec * 1e-9 - - (double) start.tv_sec - (double) start.tv_nsec * 1e-9; - - if(elapsed < seconds) { - return ++count; - } - - rate = count / elapsed; - return false; -} - static int run_benchmark(int argc, char *argv[]) { ecdsa_t *key1, *key2; ecdh_t *ecdh1, *ecdh2;
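Usage note (illustration only, not part of the patch): the dispatch added above is resolved once at startup, either through crypto_init() inside tinc or by calling chacha_resolve_functions() directly as bench_chacha.c does; after that, chacha_encrypt_bytes() transparently uses the AVX2, SSSE3 or generic code path. Below is a minimal sketch of a standalone caller, assuming only the declarations in chacha.h; the function name encrypt_example and the key/IV buffers are hypothetical placeholders.

#include <stdint.h>

#include "chacha-poly1305/chacha.h"

static void encrypt_example(const uint8_t key[32], const uint8_t iv[8],
                            const uint8_t *plaintext, uint8_t *ciphertext,
                            uint32_t len) {
    // One-time setup: detect CPU features and bind the internal
    // chacha_encrypt_bytes_impl pointer to the best available implementation.
    chacha_resolve_functions();

    chacha_ctx ctx;
    chacha_keysetup(&ctx, key, 256);   // key size is passed in bits
    chacha_ivsetup(&ctx, iv, NULL);    // NULL counter, as in bench_chacha.c
    chacha_encrypt_bytes(&ctx, plaintext, ciphertext, len);  // dispatches to the resolved implementation
}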