From e367e5c0c9344d31f2ed0b318ded966136eee4fe Mon Sep 17 00:00:00 2001 From: Aaron O'Mullan Date: Wed, 4 Sep 2024 02:35:10 +0900 Subject: [PATCH 1/2] perf: experiment with alt dispatch pattern To close gap between runtime dispatched x64 (to avx2) and avx2 compile-time. We specifically generate specialized versions of header parsing functions, moving the dispatch/inlining boundary higher up the call tree --- src/lib.rs | 125 ++++++++++++++++++++++++++++++++++++++++++-- src/simd/runtime.rs | 45 +++++++++++----- 2 files changed, 152 insertions(+), 18 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4ccd783..e43e8c5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1043,6 +1043,122 @@ struct HeaderParserConfig { ignore_invalid_headers: bool, } +// Runtime build of parse_headers_iter_uninit +#[cfg(all( + httparse_simd, + not(any( + httparse_simd_target_feature_sse42, + httparse_simd_target_feature_avx2, + )), + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +fn parse_headers_iter_uninit<'a>(headers: &mut &mut [MaybeUninit>], bytes: &mut Bytes<'a>, config: &HeaderParserConfig) -> Result { + static mut PARSE_FUNC: fn(&mut &mut [MaybeUninit>], &mut Bytes<'a>, &HeaderParserConfig) -> Result = parse_headers_setup; + + fn parse_headers_avx2<'a>(headers: &mut &mut [MaybeUninit>], bytes: &mut Bytes<'a>, config: &HeaderParserConfig) -> Result { + struct Avx2HeaderMatcher; + impl HeaderMatcher for Avx2HeaderMatcher { + #[inline(always)] + fn match_name(bytes: &mut Bytes) { + simd::avx2::match_header_name_vectored(bytes) + } + #[inline(always)] + fn match_value(bytes: &mut Bytes) { + simd::avx2::match_header_value_vectored(bytes) + } + } + + _parse_headers_iter_uninit::<'a, Avx2HeaderMatcher>(headers, bytes, config) + } + + fn parse_headers_sse42<'a>(headers: &mut &mut [MaybeUninit>], bytes: &mut Bytes<'a>, config: &HeaderParserConfig) -> Result { + struct Sse42HeaderMatcher; + impl HeaderMatcher for Sse42HeaderMatcher { + #[inline(always)] + fn match_name(bytes: &mut Bytes) { + simd::sse42::match_header_name_vectored(bytes) + } + #[inline(always)] + fn match_value(bytes: &mut Bytes) { + simd::sse42::match_header_value_vectored(bytes) + } + } + + _parse_headers_iter_uninit::<'a, Sse42HeaderMatcher>(headers, bytes, config) + } + + fn parse_headers_swar<'a>(headers: &mut &mut [MaybeUninit>], bytes: &mut Bytes<'a>, config: &HeaderParserConfig) -> Result { + struct SwarHeaderMatcher; + impl HeaderMatcher for SwarHeaderMatcher { + #[inline(always)] + fn match_name(bytes: &mut Bytes) { + simd::swar::match_header_name_vectored(bytes) + } + #[inline(always)] + fn match_value(bytes: &mut Bytes) { + simd::swar::match_header_value_vectored(bytes) + } + } + + _parse_headers_iter_uninit::<'a, SwarHeaderMatcher>(headers, bytes, config) + } + + fn parse_headers_setup(headers: &mut &mut [MaybeUninit>], bytes: &mut Bytes<'a>, config: &HeaderParserConfig) -> Result { + if is_x86_feature_detected!("avx2") { + unsafe { + PARSE_FUNC = parse_headers_avx2; + } + } else if is_x86_feature_detected!("sse4.2") { + unsafe { + PARSE_FUNC = parse_headers_sse42; + } + } else { + unsafe { + PARSE_FUNC = parse_headers_swar; + } + } + + unsafe { + PARSE_FUNC(headers, bytes, config) + } + } +} + +// Specialized build of parse_headers_iter_uninit +#[cfg(not(all( + httparse_simd, + not(any( + httparse_simd_target_feature_sse42, + httparse_simd_target_feature_avx2, + )), + any( + target_arch = "x86", + target_arch = "x86_64", + ), +)))] +fn parse_headers_iter_uninit<'a>(headers: &mut &mut [MaybeUninit>], bytes: &mut Bytes<'a>, config: &HeaderParserConfig) -> Result { + struct SimdHeaderMatcher; + impl HeaderMatcher for SimdHeaderMatcher { + #[inline(always)] + fn match_name(bytes: &mut Bytes) { + simd::match_header_name_vectored(bytes) + } + #[inline(always)] + fn match_value(bytes: &mut Bytes) { + simd::match_header_value_vectored(bytes) + } + } + _parse_headers_iter_uninit::(headers, bytes, config) +} + +trait HeaderMatcher { + fn match_name(bytes: &mut Bytes); + fn match_value(bytes: &mut Bytes); +} + /* Function which parsers headers into uninitialized buffer. * * Guarantees that it doesn't write garbage, so casting @@ -1052,11 +1168,12 @@ struct HeaderParserConfig { * Also it promises `headers` get shrunk to number of initialized headers, * so casting the other way around after calling this function is safe */ -fn parse_headers_iter_uninit<'a>( +fn _parse_headers_iter_uninit<'a, Matcher: HeaderMatcher>( headers: &mut &mut [MaybeUninit>], bytes: &mut Bytes<'a>, config: &HeaderParserConfig -) -> Result { +) -> Result +{ /* Flow of this function is pretty complex, especially with macros, * so this struct makes sure we shrink `headers` to only parsed ones. @@ -1181,7 +1298,7 @@ fn parse_headers_iter_uninit<'a>( #[allow(clippy::never_loop)] // parse header name until colon let header_name: &str = 'name: loop { - simd::match_header_name_vectored(bytes); + Matcher::match_name(bytes); let mut b = next!(bytes); // SAFETY: previously bumped by 1 with next! -> always safe. @@ -1241,7 +1358,7 @@ fn parse_headers_iter_uninit<'a>( 'value_lines: loop { // parse value till EOL - simd::match_header_value_vectored(bytes); + Matcher::match_value(bytes); let b = next!(bytes); //found_ctl diff --git a/src/simd/runtime.rs b/src/simd/runtime.rs index c523a92..2d2743d 100644 --- a/src/simd/runtime.rs +++ b/src/simd/runtime.rs @@ -1,7 +1,7 @@ -use std::sync::atomic::{AtomicU8, Ordering}; -use crate::iter::Bytes; use super::avx2; use super::sse42; +use crate::iter::Bytes; +use std::sync::atomic::{AtomicU8, Ordering}; const AVX2: u8 = 1; const SSE42: u8 = 2; @@ -34,24 +34,41 @@ pub fn match_header_name_vectored(bytes: &mut Bytes) { super::swar::match_header_name_vectored(bytes); } +static mut MATCH_URI_VECTORED: fn(&mut Bytes) = setup_and_call_match_uri_vectored; +static mut MATCH_HEADER_VALUE_VECTORED: fn(&mut Bytes) = setup_and_call_match_header_value_vectored; + +fn setup_and_call_match_uri_vectored(bytes: &mut Bytes) { + unsafe { + let feature = get_runtime_feature(); + MATCH_URI_VECTORED = match feature { + AVX2 => avx2::match_uri_vectored, + SSE42 => sse42::match_uri_vectored, + _ /* NOP */ => super::swar::match_uri_vectored, + }; + MATCH_URI_VECTORED(bytes); + } +} + +fn setup_and_call_match_header_value_vectored(bytes: &mut Bytes) { + unsafe { + let feature = get_runtime_feature(); + MATCH_HEADER_VALUE_VECTORED = match feature { + AVX2 => avx2::match_header_value_vectored, + SSE42 => sse42::match_header_value_vectored, + _ /* NOP */ => super::swar::match_header_value_vectored, + }; + MATCH_HEADER_VALUE_VECTORED(bytes); + } +} + pub fn match_uri_vectored(bytes: &mut Bytes) { - // SAFETY: calls are guarded by a feature check unsafe { - match get_runtime_feature() { - AVX2 => avx2::match_uri_vectored(bytes), - SSE42 => sse42::match_uri_vectored(bytes), - _ /* NOP */ => super::swar::match_uri_vectored(bytes), - } + MATCH_URI_VECTORED(bytes); } } pub fn match_header_value_vectored(bytes: &mut Bytes) { - // SAFETY: calls are guarded by a feature check unsafe { - match get_runtime_feature() { - AVX2 => avx2::match_header_value_vectored(bytes), - SSE42 => sse42::match_header_value_vectored(bytes), - _ /* NOP */ => super::swar::match_header_value_vectored(bytes), - } + MATCH_HEADER_VALUE_VECTORED(bytes); } } From dd31c6db3ac6b7c2679a34e5aea6de11623c159c Mon Sep 17 00:00:00 2001 From: Aaron O'Mullan Date: Wed, 4 Sep 2024 02:50:40 +0900 Subject: [PATCH 2/2] re-export funcs --- src/lib.rs | 12 ++++++------ src/simd/runtime.rs | 7 +++++++ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e43e8c5..d85f95e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1063,11 +1063,11 @@ fn parse_headers_iter_uninit<'a>(headers: &mut &mut [MaybeUninit>], b impl HeaderMatcher for Avx2HeaderMatcher { #[inline(always)] fn match_name(bytes: &mut Bytes) { - simd::avx2::match_header_name_vectored(bytes) + simd::avx2_match_header_name_vectored(bytes) } #[inline(always)] fn match_value(bytes: &mut Bytes) { - simd::avx2::match_header_value_vectored(bytes) + simd::avx2_match_header_value_vectored(bytes) } } @@ -1079,11 +1079,11 @@ fn parse_headers_iter_uninit<'a>(headers: &mut &mut [MaybeUninit>], b impl HeaderMatcher for Sse42HeaderMatcher { #[inline(always)] fn match_name(bytes: &mut Bytes) { - simd::sse42::match_header_name_vectored(bytes) + simd::sse42_match_header_name_vectored(bytes) } #[inline(always)] fn match_value(bytes: &mut Bytes) { - simd::sse42::match_header_value_vectored(bytes) + simd::sse42_match_header_value_vectored(bytes) } } @@ -1095,11 +1095,11 @@ fn parse_headers_iter_uninit<'a>(headers: &mut &mut [MaybeUninit>], b impl HeaderMatcher for SwarHeaderMatcher { #[inline(always)] fn match_name(bytes: &mut Bytes) { - simd::swar::match_header_name_vectored(bytes) + simd::swar_match_header_name_vectored(bytes) } #[inline(always)] fn match_value(bytes: &mut Bytes) { - simd::swar::match_header_value_vectored(bytes) + simd::swar_match_header_value_vectored(bytes) } } diff --git a/src/simd/runtime.rs b/src/simd/runtime.rs index 2d2743d..71dfd15 100644 --- a/src/simd/runtime.rs +++ b/src/simd/runtime.rs @@ -3,6 +3,13 @@ use super::sse42; use crate::iter::Bytes; use std::sync::atomic::{AtomicU8, Ordering}; +pub use self::avx2::match_header_name_vectored as avx2_match_header_name_vectored; +pub use self::avx2::match_header_value_vectored as avx2_match_header_value_vectored; +pub use self::sse42::match_header_name_vectored as sse42_match_header_name_vectored; +pub use self::sse42::match_header_value_vectored as sse42_match_header_value_vectored; +pub use self::swar::match_header_name_vectored as swar_match_header_name_vectored; +pub use self::swar::match_header_value_vectored as swar_match_header_value_vectored; + const AVX2: u8 = 1; const SSE42: u8 = 2; const NOP: u8 = 3;