From 3f068fd038fbdc416d736c227a55a3e4592cf440 Mon Sep 17 00:00:00 2001 From: Joel Wurtz Date: Thu, 29 Aug 2024 12:12:37 +0200 Subject: [PATCH] feat(path): allow utf8 chars in path --- src/lib.rs | 85 ++++++++++++++++++++++++++++++++--------------- src/simd/avx2.rs | 38 ++++++++------------- src/simd/neon.rs | 17 ++++------ src/simd/sse42.rs | 50 +++++++++------------------- src/simd/swar.rs | 2 +- 5 files changed, 94 insertions(+), 98 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4ccd783..b1c930f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -59,35 +59,26 @@ fn is_token(b: u8) -> bool { b > 0x1F && b < 0x7F } -// ASCII codes to accept URI string. -// i.e. A-Z a-z 0-9 !#$%&'*+-._();:@=,/?[]~^ +// char codes to accept URI string. +// i.e. b'!' <= char and char != 127 // TODO: Make a stricter checking for URI string? static URI_MAP: [bool; 256] = byte_map![ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -// \0 \n 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -// commands 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -// \w ! " # $ % & ' ( ) * + , - . / - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, -// 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -// @ A B C D E F G H I J K L M N O 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -// P Q R S T U V W X Y Z [ \ ] ^ _ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -// ` a b c d e f g h i j k l m n o + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, -// p q r s t u v w x y z { | } ~ del -// ====== Extended ASCII (aka. 
obs-text) ====== - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ]; #[inline] @@ -963,10 +954,11 @@ pub fn parse_uri<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> { return Err(Error::Token); } - return Ok(Status::Complete( - // SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8. - unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) }, - )); + // SAFETY: NOTE(review): presumably `slice_skip(1)` only requires that at least one byte has been consumed - confirm. URI bytes may now be >= 0x80, so UTF-8 is validated below rather than assumed.
+ match str::from_utf8(unsafe { bytes.slice_skip(1) }) { + Ok(uri) => Ok(Status::Complete(uri)), + Err(_) => Err(Error::Token), + } } else { Err(Error::Token) } @@ -2053,7 +2045,7 @@ mod tests { assert_eq!(parse_chunk_size(b"567f8a\rfoo"), Err(crate::InvalidChunkSize)); assert_eq!(parse_chunk_size(b"567f8a\rfoo"), Err(crate::InvalidChunkSize)); assert_eq!(parse_chunk_size(b"567xf8a\r\n"), Err(crate::InvalidChunkSize)); - assert_eq!(parse_chunk_size(b"ffffffffffffffff\r\n"), Ok(Status::Complete((18, std::u64::MAX)))); + assert_eq!(parse_chunk_size(b"ffffffffffffffff\r\n"), Ok(Status::Complete((18, u64::MAX)))); assert_eq!(parse_chunk_size(b"1ffffffffffffffff\r\n"), Err(crate::InvalidChunkSize)); assert_eq!(parse_chunk_size(b"Affffffffffffffff\r\n"), Err(crate::InvalidChunkSize)); assert_eq!(parse_chunk_size(b"fffffffffffffffff\r\n"), Err(crate::InvalidChunkSize)); @@ -2161,7 +2153,7 @@ mod tests { assert_eq!(result, Err(crate::Error::Token)); } - static REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH: &[u8] = b"GET /foo>ohno HTTP/1.1\r\n\r\n"; + static REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH: &[u8] = b"GET /foo ohno HTTP/1.1\r\n\r\n"; #[test] fn test_request_with_multiple_spaces_and_bad_path() { @@ -2170,6 +2162,21 @@ mod tests { let result = crate::ParserConfig::default() .allow_multiple_spaces_in_request_line_delimiters(true) .parse_request(&mut request, REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH); + assert_eq!(result, Err(crate::Error::Version)); + } + + // This test ensures that a DEL character in the path is rejected: every + // byte from 0x21 upward is accepted in the path except DEL (0x7F), so DEL + // must still produce an error. + static REQUEST_WITH_DEL_IN_PATH: &[u8] = b"GET /foo\x7Fohno HTTP/1.1\r\n\r\n"; + + #[test] + fn test_request_with_del_in_path() { + let mut headers = [EMPTY_HEADER; NUM_OF_HEADERS]; + let mut request = Request::new(&mut headers[..]); + let result = crate::ParserConfig::default() +
.allow_multiple_spaces_in_request_line_delimiters(true) + .parse_request(&mut request, crate::tests::REQUEST_WITH_DEL_IN_PATH); assert_eq!(result, Err(crate::Error::Token)); } @@ -2676,4 +2683,30 @@ mod tests { assert_eq!(response.headers[0].name, "foo"); assert_eq!(response.headers[0].value, &b"bar"[..]); } + + #[test] + fn test_utf8_in_path_ok() { + let mut headers = [EMPTY_HEADER; 1]; + let mut request = Request::new(&mut headers[..]); + + let result = crate::ParserConfig::default().parse_request(&mut request, b"GET /test?post=I\xE2\x80\x99msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n"); + + assert_eq!(result, Ok(Status::Complete(67))); + assert_eq!(request.version.unwrap(), 1); + assert_eq!(request.method.unwrap(), "GET"); + assert_eq!(request.path.unwrap(), "/test?post=I’msorryIforkedyou"); + assert_eq!(request.headers.len(), 1); + assert_eq!(request.headers[0].name, "Host"); + assert_eq!(request.headers[0].value, &b"example.org"[..]); + } + + #[test] + fn test_bad_utf8_in_path() { + let mut headers = [EMPTY_HEADER; 1]; + let mut request = Request::new(&mut headers[..]); + + let result = crate::ParserConfig::default().parse_request(&mut request, b"GET /test?post=I\xE2msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n"); + + assert_eq!(result, Err(crate::Error::Token)); + } } diff --git a/src/simd/avx2.rs b/src/simd/avx2.rs index c1a41f9..078c365 100644 --- a/src/simd/avx2.rs +++ b/src/simd/avx2.rs @@ -4,7 +4,9 @@ use crate::iter::Bytes; #[target_feature(enable = "avx2")] pub unsafe fn match_uri_vectored(bytes: &mut Bytes) { while bytes.as_ref().len() >= 32 { + let advance = match_url_char_32_avx(bytes.as_ref()); + bytes.advance(advance); if advance != 32 { @@ -28,32 +30,18 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize { let ptr = buf.as_ptr(); - let LSH: __m256i = _mm256_set1_epi8(0x0f); - - // See comment in sse42::match_url_char_16_sse. 
- - let URI: __m256i = _mm256_setr_epi8( - 0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, - 0xfc, 0xfc, 0xfc, 0xfc, 0xf4, 0xfc, 0xf4, 0x7c, - 0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, - 0xfc, 0xfc, 0xfc, 0xfc, 0xf4, 0xfc, 0xf4, 0x7c, - ); - let ARF: __m256i = _mm256_setr_epi8( - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - ); - - let data = _mm256_lddqu_si256(ptr as *const _); - let rbms = _mm256_shuffle_epi8(URI, data); - let cols = _mm256_and_si256(LSH, _mm256_srli_epi16(data, 4)); - let bits = _mm256_and_si256(_mm256_shuffle_epi8(ARF, cols), rbms); - - let v = _mm256_cmpeq_epi8(bits, _mm256_setzero_si256()); - let r = _mm256_movemask_epi8(v) as u32; + // %x21-%x7e %x80-%xff + let DEL: __m256i = _mm256_set1_epi8(0x7f); + let LOW: __m256i = _mm256_set1_epi8(0x21); - r.trailing_zeros() as usize + let dat = _mm256_lddqu_si256(ptr as *const _); + // unsigned comparison dat >= LOW + let low = _mm256_cmpeq_epi8(_mm256_max_epu8(dat, LOW), dat); + let del = _mm256_cmpeq_epi8(dat, DEL); + let bit = _mm256_andnot_si256(del, low); + let res = _mm256_movemask_epi8(bit) as u32; + // TODO: use .trailing_ones() once MSRV >= 1.46 + (!res).trailing_zeros() as usize } #[target_feature(enable = "avx2")] diff --git a/src/simd/neon.rs b/src/simd/neon.rs index c6b86a8..6d2796a 100644 --- a/src/simd/neon.rs +++ b/src/simd/neon.rs @@ -125,17 +125,12 @@ unsafe fn match_header_name_char_16_neon(ptr: *const u8) -> usize { unsafe fn match_url_char_16_neon(ptr: *const u8) -> usize { let input = vld1q_u8(ptr); - // Check that b'!' 
<= input <= b'~' - let result = vandq_u8( - vcleq_u8(vdupq_n_u8(b'!'), input), - vcleq_u8(input, vdupq_n_u8(b'~')), - ); - // Check that input != b'<' and input != b'>' - let lt = vceqq_u8(input, vdupq_n_u8(b'<')); - let gt = vceqq_u8(input, vdupq_n_u8(b'>')); - let ltgt = vorrq_u8(lt, gt); - // Nand with result - let result = vbicq_u8(result, ltgt); + // Check that b'!' <= input; DEL (0x7F) is masked out below + let result = vcleq_u8(vdupq_n_u8(b'!'), input); + + // Disallow DEL (0x7F) + let del = vceqq_u8(input, vdupq_n_u8(0x7F)); + let result = vbicq_u8(result, del); offsetz(result) as usize } diff --git a/src/simd/sse42.rs b/src/simd/sse42.rs index d6fbf02..0fabdfe 100644 --- a/src/simd/sse42.rs +++ b/src/simd/sse42.rs @@ -4,6 +4,7 @@ use crate::iter::Bytes; pub unsafe fn match_uri_vectored(bytes: &mut Bytes) { while bytes.as_ref().len() >= 16 { let advance = match_url_char_16_sse(bytes.as_ref()); + bytes.advance(advance); if advance != 16 { @@ -14,7 +15,7 @@ pub unsafe fn match_uri_vectored(bytes: &mut Bytes) { } #[inline(always)] -#[allow(non_snake_case, overflowing_literals)] +#[allow(non_snake_case)] unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize { debug_assert!(buf.len() >= 16); @@ -25,40 +26,19 @@ unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize { let ptr = buf.as_ptr(); - let LSH: __m128i = _mm_set1_epi8(0x0f); - - // The first 0xf8 corresponds to the 8 first rows of the first column - // of URI_MAP in the crate's root, with the first row corresponding to bit 0 - // and the 8th row corresponding to bit 7. - // The 8 first rows give 0 0 0 1 1 1 1 1, which is 0xf8 (with least - // significant digit on the left). - // - // Another example just to drive the point home: in column 15, '>' is - // rejected, so the values are 0 0 1 0 1 1 1 1, which gives us 0xf4. - // - // Thanks to Vlad Krasnov for explaining this stuff to us mere mortals in - // a GitHub comment!
- // - // https://github.com/seanmonstar/httparse/pull/89#issuecomment-807039219 - - let URI: __m128i = _mm_setr_epi8( - 0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, - 0xfc, 0xfc, 0xfc, 0xfc, 0xf4, 0xfc, 0xf4, 0x7c, - ); - let ARF: __m128i = _mm_setr_epi8( - 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - ); - - let data = _mm_lddqu_si128(ptr as *const _); - let rbms = _mm_shuffle_epi8(URI, data); - let cols = _mm_and_si128(LSH, _mm_srli_epi16(data, 4)); - let bits = _mm_and_si128(_mm_shuffle_epi8(ARF, cols), rbms); - - let v = _mm_cmpeq_epi8(bits, _mm_setzero_si128()); - let r = _mm_movemask_epi8(v) as u16; - - r.trailing_zeros() as usize + // %x21-%x7e %x80-%xff + let DEL: __m128i = _mm_set1_epi8(0x7f); + let LOW: __m128i = _mm_set1_epi8(0x21); + + let dat = _mm_lddqu_si128(ptr as *const _); + // unsigned comparison dat >= LOW + let low = _mm_cmpeq_epi8(_mm_max_epu8(dat, LOW), dat); + let del = _mm_cmpeq_epi8(dat, DEL); + let bit = _mm_andnot_si128(del, low); + let res = _mm_movemask_epi8(bit) as u16; + + // TODO: use .trailing_ones() once MSRV >= 1.46 + (!res).trailing_zeros() as usize } #[target_feature(enable = "sse4.2")] diff --git a/src/simd/swar.rs b/src/simd/swar.rs index 857fc58..5925d62 100644 --- a/src/simd/swar.rs +++ b/src/simd/swar.rs @@ -106,7 +106,7 @@ fn match_block(f: impl Fn(u8) -> bool, block: ByteBlock) -> usize { // A const alternative to u64::from_ne_bytes to avoid bumping MSRV (1.36 => 1.44) // creates a u64 whose bytes are each equal to b const fn uniform_block(b: u8) -> usize { - (b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize + (b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize } // A byte-wise range-check on an enire word/block,