From 3f068fd038fbdc416d736c227a55a3e4592cf440 Mon Sep 17 00:00:00 2001
From: Joel Wurtz <jwurtz@jolicode.com>
Date: Thu, 29 Aug 2024 12:12:37 +0200
Subject: [PATCH] feat(path): allow utf8 chars in path

---
 src/lib.rs        | 85 ++++++++++++++++++++++++++++++++---------------
 src/simd/avx2.rs  | 38 ++++++++-------------
 src/simd/neon.rs  | 17 ++++------
 src/simd/sse42.rs | 50 +++++++++-------------------
 src/simd/swar.rs  |  2 +-
 5 files changed, 94 insertions(+), 98 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 4ccd783..b1c930f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -59,35 +59,26 @@ fn is_token(b: u8) -> bool {
     b > 0x1F && b < 0x7F
 }
 
-// ASCII codes to accept URI string.
-// i.e. A-Z a-z 0-9 !#$%&'*+-._();:@=,/?[]~^
+// Byte values accepted in a URI string.
+// i.e. any byte >= b'!' (0x21) except DEL (0x7F); bytes >= 0x80 are accepted too
 // TODO: Make a stricter checking for URI string?
 static URI_MAP: [bool; 256] = byte_map![
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-//  \0                            \n
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-//  commands
     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-//  \w !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
-//  0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-//  @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-//  P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-//  `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
-//  p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  del
-//   ====== Extended ASCII (aka. obs-text) ======
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 ];
 
 #[inline]
@@ -963,10 +954,11 @@ pub fn parse_uri<'a>(bytes: &mut Bytes<'a>) -> Result<&'a str> {
             return Err(Error::Token);
         }
 
-        return Ok(Status::Complete(
-            // SAFETY: all bytes up till `i` must have been `is_token` and therefore also utf-8.
-            unsafe { str::from_utf8_unchecked(bytes.slice_skip(1)) },
-        ));
+        // SAFETY: skips the delimiter byte already consumed (TODO confirm). Bytes >= 0x80 are now accepted, so UTF-8 is validated below, not assumed.
+        match str::from_utf8(unsafe { bytes.slice_skip(1) }) {
+            Ok(uri) => Ok(Status::Complete(uri)),
+            Err(_) => Err(Error::Token),
+        }
     } else {
         Err(Error::Token)
     }
@@ -2053,7 +2045,7 @@ mod tests {
         assert_eq!(parse_chunk_size(b"567f8a\rfoo"), Err(crate::InvalidChunkSize));
         assert_eq!(parse_chunk_size(b"567f8a\rfoo"), Err(crate::InvalidChunkSize));
         assert_eq!(parse_chunk_size(b"567xf8a\r\n"), Err(crate::InvalidChunkSize));
-        assert_eq!(parse_chunk_size(b"ffffffffffffffff\r\n"), Ok(Status::Complete((18, std::u64::MAX))));
+        assert_eq!(parse_chunk_size(b"ffffffffffffffff\r\n"), Ok(Status::Complete((18, u64::MAX))));
         assert_eq!(parse_chunk_size(b"1ffffffffffffffff\r\n"), Err(crate::InvalidChunkSize));
         assert_eq!(parse_chunk_size(b"Affffffffffffffff\r\n"), Err(crate::InvalidChunkSize));
         assert_eq!(parse_chunk_size(b"fffffffffffffffff\r\n"), Err(crate::InvalidChunkSize));
@@ -2161,7 +2153,7 @@ mod tests {
         assert_eq!(result, Err(crate::Error::Token));
     }
 
-    static REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH: &[u8] = b"GET   /foo>ohno HTTP/1.1\r\n\r\n";
+    static REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH: &[u8] = b"GET   /foo ohno HTTP/1.1\r\n\r\n";
 
     #[test]
     fn test_request_with_multiple_spaces_and_bad_path() {
@@ -2170,6 +2162,21 @@ mod tests {
         let result = crate::ParserConfig::default()
             .allow_multiple_spaces_in_request_line_delimiters(true)
             .parse_request(&mut request, REQUEST_WITH_MULTIPLE_SPACES_AND_BAD_PATH);
+        assert_eq!(result, Err(crate::Error::Version));
+    }
+
+    // This test ensures that a DEL character in the path is rejected:
+    // every byte from 0x21 upward is allowed except DEL (0x7F), so a path
+    // containing DEL must produce an error.
+    static REQUEST_WITH_DEL_IN_PATH: &[u8] = b"GET   /foo\x7Fohno HTTP/1.1\r\n\r\n";
+
+    #[test]
+    fn test_request_with_del_in_path() {
+        let mut headers = [EMPTY_HEADER; NUM_OF_HEADERS];
+        let mut request = Request::new(&mut headers[..]);
+        let result = crate::ParserConfig::default()
+            .allow_multiple_spaces_in_request_line_delimiters(true)
+            .parse_request(&mut request, crate::tests::REQUEST_WITH_DEL_IN_PATH);
         assert_eq!(result, Err(crate::Error::Token));
     }
 
@@ -2676,4 +2683,30 @@ mod tests {
         assert_eq!(response.headers[0].name, "foo");
         assert_eq!(response.headers[0].value, &b"bar"[..]);
     }
+
+    #[test]
+    fn test_utf8_in_path_ok() {
+        let mut headers = [EMPTY_HEADER; 1];
+        let mut request = Request::new(&mut headers[..]);
+
+        let result = crate::ParserConfig::default().parse_request(&mut request, b"GET /test?post=I\xE2\x80\x99msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n");
+
+        assert_eq!(result, Ok(Status::Complete(67)));
+        assert_eq!(request.version.unwrap(), 1);
+        assert_eq!(request.method.unwrap(), "GET");
+        assert_eq!(request.path.unwrap(), "/test?post=I’msorryIforkedyou");
+        assert_eq!(request.headers.len(), 1);
+        assert_eq!(request.headers[0].name, "Host");
+        assert_eq!(request.headers[0].value, &b"example.org"[..]);
+    }
+
+    #[test]
+    fn test_bad_utf8_in_path() {
+        let mut headers = [EMPTY_HEADER; 1];
+        let mut request = Request::new(&mut headers[..]);
+
+        let result = crate::ParserConfig::default().parse_request(&mut request, b"GET /test?post=I\xE2msorryIforkedyou HTTP/1.1\r\nHost: example.org\r\n\r\n");
+
+        assert_eq!(result, Err(crate::Error::Token));
+    }
 }
diff --git a/src/simd/avx2.rs b/src/simd/avx2.rs
index c1a41f9..078c365 100644
--- a/src/simd/avx2.rs
+++ b/src/simd/avx2.rs
@@ -4,7 +4,9 @@ use crate::iter::Bytes;
 #[target_feature(enable = "avx2")]
 pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
     while bytes.as_ref().len() >= 32 {
+
         let advance = match_url_char_32_avx(bytes.as_ref());
+
         bytes.advance(advance);
 
         if advance != 32 {
@@ -28,32 +30,18 @@ unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize {
 
     let ptr = buf.as_ptr();
 
-    let LSH: __m256i = _mm256_set1_epi8(0x0f);
-
-    // See comment in sse42::match_url_char_16_sse.
-
-    let URI: __m256i = _mm256_setr_epi8(
-        0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
-        0xfc, 0xfc, 0xfc, 0xfc, 0xf4, 0xfc, 0xf4, 0x7c,
-        0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
-        0xfc, 0xfc, 0xfc, 0xfc, 0xf4, 0xfc, 0xf4, 0x7c,
-    );
-    let ARF: __m256i = _mm256_setr_epi8(
-        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    );
-
-    let data = _mm256_lddqu_si256(ptr as *const _);
-    let rbms = _mm256_shuffle_epi8(URI, data);
-    let cols = _mm256_and_si256(LSH, _mm256_srli_epi16(data, 4));
-    let bits = _mm256_and_si256(_mm256_shuffle_epi8(ARF, cols), rbms);
-
-    let v = _mm256_cmpeq_epi8(bits, _mm256_setzero_si256());
-    let r = _mm256_movemask_epi8(v) as u32;
+    // %x21-%x7e %x80-%xff
+    let DEL: __m256i = _mm256_set1_epi8(0x7f);
+    let LOW: __m256i = _mm256_set1_epi8(0x21);
 
-    r.trailing_zeros() as usize
+    let dat = _mm256_lddqu_si256(ptr as *const _);
+    // unsigned comparison dat >= LOW
+    let low = _mm256_cmpeq_epi8(_mm256_max_epu8(dat, LOW), dat);
+    let del = _mm256_cmpeq_epi8(dat, DEL);
+    let bit = _mm256_andnot_si256(del, low);
+    let res = _mm256_movemask_epi8(bit) as u32;
+    // TODO: use .trailing_ones() once MSRV >= 1.46
+    (!res).trailing_zeros() as usize
 }
 
 #[target_feature(enable = "avx2")]
diff --git a/src/simd/neon.rs b/src/simd/neon.rs
index c6b86a8..6d2796a 100644
--- a/src/simd/neon.rs
+++ b/src/simd/neon.rs
@@ -125,17 +125,12 @@ unsafe fn match_header_name_char_16_neon(ptr: *const u8) -> usize {
 unsafe fn match_url_char_16_neon(ptr: *const u8) -> usize {
     let input = vld1q_u8(ptr);
 
-    // Check that b'!' <= input <= b'~'
-    let result = vandq_u8(
-        vcleq_u8(vdupq_n_u8(b'!'), input),
-        vcleq_u8(input, vdupq_n_u8(b'~')),
-    );
-    // Check that input != b'<' and input != b'>'
-    let lt = vceqq_u8(input, vdupq_n_u8(b'<'));
-    let gt = vceqq_u8(input, vdupq_n_u8(b'>'));
-    let ltgt = vorrq_u8(lt, gt);
-    // Nand with result
-    let result = vbicq_u8(result, ltgt);
+    // Check that b'!' <= input; DEL (0x7F) is excluded below
+    let result = vcleq_u8(vdupq_n_u8(b'!'), input);
+
+    // Disallow del
+    let del = vceqq_u8(input, vdupq_n_u8(0x7F));
+    let result = vbicq_u8(result, del);
 
     offsetz(result) as usize
 }
diff --git a/src/simd/sse42.rs b/src/simd/sse42.rs
index d6fbf02..0fabdfe 100644
--- a/src/simd/sse42.rs
+++ b/src/simd/sse42.rs
@@ -4,6 +4,7 @@ use crate::iter::Bytes;
 pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
     while bytes.as_ref().len() >= 16 {
         let advance = match_url_char_16_sse(bytes.as_ref());
+
         bytes.advance(advance);
 
         if advance != 16 {
@@ -14,7 +15,7 @@ pub unsafe fn match_uri_vectored(bytes: &mut Bytes) {
 }
 
 #[inline(always)]
-#[allow(non_snake_case, overflowing_literals)]
+#[allow(non_snake_case)]
 unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {
     debug_assert!(buf.len() >= 16);
 
@@ -25,40 +26,19 @@ unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize {
 
     let ptr = buf.as_ptr();
 
-    let LSH: __m128i = _mm_set1_epi8(0x0f);
-
-    // The first 0xf8 corresponds to the 8 first rows of the first column
-    // of URI_MAP in the crate's root, with the first row corresponding to bit 0
-    // and the 8th row corresponding to bit 7.
-    // The 8 first rows give 0 0 0 1 1 1 1 1, which is 0xf8 (with least
-    // significant digit on the left).
-    //
-    // Another example just to drive the point home: in column 15, '>' is
-    // rejected, so the values are 0 0 1 0 1 1 1 1, which gives us 0xf4.
-    //
-    // Thanks to Vlad Krasnov for explaining this stuff to us mere mortals in
-    // a GitHub comment!
-    //
-    // https://github.com/seanmonstar/httparse/pull/89#issuecomment-807039219
-
-    let URI: __m128i = _mm_setr_epi8(
-        0xf8, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
-        0xfc, 0xfc, 0xfc, 0xfc, 0xf4, 0xfc, 0xf4, 0x7c,
-    );
-    let ARF: __m128i = _mm_setr_epi8(
-        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-    );
-
-    let data = _mm_lddqu_si128(ptr as *const _);
-    let rbms = _mm_shuffle_epi8(URI, data);
-    let cols = _mm_and_si128(LSH, _mm_srli_epi16(data, 4));
-    let bits = _mm_and_si128(_mm_shuffle_epi8(ARF, cols), rbms);
-
-    let v = _mm_cmpeq_epi8(bits, _mm_setzero_si128());
-    let r = _mm_movemask_epi8(v) as u16;
-
-    r.trailing_zeros() as usize
+    // %x21-%x7e %x80-%xff
+    let DEL: __m128i = _mm_set1_epi8(0x7f);
+    let LOW: __m128i = _mm_set1_epi8(0x21);
+
+    let dat = _mm_lddqu_si128(ptr as *const _);
+    // unsigned comparison dat >= LOW
+    let low = _mm_cmpeq_epi8(_mm_max_epu8(dat, LOW), dat);
+    let del = _mm_cmpeq_epi8(dat, DEL);
+    let bit = _mm_andnot_si128(del, low);
+    let res = _mm_movemask_epi8(bit) as u16;
+
+    // TODO: use .trailing_ones() once MSRV >= 1.46
+    (!res).trailing_zeros() as usize
 }
 
 #[target_feature(enable = "sse4.2")]
diff --git a/src/simd/swar.rs b/src/simd/swar.rs
index 857fc58..5925d62 100644
--- a/src/simd/swar.rs
+++ b/src/simd/swar.rs
@@ -106,7 +106,7 @@ fn match_block(f: impl Fn(u8) -> bool, block: ByteBlock) -> usize {
 // A const alternative to u64::from_ne_bytes to avoid bumping MSRV (1.36 => 1.44)
 // creates a u64 whose bytes are each equal to b
 const fn uniform_block(b: u8) -> usize {
-    (b as u64 *  0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize 
+    (b as u64 *  0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize
 }
 
 // A byte-wise range-check on an enire word/block,