From a1dbd566fcf4e684d94b6c5b9435e1b428c0ea3a Mon Sep 17 00:00:00 2001 From: mdecimus Date: Wed, 16 Oct 2024 10:49:01 +0200 Subject: [PATCH] Minor tokenizer improvements --- Cargo.lock | 69 ++++++++++++++---------------- crates/nlp/src/tokenizers/types.rs | 42 ++++-------------- 2 files changed, 42 insertions(+), 69 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 491978169..e78a971f3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1066,7 +1066,7 @@ dependencies = [ "dns-update", "futures", "hostname 0.4.0", - "hyper 1.4.1", + "hyper 1.5.0", "idna 1.0.2", "imagesize", "imap_proto", @@ -1606,18 +1606,18 @@ dependencies = [ [[package]] name = "derive_builder" -version = "0.20.1" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd33f37ee6a119146a1781d3356a7c26028f83d779b2e04ecd45fdc75c76877b" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" dependencies = [ "derive_builder_macro", ] [[package]] name = "derive_builder_core" -version = "0.20.1" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7431fa049613920234f22c47fdc33e6cf3ee83067091ea4277a3f8c4587aae38" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" dependencies = [ "darling 0.20.10", "proc-macro2", @@ -1627,9 +1627,9 @@ dependencies = [ [[package]] name = "derive_builder_macro" -version = "0.20.1" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", "syn 2.0.79", @@ -2707,9 +2707,9 @@ checksum = "9994b79e8c1a39b3166c63ae7823bb2b00831e2a96a31399c50fe69df408eaeb" [[package]] name = "hyper" -version = "0.14.30" +version = "0.14.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9" +checksum = "8c08302e8fa335b151b788c775ff56e7a03ae64ff85c548ee820fecb70356e85" dependencies = [ "bytes", "futures-channel", @@ -2731,9 +2731,9 @@ dependencies = [ [[package]] name = "hyper" -version = "1.4.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" +checksum = "bbbff0a806a4728c99295b254c8838933b5b082d75e3cb70c8dab21fdfbcfa9a" dependencies = [ "bytes", "futures-channel", @@ -2758,7 +2758,7 @@ checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" dependencies = [ "futures-util", "http 0.2.12", - "hyper 0.14.30", + "hyper 0.14.31", "rustls 0.21.12", "tokio", "tokio-rustls 0.24.1", @@ -2772,7 +2772,7 @@ checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" dependencies = [ "futures-util", "http 1.1.0", - "hyper 1.4.1", + "hyper 1.5.0", "hyper-util", "rustls 0.23.14", "rustls-pki-types", @@ -2788,7 +2788,7 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3203a961e5c83b6f5498933e78b6b263e208c197b63e9c6c53cc82ffd3f63793" dependencies = [ - "hyper 1.4.1", + "hyper 1.5.0", "hyper-util", "pin-project-lite", "tokio", @@ -2806,7 +2806,7 @@ dependencies = [ "futures-util", "http 1.1.0", "http-body 1.0.1", - "hyper 1.4.1", + "hyper 1.5.0", "pin-project-lite", "socket2", "tokio", @@ -3241,7 +3241,7 @@ dependencies = [ "futures-util", "hkdf", "http-body-util", - "hyper 1.4.1", + "hyper 1.5.0", "hyper-util", "jmap_proto", "lz4_flex", @@ -4095,12 +4095,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.20.1" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82881c4be219ab5faaf2ad5e5e5ecdff8c66bd7402ca3160975c93b24961afd1" -dependencies = [ - "portable-atomic", -] +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "opaque-debug" @@ -4110,9 +4107,9 @@ checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" [[package]] name = "openssl" -version = "0.10.66" +version = "0.10.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" +checksum = "7b8cefcf97f41316955f9294cd61f639bdcfa9f2f230faac6cb896aa8ab64704" dependencies = [ "bitflags 2.6.0", "cfg-if", @@ -4142,9 +4139,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.103" +version = "0.9.104" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" +checksum = "45abf306cbf99debc8195b66b7346498d7b10c210de50418b5ccd7ceba08c741" dependencies = [ "cc", "libc", @@ -4656,9 +4653,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "b3e4daa0dcf6feba26f985457cdf104d4b4256fc5a09547140f3631bb076b19a" dependencies = [ "unicode-ident", ] @@ -5115,7 +5112,7 @@ dependencies = [ "h2 0.3.26", "http 0.2.12", "http-body 0.4.6", - "hyper 0.14.30", + "hyper 0.14.31", "hyper-rustls 0.24.2", "ipnet", "js-sys", @@ -5159,7 +5156,7 @@ dependencies = [ "http 1.1.0", "http-body 1.0.1", "http-body-util", - "hyper 1.4.1", + "hyper 1.5.0", "hyper-rustls 0.27.3", "hyper-util", "ipnet", @@ -5396,7 +5393,7 @@ dependencies = [ "hex", "hmac 0.12.1", "http 0.2.12", - "hyper 0.14.30", + "hyper 0.14.31", "hyper-rustls 0.24.2", "log", "maybe-async", @@ -6073,7 +6070,7 @@ dependencies = [ "directory", "form_urlencoded", "http-body-util", - "hyper 1.4.1", + "hyper 1.5.0", "hyper-util", "lru-cache", "mail-auth", @@ -6443,7 +6440,7 @@ dependencies = [ "form_urlencoded", "futures", "http-body-util", - "hyper 1.4.1", + "hyper 1.5.0", "hyper-util", "imap", "imap_proto", @@ -6744,7 +6741,7 @@ dependencies = [ "http 1.1.0", "http-body 1.0.1", "http-body-util", - "hyper 1.4.1", + "hyper 1.5.0", "hyper-timeout", "hyper-util", "percent-encoding", @@ -7132,9 +7129,9 @@ dependencies = [ [[package]] name = "uuid" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" +checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" dependencies = [ "getrandom", ] @@ -7352,7 +7349,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] diff --git a/crates/nlp/src/tokenizers/types.rs b/crates/nlp/src/tokenizers/types.rs index e5b776778..41d48c0e8 100644 --- a/crates/nlp/src/tokenizers/types.rs +++ b/crates/nlp/src/tokenizers/types.rs @@ -68,12 +68,7 @@ impl<'x> Iterator for TypesTokenizer<'x> { } // Try parsing email - if self.tokenize_emails && token.word.is_email_atom() - /*&& self.peek_has_tokens( - &[TokenType::Punctuation('@'), TokenType::Punctuation('.')], - TokenType::Space, - )*/ - { + if self.tokenize_emails && token.word.is_email_atom() { self.peek_rewind(); if let Some(email) = self.try_parse_email() { self.peek_advance(); @@ -83,9 +78,7 @@ impl<'x> Iterator for TypesTokenizer<'x> { } // Try parsing URL without scheme - if self.tokenize_urls_without_scheme && token.word.is_domain_atom(true) - //&& self.peek_has_tokens(&[TokenType::Punctuation('.')], TokenType::Space) - { + if self.tokenize_urls_without_scheme && token.word.is_domain_atom(true) { self.peek_rewind(); if let Some(url) = self.try_parse_url(None) { self.peek_advance(); @@ -247,30 +240,6 @@ impl<'x> TypesTokenizer<'x> { self.peek_pos = 0; } - /*fn peek_has_tokens( - &mut self, - tokens: &[TokenType<&'_ str>], - stop_token: impl Fn(&TokenType<&'_ str>) -> bool, - ) -> bool { - let mut tokens = tokens.iter().copied(); - let mut token = tokens.next().unwrap(); - while let Some(t) = self.peek() { - if t.word == token { - if let Some(next_token) = tokens.next() { - token = next_token; - } else { - self.peek_rewind(); - return true; - } - } else if stop_token(&t.word) { - break; - } - } - - self.peek_rewind(); - false - }*/ - fn try_parse_url( &mut self, scheme_token: Option>>, @@ -498,6 +467,9 @@ impl<'x> TypesTokenizer<'x> { // Find local part loop { let token = self.peek()?; + if token.to - start_token.from > 255 { + return None; + } match token.word { word if word.is_email_atom() => { last_is_dot = false; @@ -585,6 +557,10 @@ impl<'x> TypesTokenizer<'x> { } end_pos = token.to; restore_pos = self.peek_pos; + + if end_pos - start_pos > 255 { + return None; + } } self.peek_pos = restore_pos;