Minor tokenizer improvements
mdecimus committed Oct 16, 2024
1 parent 0ee2fe3 commit a1dbd56
Showing 2 changed files with 42 additions and 69 deletions.
Cargo.lock: 69 changes (33 additions & 36 deletions)

Some generated files are not rendered by default.

crates/nlp/src/tokenizers/types.rs: 42 changes (9 additions & 33 deletions)
@@ -68,12 +68,7 @@ impl<'x> Iterator for TypesTokenizer<'x> {
         }
 
         // Try parsing email
-        if self.tokenize_emails && token.word.is_email_atom()
-        /*&& self.peek_has_tokens(
-            &[TokenType::Punctuation('@'), TokenType::Punctuation('.')],
-            TokenType::Space,
-        )*/
-        {
+        if self.tokenize_emails && token.word.is_email_atom() {
             self.peek_rewind();
             if let Some(email) = self.try_parse_email() {
                 self.peek_advance();
@@ -83,9 +78,7 @@ impl<'x> Iterator for TypesTokenizer<'x> {
         }
 
         // Try parsing URL without scheme
-        if self.tokenize_urls_without_scheme && token.word.is_domain_atom(true)
-        //&& self.peek_has_tokens(&[TokenType::Punctuation('.')], TokenType::Space)
-        {
+        if self.tokenize_urls_without_scheme && token.word.is_domain_atom(true) {
             self.peek_rewind();
             if let Some(url) = self.try_parse_url(None) {
                 self.peek_advance();
@@ -247,30 +240,6 @@ impl<'x> TypesTokenizer<'x> {
         self.peek_pos = 0;
     }
 
-    /*fn peek_has_tokens(
-        &mut self,
-        tokens: &[TokenType<&'_ str>],
-        stop_token: impl Fn(&TokenType<&'_ str>) -> bool,
-    ) -> bool {
-        let mut tokens = tokens.iter().copied();
-        let mut token = tokens.next().unwrap();
-        while let Some(t) = self.peek() {
-            if t.word == token {
-                if let Some(next_token) = tokens.next() {
-                    token = next_token;
-                } else {
-                    self.peek_rewind();
-                    return true;
-                }
-            } else if stop_token(&t.word) {
-                break;
-            }
-        }
-        self.peek_rewind();
-        false
-    }*/
-
     fn try_parse_url(
         &mut self,
         scheme_token: Option<Token<TokenType<&'_ str>>>,
@@ -498,6 +467,9 @@ impl<'x> TypesTokenizer<'x> {
         // Find local part
         loop {
             let token = self.peek()?;
+            if token.to - start_token.from > 255 {
+                return None;
+            }
             match token.word {
                 word if word.is_email_atom() => {
                     last_is_dot = false;
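
The guard added here caps how far the tokenizer scans while assembling an email local part: once the span from the starting atom exceeds 255 characters, the candidate is abandoned (the parse returns None), so the text presumably falls through to ordinary word tokens instead of becoming one huge email token. Below is a minimal standalone sketch of the same budget idea; the scan_local_part helper, its accepted character set, and applying the cutoff to character positions rather than token byte offsets are illustrative assumptions, not the crate's API.

// Illustrative sketch only: mirror the `token.to - start_token.from > 255`
// guard by abandoning an email candidate once the scanned span exceeds a
// fixed budget. None of these names come from the nlp crate.
fn scan_local_part(text: &str) -> Option<&str> {
    const MAX_SPAN: usize = 255;
    for (idx, ch) in text.char_indices() {
        // Give up as soon as the scanned span outgrows the budget.
        if idx > MAX_SPAN {
            return None;
        }
        match ch {
            '@' => return if idx > 0 { Some(&text[..idx]) } else { None },
            c if c.is_ascii_alphanumeric() || "+-_.".contains(c) => {}
            _ => return None,
        }
    }
    None
}

fn main() {
    assert_eq!(scan_local_part("user.name@example.org"), Some("user.name"));
    let oversized = "a".repeat(300) + "@example.org";
    assert_eq!(scan_local_part(&oversized), None); // rejected by the span budget
}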
Expand Down Expand Up @@ -585,6 +557,10 @@ impl<'x> TypesTokenizer<'x> {
}
end_pos = token.to;
restore_pos = self.peek_pos;

if end_pos - start_pos > 255 {
return None;
}
}
self.peek_pos = restore_pos;

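The second guard applies the same 255-character cap while the tokenizer extends a candidate span (the end_pos - start_pos check), so unbounded dot-separated runs no longer get absorbed into a single URL or address token. A hedged sketch of the pattern follows, using a hypothetical Span type with from/to byte offsets in place of the tokenizer's tokens; none of these names come from the crate.

// Hypothetical stand-in for the tokenizer's tokens: byte offsets only.
struct Span {
    from: usize,
    to: usize,
}

// Mirror the `end_pos - start_pos > 255` guard: extend the candidate span
// token by token and abandon it once it exceeds the byte budget.
fn consume_within_budget(spans: &[Span]) -> Option<usize> {
    let start_pos = spans.first()?.from;
    let mut end_pos = start_pos;
    for span in spans {
        end_pos = span.to;
        if end_pos - start_pos > 255 {
            return None; // candidate grew too large; give up on it
        }
    }
    Some(end_pos)
}

fn main() {
    let short = [Span { from: 0, to: 10 }, Span { from: 11, to: 30 }];
    assert_eq!(consume_within_budget(&short), Some(30));

    let long = [Span { from: 0, to: 200 }, Span { from: 201, to: 400 }];
    assert_eq!(consume_within_budget(&long), None);
}

Together with dropping the commented-out peek_has_tokens lookahead, email and URL detection now always attempts the full parse from a candidate atom and relies on these span budgets to keep the worst case bounded.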
