Skip to content

Commit

Permalink
'fixed:jieba'
Browse files Browse the repository at this point in the history
  • Loading branch information
lyirs committed Apr 28, 2024
1 parent 7807b19 commit df21c83
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion rasa/nlu/tokenizers/jieba_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,15 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
         text = message.get(attribute)

         tokenized = jieba.tokenize(text)
-        tokens = [Token(word, start) for (word, start, end) in tokenized]
+        tokens = []
+        current_position = 0
+        for word, start, end in tokenized:
+            if word.strip() == "":
+                continue
+            word_start = text.find(word, current_position)
+            word_end = word_start + len(word)
+            tokens.append(Token(word, word_start, word_end))
+            current_position = word_end

         return self._apply_token_pattern(tokens)

Expand Down

0 comments on commit df21c83

Please sign in to comment.