Unidecode non-ASCII input texts
gbenson committed May 20, 2024
1 parent f78d610 commit 2910462
Showing 2 changed files with 32 additions and 4 deletions.
pyproject.toml (4 changes: 3 additions & 1 deletion)
@@ -1,6 +1,6 @@
[project]
name = "dom-tokenizers"
version = "0.0.8"
version = "0.0.9"
authors = [{ name = "Gary Benson" }]
description = "DOM-aware tokenizers for 🤗 Hugging Face language models"
readme = "README.md"
@@ -27,6 +27,7 @@ dependencies = [
"python-magic",
"tokenizers",
"transformers",
"unidecode",
]

[project.urls]
@@ -50,6 +51,7 @@ train = [
[project.scripts]
train-tokenizer = "dom_tokenizers.train:main"
dump-tokenizations = "dom_tokenizers.dump:main"
diff-tokenizer = "dom_tokenizers.diff:main"
tokenizer-diff = "dom_tokenizers.diff:main"

[build-system]
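Both of the new [project.scripts] entries above resolve to the same function, dom_tokenizers.diff:main, so an install exposes two equivalent command names. As a rough, hypothetical sketch (not code from this repository), the launcher pip generates for either name behaves like:

import sys

from dom_tokenizers.diff import main

if __name__ == "__main__":
    # Run the package's diff entry point and propagate its exit status
    sys.exit(main())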
src/dom_tokenizers/pre_tokenizers/dom_snapshot.py (32 changes: 29 additions & 3 deletions)
@@ -13,6 +13,7 @@
import magic

from tokenizers import NormalizedString, PreTokenizedString
from unidecode import unidecode


class DOMSnapshotPreTokenizer:
@@ -99,8 +100,7 @@ def base64_matcher(min_encoded_len=24):

class TokenEmitter:
MAXWORDLEN = 32
WORD_RE = re.compile(
r"[a-z0-9]+(?:[a-z0-9']*[a-z0-9])?") # XXX English only :(
WORD_RE = re.compile(r"\w+(?:['’]\w+)*")
ESCAPED_RE = re.compile(
r"((?:%|\\x|\\u[0-9a-f]{2})[0-9a-f]{2})", re.I)
HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$")
@@ -184,7 +184,33 @@ def _split(self, text):
text = text[limit:]

def _split_words(self, text):
return self.WORD_RE.findall(text.lower())
# self.WORD_RE uses "\w" to match all Unicode alphanumerics, but
# that also matches "_", which we don't want, so we zap them here
text = text.replace("_", " ")

# We currently limit the characters in tokens to a small subset
# of ASCII. Allowing any Unicode alphanumeric massively inflates
# the tokenizer's base vocabulary, from 68 symbols to 1145 with
# gbenson/interesting-dom-snapshots, and that's a small dataset
# of which only a small fraction uses non-Latin alphabets. If
# nothing else this means we need a larger vocabulary and hence
# more complex models, and it doesn't make sense to take that
# hit without a more representative corpus or any way to create
# or validate one. Until then, we use unidecode to transliterate
# non-ASCII characters, as a way to get meaning into embeddings
# of non-Latin-alphabet texts. It's by no means perfect (see
# https://pypi.org/project/Unidecode/#frequently-asked-questions
# for e.g. its issues with CJK languages), but transliteration
# captures at least some of the meaning, meaning we would lose
# entirely if we just dropped all the non-ASCII on the floor.
# It also means we generate tokenizers that can encode pretty
# much anything, at least from the BMP.
words = []
for word in self.WORD_RE.findall(text):
if word.isascii():
words.append(word)
else:
words.extend(unidecode(word).split())
return [word.lower() for word in words]

def _match_urlish_base64(self, encoded):
urlish = "/".join(self.URLISH_RE.findall(encoded))
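The comment added to _split_words above is easiest to see with a concrete example. Below is a minimal standalone sketch (not part of dom-tokenizers itself): the split_words() helper mirrors the logic added to TokenEmitter._split_words, and the expected outputs in the comments are assumptions based on Unidecode's documented transliterations.

import re

from unidecode import unidecode

# Same pattern as the new TokenEmitter.WORD_RE
WORD_RE = re.compile(r"\w+(?:['’]\w+)*")


def split_words(text):
    # "\w" also matches "_", which we don't want inside words
    text = text.replace("_", " ")
    words = []
    for word in WORD_RE.findall(text):
        if word.isascii():
            words.append(word)
        else:
            # Transliterate non-ASCII words to their closest ASCII form
            words.extend(unidecode(word).split())
    return [word.lower() for word in words]


print(split_words("Don't panic"))      # ["don't", 'panic']
print(split_words("snake_case_name"))  # ['snake', 'case', 'name']
print(split_words("café für señor"))   # ['cafe', 'fur', 'senor']
print(split_words("Привет мир"))       # ['privet', 'mir']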
