Unidecode non-ASCII input texts
gbenson committed May 20, 2024
1 parent f78d610 commit 2910462
Showing 2 changed files with 32 additions and 4 deletions.
pyproject.toml (4 changes: 3 additions & 1 deletion)
@@ -1,6 +1,6 @@
[project]
name = "dom-tokenizers"
version = "0.0.8"
version = "0.0.9"
authors = [{ name = "Gary Benson" }]
description = "DOM-aware tokenizers for 🤗 Hugging Face language models"
readme = "README.md"
@@ -27,6 +27,7 @@ dependencies = [
"python-magic",
"tokenizers",
"transformers",
"unidecode",
]

[project.urls]
@@ -50,6 +51,7 @@ train = [
[project.scripts]
train-tokenizer = "dom_tokenizers.train:main"
dump-tokenizations = "dom_tokenizers.dump:main"
diff-tokenizer = "dom_tokenizers.diff:main"
tokenizer-diff = "dom_tokenizers.diff:main"

[build-system]
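Both of the new [project.scripts] entries above resolve to the same function, dom_tokenizers.diff:main, so an install exposes two equivalent command names. As a rough, hypothetical sketch (not code from this repository), the launcher pip generates for either name behaves like:

import sys

from dom_tokenizers.diff import main

if __name__ == "__main__":
    # Run the package's diff entry point and propagate its exit status
    sys.exit(main())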
src/dom_tokenizers/pre_tokenizers/dom_snapshot.py (32 changes: 29 additions & 3 deletions)
@@ -13,6 +13,7 @@
import magic

from tokenizers import NormalizedString, PreTokenizedString
from unidecode import unidecode


class DOMSnapshotPreTokenizer:
@@ -99,8 +100,7 @@ def base64_matcher(min_encoded_len=24):

class TokenEmitter:
MAXWORDLEN = 32
WORD_RE = re.compile(
r"[a-z0-9]+(?:[a-z0-9']*[a-z0-9])?") # XXX English only :(
WORD_RE = re.compile(r"\w+(?:['’]\w+)*")
ESCAPED_RE = re.compile(
r"((?:%|\\x|\\u[0-9a-f]{2})[0-9a-f]{2})", re.I)
HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$")
@@ -184,7 +184,33 @@ def _split(self, text):
text = text[limit:]

def _split_words(self, text):
return self.WORD_RE.findall(text.lower())
# self.WORD_RE uses "\w" to match all Unicode alphanumerics, but
# that also matches "_", which we don't want, so we zap them here
text = text.replace("_", " ")

# We currently limit the characters in tokens to a small subset
# of ASCII. Allowing any Unicode alphanumeric massively inflates
# the tokenizer's base vocabulary, from 68 symbols to 1145 with
# gbenson/interesting-dom-snapshots, and that's a small dataset
# of which only a small fraction uses non-Latin alphabets. If
# nothing else this means we need a larger vocabulary and hence
# more complex models, and it doesn't make sense to take that
# hit without a more representative corpus or any way to create
# or validate one. Until then, we use unidecode to transliterate
# non-ASCII characters, as a way to get meaning into embeddings
# of non-Latin-alphabet texts. It's by no means perfect (see
# https://pypi.org/project/Unidecode/#frequently-asked-questions
# for e.g. its issues with CJK languages), but transliteration
# captures at least some of the meaning, meaning we would lose
# entirely if we just dropped all the non-ASCII on the floor.
# It also means we generate tokenizers that can encode pretty
# much anything, at least from the BMP.
words = []
for word in self.WORD_RE.findall(text):
if word.isascii():
words.append(word)
else:
words.extend(unidecode(word).split())
return [word.lower() for word in words]

def _match_urlish_base64(self, encoded):
urlish = "/".join(self.URLISH_RE.findall(encoded))
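The comment added to _split_words above is easiest to see with a concrete example. Below is a minimal standalone sketch (not part of dom-tokenizers itself): the split_words() helper mirrors the logic added to TokenEmitter._split_words, and the expected outputs in the comments are assumptions based on Unidecode's documented transliterations.

import re

from unidecode import unidecode

# Same pattern as the new TokenEmitter.WORD_RE
WORD_RE = re.compile(r"\w+(?:['’]\w+)*")


def split_words(text):
    # "\w" also matches "_", which we don't want inside words
    text = text.replace("_", " ")
    words = []
    for word in WORD_RE.findall(text):
        if word.isascii():
            words.append(word)
        else:
            # Transliterate non-ASCII words to their closest ASCII form
            words.extend(unidecode(word).split())
    return [word.lower() for word in words]


print(split_words("Don't panic"))      # ["don't", 'panic']
print(split_words("snake_case_name"))  # ['snake', 'case', 'name']
print(split_words("café für señor"))   # ['cafe', 'fur', 'senor']
print(split_words("Привет мир"))       # ['privet', 'mir']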
