Skip to content

Commit

Permalink
Support hooking into models
Browse files (browse the repository at this point in the history)
  • Loading branch information
gbenson committed May 22, 2024
1 parent c51f6b4 commit 4f06134
Showing 1 changed file with 18 additions and 7 deletions.
25 changes: 18 additions & 7 deletions src/dom_tokenizers/pre_tokenizers/pre_tokenizer.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -5,24 +5,35 @@

class BasePreTokenizer:
@classmethod
def hook_into(cls, tokenizer):
"""Reconfigure `tokenizer` for DOM-aware pre-tokenization.
def hook_into(cls, model_or_tokenizer):
"""Reconfigure `model_or_tokenizer` for DOM-aware pre-tokenization.
"""
cls().bind_to(tokenizer)
cls().bind_to(model_or_tokenizer)

def __init__(self):
self._tokenizer = None
self._model = None
self._lowercase_output = False

def bind_to(self, tokenizer):
"""Reconfigure `tokenizer` to pre-tokenize using `self`.
def bind_to(self, model_or_tokenizer):
"""Reconfigure `model_or_tokenizer` to pre-tokenize using `self`.
"""
if self._tokenizer is not None:
raise RuntimeError("already bound")

try:
backend = tokenizer.backend_tokenizer
backend = model_or_tokenizer.backend_tokenizer
tokenizer = model_or_tokenizer
except AttributeError as e:
raise TypeError("not a tokenizer") from e
try:
tokenizer = model_or_tokenizer.tokenizer
backend = tokenizer.backend_tokenizer
self._model = weakref.proxy(model_or_tokenizer)
except AttributeError:
pass
raise TypeError("not a tokenizer or model") from e
del model_or_tokenizer

if hasattr(tokenizer, "dom_pre_tokenizer"):
raise RuntimeError("already bound")

Expand Down

0 comments on commit 4f06134

Please sign in to comment.