diff --git a/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py b/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py
index 0658135..ae3aa99 100644
--- a/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py
+++ b/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py
@@ -22,9 +22,6 @@ class DOMSnapshotPreTokenizer(PreTokenizer):
     """Pre-tokenizer that consumes JSON-serialized DOM snapshots
    and emits tokenized representations of the snapshotted DOMs.
    """
-    bos_token = "[BOS]"  # beginning of sequence
-    eos_token = "[EOS]"  # end of sequence
-    sep_token = "[SEP]"  # separator between documents
     elem_token = "[TAG]"  # beginning of element name
     attr_token = "[ATTR]"  # beginning of attribute
     comm_token = "[COMMENT]"  # beginning of comment
@@ -59,9 +56,6 @@ def _split_serialized(self, snapshot: dict) -> Iterable[List[NormalizedString]]:
         attr_token = [NormalizedString(self.attr_token)]

         for document_index, document in enumerate(snapshot["documents"]):
-            token = self.bos_token if document_index == 0 else self.sep_token
-            yield [NormalizedString(token)]
-
             nodes = document["nodes"]
             for node_index, node_values in enumerate(zip(
                     nodes["nodeType"],
@@ -86,8 +80,6 @@
                     yield [NormalizedString(self.comm_token)]
                     yield emitter.emit(value_index)

-        yield [NormalizedString(self.eos_token)]
-
     _B64_RE_S = r"(?:[A-Za-z0-9+/]{4}){"
     _B64_RE_E = r",}(?:[A-Za-z0-9+/]{3}=|[A-Za-z0-9+/]{2}==)?"