Skip to content

Commit

Permalink
Don't reuse other models' special tokens
Browse files — browse the repository at this point in the history
  • Loading branch information
gbenson committed May 22, 2024
1 parent 111ce9a commit 63360ca
Showing 1 changed file with 0 additions and 8 deletions.
8 changes: 0 additions & 8 deletions src/dom_tokenizers/pre_tokenizers/dom_snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,6 @@ class DOMSnapshotPreTokenizer(PreTokenizer):
"""Pre-tokenizer that consumes JSON-serialized DOM snapshots
and emits tokenized representations of the snapshotted DOMs.
"""
bos_token = "[BOS]" # beginning of sequence
eos_token = "[EOS]" # end of sequence
sep_token = "[SEP]" # separator between documents
elem_token = "[TAG]" # beginning of element name
attr_token = "[ATTR]" # beginning of attribute
comm_token = "[COMMENT]" # beginning of comment
Expand Down Expand Up @@ -59,9 +56,6 @@ def _split_serialized(self, snapshot: dict) -> Iterable[List[NormalizedString]]:
attr_token = [NormalizedString(self.attr_token)]

for document_index, document in enumerate(snapshot["documents"]):
token = self.bos_token if document_index == 0 else self.sep_token
yield [NormalizedString(token)]

nodes = document["nodes"]
for node_index, node_values in enumerate(zip(
nodes["nodeType"],
Expand All @@ -86,8 +80,6 @@ def _split_serialized(self, snapshot: dict) -> Iterable[List[NormalizedString]]:
yield [NormalizedString(self.comm_token)]
yield emitter.emit(value_index)

yield [NormalizedString(self.eos_token)]


_B64_RE_S = r"(?:[A-Za-z0-9+/]{4}){"
_B64_RE_E = r",}(?:[A-Za-z0-9+/]{3}=|[A-Za-z0-9+/]{2}==)?"
Expand Down

0 comments on commit 63360ca

Please sign in to comment.