Skip to content

Commit

Permalink
*** replace tokens?
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed May 22, 2024
1 parent 6df3b80 commit 25ee938
Showing 1 changed file with 20 additions and 5 deletions.
25 changes: 20 additions & 5 deletions src/dom_tokenizers/pre_tokenizers/dom_snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,26 @@ class DOMSnapshotPreTokenizer(PreTokenizer):
"""Pre-tokenizer that consumes JSON-serialized DOM snapshots
and emits tokenized representations of the snapshotted DOMs.
"""
bos_token = "[BOS]" # beginning of sequence
eos_token = "[EOS]" # end of sequence
sep_token = "[SEP]" # separator between documents
elem_token = "[TAG]" # beginning of element name
attr_token = "[ATTR]" # beginning of attribute
open_tag_token = "<" # an element name is next
close_tag_token = "/" # an element name is next
attr_name_token = "_" # an attribute name is next
attr_value_token = "=" # an attribute value is next
tag_done_token = ">" # end of open/close element

# Or, alternatively, use prefixed tokens, like:
tokens = [
"<html>",
"<a",
"_href=",
">",
text,
[base64]
"</a>",
"</html>",
]

start_comment_token = "!"

comm_token = "[COMMENT]" # beginning of comment
base64_token = "[BASE64]" # beginning of some base64
long_token = "[LONG]" # elided long token
Expand Down

0 comments on commit 25ee938

Please sign in to comment.