Skip to content

Commit

Permalink
*** replace tokens?
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed May 23, 2024
1 parent 5a3e0f1 commit 3460b24
Showing 1 changed file with 23 additions and 2 deletions.
25 changes: 23 additions & 2 deletions src/dom_tokenizers/pre_tokenizers/dom_snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,29 @@ class DOMSnapshotPreTokenizer(PreTokenizer):
"""Pre-tokenizer that consumes JSON-serialized DOM snapshots
and emits tokenized representations of the snapshotted DOMs.
"""
# Marker strings used by this pre-tokenizer. Each comment states what
# the marker signals about the item that follows it.
# NOTE(review): `elem_token` and `attr_token` are assigned here and then
# rebound a few lines below, so the "[TAG]" / "[ATTR]" values do not
# survive -- confirm whether these first two assignments are still wanted.
elem_token = "[TAG]" # beginning of element name
attr_token = "[ATTR]" # beginning of attribute
open_tag_token = "<" # opening tag: an element name is next
close_tag_token = "/" # closing tag: an element name is next
attr_name_token = "_" # an attribute name is next
attr_value_token = "=" # an attribute value is next
tag_done_token = ">" # end of open/close element

# Rebind the older names to the new single-character markers, so that
# `elem_token` is now "<" and `attr_token` is now "_".
# Flagged XXX by the author -- presumably provisional; confirm.
elem_token = open_tag_token # XXX
attr_token = attr_name_token # XXX

# Alternative scheme under consideration: instead of separate marker
# tokens, emit tokens that already carry their prefix/suffix, like:
# NOTE(review): "[base64]" below is lowercase, whereas `base64_token`
# further down is "[BASE64]" -- confirm which spelling is intended.
tokens = [
"<html>",
"<a",
"_href=",
">",
# text,
"[base64]",
"</a>",
"</html>",
]

# presumably marks the start of a comment, by analogy with the
# open/close tag markers above -- TODO confirm
start_comment_token = "!"

comm_token = "[COMMENT]" # beginning of comment
base64_token = "[BASE64]" # beginning of some base64
long_token = "[LONG]" # elided long token
Expand Down

0 comments on commit 3460b24

Please sign in to comment.