Skip to content

Commit

Permalink
Don't sniff decimal or hex constants as base64
Browse files: browse the repository at this point in the history
  • Loading branch information
gbenson committed Jun 2, 2024
1 parent 2c24a89 commit 347737f
Showing 1 changed file with 19 additions and 1 deletion.
20 changes: 19 additions & 1 deletion src/dom_tokenizers/pre_tokenizers/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def special_tokens(self) -> Iterable[str]:
# XXX older bits
# NOTE(review): presumably an upper bound on word length; its use is not
# visible in this excerpt -- confirm against the callers.
MAXWORDLEN = 32
# One or more runs of word characters, each run optionally followed by a
# single apostrophe (ASCII ' or typographic ’).
WORD_RE = re.compile(r"(?:\w+['’]?)+")
# Case-sensitive form of HEX_RE; rebound by the assignment that follows.
HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$")
# Whole-string pattern for hex-looking text: either a "0x" prefix or two
# hex digits, followed by at least six more hex digits.  re.I makes the
# digits A-F and the "0X" prefix match in either case.
HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$", re.I)
# A single decimal digit.
DIGIT_RE = re.compile(r"\d")
# NOTE(review): the two URL-ish limits below are used outside this
# excerpt; their units and exact meaning should be confirmed there.
LONGEST_URLISH = 1024  # XXX?
URLISH_LOOKBACK = 5
Expand All @@ -85,6 +85,7 @@ def special_tokens(self) -> Iterable[str]:
# NOTE(review): used outside this excerpt; meaning not visible here.
LONGEST_PHITEST = 85
# Pattern object returned by base64_matcher() (defined elsewhere); split()
# calls .match() on it to decide whether a token might be base64.
BASE64_RE = base64_matcher()
# Literal "iVBORw0KGg" followed by one of "o", "p", "q" or "r".  That is
# how the 8-byte PNG file signature looks once base64-encoded: the final
# character varies with the top two bits of the byte after the signature.
B64_PNG_RE = re.compile(r"iVBORw0KGg[o-r]")
# Whole-string, case-insensitive match for text made up solely of hex
# digits with an optional "0x" prefix.  Group 1 is the prefix (None when
# absent) and group 2 is the digit run.  split() uses it to avoid
# treating plain hex constants as base64.
B64_HEX_RE = re.compile(r"^(0x)?([0-9a-f]+)$", re.I)
# "<", then a captured lowercase name of three or more letters, then
# whitespace and at least one more lowercase letter.
XML_HDR_RE = re.compile(r"<([a-z]{3,})\s+[a-z]+")

def split(self, text: str) -> Iterable[str]:
Expand Down Expand Up @@ -161,6 +162,23 @@ def split(self, text: str) -> Iterable[str]:

# Are we looking at something that might be base64?
if self.BASE64_RE.match(curr):
if curr.isdecimal():
if VERBOSE: # pragma: no cover
debug("it's a decimal number")
cursor += 1
continue

match = self.B64_HEX_RE.match(curr)
if match:
if VERBOSE: # pragma: no cover
debug("it's hex")
new_splits = match.groups()
if new_splits[0] is not None:
splits[cursor:cursor+1] = new_splits
cursor += 1
cursor += 1
continue

cursor = self._sub_base64(splits, cursor)
continue

Expand Down

0 comments on commit 347737f

Please sign in to comment.