Skip to content

Commit

Permalink
Add case-insensitivity and early-abort for small words when none exist.
Browse files Browse the repository at this point in the history
  • Loading branch information
dmsnell committed May 14, 2024
1 parent f2cab96 commit aa78d9b
Showing 1 changed file with 47 additions and 21 deletions.
68 changes: 47 additions & 21 deletions src/wp-includes/class-wp-token-map.php
Original file line number Diff line number Diff line change
Expand Up @@ -355,12 +355,20 @@ public static function from_precomputed_table( $key_length, $groups, $large_word
*
* @since 6.6.0
*
* @param string $word Determine if this word is a lookup key in the map.
* @param string $word Determine if this word is a lookup key in the map.
* @param ?string $case_sensitivity 'case-insensitive' to ignore ASCII case or default of 'case-sensitive'.
* @return bool Whether there's an entry for the given word in the map.
*/
public function contains( $word ) {
public function contains( $word, $case_sensitivity = 'case-sensitive' ) {
$ignore_case = 'case-insensitive' === $case_sensitivity;

if ( $this->key_length >= strlen( $word ) ) {
$word_at = strpos( $this->small_words, str_pad( $word, $this->key_length + 1, "\x00" ), STR_PAD_RIGHT );
if ( 0 === strlen( $this->small_words ) ) {
return false;
}

$term = str_pad( $word, $this->key_length + 1, "\x00", STR_PAD_RIGHT );
$word_at = $ignore_case ? stripos( $this->small_words, $term ) : strpos( $this->small_words, $term );
if ( false === $word_at ) {
return false;
}
Expand All @@ -369,7 +377,7 @@ public function contains( $word ) {
}

$group_key = substr( $word, 0, $this->key_length );
$group_at = strpos( $this->groups, $group_key );
$group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key );
if ( false === $group_at ) {
return false;
}
Expand All @@ -386,7 +394,7 @@ public function contains( $word ) {
$mapping_length = unpack( 'C', $group[ $at++ ] )[1];
$mapping_at = $at;

if ( $token_length === $length && 0 === substr_compare( $group, $slug, $token_at, $token_length ) ) {
if ( $token_length === $length && 0 === substr_compare( $group, $slug, $token_at, $token_length, $ignore_case ) ) {
return true;
}

Expand Down Expand Up @@ -432,22 +440,26 @@ public function contains( $word ) {
*
* @since 6.6.0
*
* @param string $text String in which to search for a lookup key.
* @param ?int $offset How many bytes into the string where the lookup key ought to start.
* @param ?int &$skip_bytes Holds byte-length of found lookup key if matched, otherwise not set.
* @param string $text String in which to search for a lookup key.
* @param ?int $offset How many bytes into the string where the lookup key ought to start.
* @param ?int &$skip_bytes Holds byte-length of found lookup key if matched, otherwise not set.
* @param ?string $case_sensitivity 'case-insensitive' to ignore ASCII case or default of 'case-sensitive'.
* @return string|false Mapped value of lookup key if found, otherwise `false`.
*/
public function read_token( $text, $offset = 0, &$skip_bytes = null ) {
public function read_token( $text, $offset = 0, &$skip_bytes = null, $case_sensitivity = 'case-sensitive' ) {
$ignore_case = 'case-insensitive' === $case_sensitivity;
$text_length = strlen( $text );

// Search for a long word first, if the text is long enough, and if that fails, a short one.
if ( $text_length > $this->key_length ) {
$group_key = substr( $text, $offset, $this->key_length );

$group_at = strpos( $this->groups, $group_key );
$group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key );
if ( false === $group_at ) {
// Perhaps a short word then.
return $this->read_small_token( $text, $offset, $skip_bytes );
return strlen( $this->small_words ) > 0
? $this->read_small_token( $text, $offset, $skip_bytes, $case_sensitivity )
: false;
}

$group = $this->large_words[ $group_at / ( $this->key_length + 1 ) ];
Expand All @@ -460,7 +472,7 @@ public function read_token( $text, $offset = 0, &$skip_bytes = null ) {
$mapping_length = unpack( 'C', $group[ $at++ ] )[1];
$mapping_at = $at;

if ( 0 === substr_compare( $text, $token, $offset + $this->key_length, $token_length ) ) {
if ( 0 === substr_compare( $text, $token, $offset + $this->key_length, $token_length, $ignore_case ) ) {
$skip_bytes = $this->key_length + $token_length;
return substr( $group, $mapping_at, $mapping_length );
}
Expand All @@ -470,26 +482,37 @@ public function read_token( $text, $offset = 0, &$skip_bytes = null ) {
}

// Perhaps a short word then.
return $this->read_small_token( $text, $offset, $skip_bytes );
return strlen( $this->small_words ) > 0
? $this->read_small_token( $text, $offset, $skip_bytes, $case_sensitivity )
: false;
}

/**
* Finds a match for a short word at the index.
*
* @since 6.6.0
*
* @param string $text String in which to search for a lookup key.
* @param ?int $offset How many bytes into the string where the lookup key ought to start.
* @param ?int &$skip_bytes Holds byte-length of found lookup key if matched, otherwise not set.
* @param string $text String in which to search for a lookup key.
* @param ?int $offset How many bytes into the string where the lookup key ought to start.
* @param ?int &$skip_bytes Holds byte-length of found lookup key if matched, otherwise not set.
* @param ?string $case_sensitivity 'case-insensitive' to ignore ASCII case or default of 'case-sensitive'.
* @return string|false Mapped value of lookup key if found, otherwise `false`.
*/
private function read_small_token( $text, $offset, &$skip_bytes ) {
$small_length = strlen( $this->small_words );
$starting_char = $text[ $offset ];
private function read_small_token( $text, $offset, &$skip_bytes, $case_sensitivity = 'case-sensitive' ) {
$ignore_case = 'case-insensitive' === $case_sensitivity;
$small_length = strlen( $this->small_words );
$search_text = substr( $text, $offset, $this->key_length );
if ( $ignore_case ) {
$search_text = strtoupper( $search_text );
}
$starting_char = $search_text[0];

$at = 0;
while ( $at < $small_length ) {
if ( $starting_char !== $this->small_words[ $at ] ) {
if (
$starting_char !== $this->small_words[ $at ] &&
( ! $ignore_case || strtoupper( $this->small_words[ $at ] ) !== $starting_char )
) {
$at += $this->key_length + 1;
continue;
}
Expand All @@ -500,7 +523,10 @@ private function read_small_token( $text, $offset, &$skip_bytes ) {
return $this->small_mappings[ $at / ( $this->key_length + 1 ) ];
}

if ( $text[ $offset + $adjust ] !== $this->small_words[ $at + $adjust ] ) {
if (
$search_text[ $adjust ] !== $this->small_words[ $at + $adjust ] &&
( ! $ignore_case || strtoupper( $this->small_words[ $at + $adjust ] !== $search_text[ $adjust ] ) )
) {
$at += $this->key_length + 1;
continue 2;
}
Expand Down

0 comments on commit aa78d9b

Please sign in to comment.