From 77b3a5b33a0a0a5d5654c402034c28d2f8cacbfa Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Tue, 3 Oct 2023 10:17:49 -0700 Subject: [PATCH] Experiment: Add optimized set lookup class --- src/wp-includes/class-wp-token-set.php | 224 ++++++++++ .../html4wp-named-character-entities.php | 382 ++++++++++++++++++ src/wp-includes/kses.php | 4 +- src/wp-settings.php | 2 + 4 files changed, 610 insertions(+), 2 deletions(-) create mode 100644 src/wp-includes/class-wp-token-set.php create mode 100644 src/wp-includes/html-api/html4wp-named-character-entities.php diff --git a/src/wp-includes/class-wp-token-set.php b/src/wp-includes/class-wp-token-set.php new file mode 100644 index 0000000000000..faffeaedeb6d8 --- /dev/null +++ b/src/wp-includes/class-wp-token-set.php @@ -0,0 +1,224 @@ +key_length = $key_length; + + // Start by grouping words. + + $groups = array(); + $shorts = array(); + foreach ( $words as $word ) { + if ( ! is_string( $word ) || self::MAX_LENGTH <= strlen( $word ) ) { + return null; + } + + $length = strlen( $word ); + + if ( $key_length >= $length ) { + $shorts[] = $word; + } else { + $group = substr( $word, 0, $key_length ); + + if ( ! isset( $groups[ $group ] ) ) { + $groups[ $group ] = array(); + } + + $groups[ $group ][] = substr( $word, $key_length ); + } + } + + // Sort the words by longest-first, then alphabetical. + + usort( $shorts, array( self::class, 'longest_first_then_alphabetical' ) ); + foreach ( $groups as $group_key => $group ) { + usort( $groups[ $group_key ], array( self::class, 'longest_first_then_alphabetical' ) ); + } + + // Finally construct the optimized lookups. + + foreach ( $shorts as $word ) { + $set->small_words .= str_pad( $word, $key_length, "\x00" ); + } + + foreach ( $groups as $group => $group_words ) { + $group_string = ''; + + foreach ( $group_words as $word ) { + $group_string .= chr( strlen( $word ) ) . $word; + } + + $set->large_words[ $group ] = $group_string; + } + + return $set; + } + + public static function from_precomputed_table( $key_length, $large_words, $small_words ) { + $set = new WP_Token_Set(); + + $set->key_length = $key_length; + $set->large_words = $large_words; + $set->small_words = $small_words; + + return $set; + } + + public function contains( $word ) { + if ( $this->key_length >= strlen( $word ) ) { + return str_contains( $this->small_words, str_pad( $word, $this->key_length, "\x00" ) ); + } + + $group_key = substr( $word, 0, $this->key_length ); + if ( ! isset( $this->large_words[ $group_key ] ) ) { + return false; + } + + $group = $this->large_words[ $group_key ]; + $slug = substr( $word, $this->key_length ); + $length = strlen( $slug ); + $at = 0; + while ( $at < strlen( $group ) ) { + $token_length = ord( $group[ $at++ ] ); + if ( $token_length === $length && 0 === substr_compare( $group, $slug, $at, $token_length ) ) { + return true; + } + + $at += $token_length; + } + + return false; + } + + public function read_token( $text, $offset ) { + $text_length = strlen( $text ); + + // Search for a long word first, if the text is long enough, and if that fails, a short one. + if ( $this->key_length < $text_length ) { + $group_key = substr( $text, $offset, $this->key_length ); + + if ( ! isset( $this->large_words[ $group_key ] ) ) { + return false; + } + + $group = $this->large_words[ $group_key ]; + $group_length = strlen( $group ); + $at = 0; + while ( $at < $group_length ) { + $token_length = ord( $group[ $at++ ] ); + $token = substr( $group, $at, $token_length ); + + if ( 0 === substr_compare( $text, $token, $offset + $this->key_length, $token_length ) ) { + return $group_key . $token; + } + + $at += $token_length; + } + } + + // Perhaps a short word then. + $small_text = str_pad( substr( $text, $offset, $this->key_length ), $this->key_length, "\x00" ); + $at = strpos( $this->small_words, $small_text ); + + return false !== $at + ? rtrim( substr( $this->small_words, $at, $this->key_length ), "\x00" ) + : false; + } + + public function to_array() { + $tokens = array(); + + $at = 0; + while ( $at < strlen( $this->small_words ) ) { + $tokens[] = rtrim( substr( $this->small_words, $at, $this->key_length ), "\x00" ); + $at += $this->key_length; + } + + foreach ( $this->large_words as $prefix => $group ) { + $at = 0; + while ( $at < strlen( $group ) ) { + $length = ord( $group[ $at++ ] ); + $tokens[] = $prefix . rtrim( substr( $group, $at, $length ), "\x00" ); + $at += $length; + } + } + + return $tokens; + } + + public function precomputed_php_source_table( $indent = "\t" ) { + $i1 = $indent; + $i2 = $indent . $indent; + + $output = self::class . "::from_precomputed_table(\n"; + $output .= "{$i1}{$this->key_length},\n"; + $output .= "{$i1}array(\n"; + + $prefixes = array_keys( $this->large_words ); + sort( $prefixes ); + foreach ( $prefixes as $prefix ) { + $group = $this->large_words[ $prefix ]; + $comment_line = "{$i2}//"; + $data_line = "{$i2}'{$prefix}' => \""; + $at = 0; + while ( $at < strlen( $group ) ) { + $length = ord( $group[ $at++ ] ); + $digits = str_pad( dechex( $length ), 2, '0', STR_PAD_LEFT ); + $token = substr( $group, $at, $length ); + $at += $length; + + $comment_line .= " &{$prefix}{$token}"; + $data_line .= "\\x{$digits}{$token}"; + } + $comment_line .= "\n"; + $data_line .= "\",\n"; + + $output .= $comment_line; + $output .= $data_line; + } + + $output .= "{$i1}),\n"; + $small_text = str_replace( "\x00", '\x00', $this->small_words ); + $output .= "{$i1}'{$small_text}'\n"; + $output .= ");\n"; + + return $output; + } + + private static function longest_first_then_alphabetical( $a, $b ) { + if ( $a === $b ) { + return 0; + } + + $la = strlen( $a ); + $lb = strlen( $b ); + + // Longer strings are less-than for comparison's sake. + if ( $la !== $lb ) { + return $lb - $la; + } + + return strcmp( $a, $b ); + } +} diff --git a/src/wp-includes/html-api/html4wp-named-character-entities.php b/src/wp-includes/html-api/html4wp-named-character-entities.php new file mode 100644 index 0000000000000..9d9ce8b3af0af --- /dev/null +++ b/src/wp-includes/html-api/html4wp-named-character-entities.php @@ -0,0 +1,382 @@ + "\x03lig", + // Á + 'Aa' => "\x04cute", + // Â + 'Ac' => "\x03irc", + // À + 'Ag' => "\x04rave", + // &Alpha + 'Al' => "\x03pha", + // Å + 'Ar' => "\x03ing", + // Ã + 'At' => "\x04ilde", + // Ä + 'Au' => "\x02ml", + // &Beta + 'Be' => "\x02ta", + // Ç + 'Cc' => "\x04edil", + // &Chi + 'Ch' => "\x01i", + // &Dagger + 'Da' => "\x04gger", + // &Delta + 'De' => "\x03lta", + // Ð + 'ET' => "\x01H", + // É + 'Ea' => "\x04cute", + // Ê + 'Ec' => "\x03irc", + // È + 'Eg' => "\x04rave", + // &Epsilon + 'Ep' => "\x05silon", + // &Eta + 'Et' => "\x01a", + // Ë + 'Eu' => "\x02ml", + // &Gamma + 'Ga' => "\x03mma", + // Í + 'Ia' => "\x04cute", + // Î + 'Ic' => "\x03irc", + // Ì + 'Ig' => "\x04rave", + // &Iota + 'Io' => "\x02ta", + // Ï + 'Iu' => "\x02ml", + // &Kappa + 'Ka' => "\x03ppa", + // &Lambda + 'La' => "\x04mbda", + // Ñ + 'Nt' => "\x04ilde", + // &OElig + 'OE' => "\x03lig", + // Ó + 'Oa' => "\x04cute", + // Ô + 'Oc' => "\x03irc", + // Ò + 'Og' => "\x04rave", + // &Omicron &Omega + 'Om' => "\x05icron\x03ega", + // Ø + 'Os' => "\x04lash", + // Õ + 'Ot' => "\x04ilde", + // Ö + 'Ou' => "\x02ml", + // &Phi + 'Ph' => "\x01i", + // &Prime + 'Pr' => "\x03ime", + // &Psi + 'Ps' => "\x01i", + // &Rho + 'Rh' => "\x01o", + // &Scaron + 'Sc' => "\x04aron", + // &Sigma + 'Si' => "\x03gma", + // Þ + 'TH' => "\x03ORN", + // &Tau + 'Ta' => "\x01u", + // &Theta + 'Th' => "\x03eta", + // Ú + 'Ua' => "\x04cute", + // Û + 'Uc' => "\x03irc", + // Ù + 'Ug' => "\x04rave", + // &Upsilon + 'Up' => "\x05silon", + // Ü + 'Uu' => "\x02ml", + // Ý + 'Ya' => "\x04cute", + // &Yuml + 'Yu' => "\x02ml", + // &Zeta + 'Ze' => "\x02ta", + // á + 'aa' => "\x04cute", + // â ´ + 'ac' => "\x03irc\x03ute", + // æ + 'ae' => "\x03lig", + // à + 'ag' => "\x04rave", + // &alefsym &alpha + 'al' => "\x05efsym\x03pha", + // & + 'am' => "\x01p", + // &and &ang + 'an' => "\x01d\x01g", + // &apos + 'ap' => "\x02os", + // å + 'ar' => "\x03ing", + // &asymp + 'as' => "\x03ymp", + // ã + 'at' => "\x04ilde", + // ä + 'au' => "\x02ml", + // &bdquo + 'bd' => "\x03quo", + // &beta + 'be' => "\x02ta", + // ¦ + 'br' => "\x04vbar", + // &bull + 'bu' => "\x02ll", + // &cap + 'ca' => "\x01p", + // ç + 'cc' => "\x04edil", + // ¸ ¢ + 'ce' => "\x03dil\x02nt", + // &chi + 'ch' => "\x01i", + // &circ + 'ci' => "\x02rc", + // &clubs + 'cl' => "\x03ubs", + // &cong © + 'co' => "\x02ng\x02py", + // &crarr + 'cr' => "\x03arr", + // ¤ &cup + 'cu' => "\x04rren\x01p", + // &dArr + 'dA' => "\x02rr", + // &dagger &darr + 'da' => "\x04gger\x02rr", + // &delta ° + 'de' => "\x03lta\x01g", + // ÷ &diams + 'di' => "\x04vide\x03ams", + // é + 'ea' => "\x04cute", + // ê + 'ec' => "\x03irc", + // è + 'eg' => "\x04rave", + // &empty &emsp + 'em' => "\x03pty\x02sp", + // &ensp + 'en' => "\x02sp", + // &epsilon + 'ep' => "\x05silon", + // &equiv + 'eq' => "\x03uiv", + // &eta ð + 'et' => "\x01a\x01h", + // ë &euro + 'eu' => "\x02ml\x02ro", + // &exist + 'ex' => "\x03ist", + // &fnof + 'fn' => "\x02of", + // &forall + 'fo' => "\x04rall", + // ½ ¼ ¾ &frasl + 'fr' => "\x04ac12\x04ac14\x04ac34\x03asl", + // &gamma + 'ga' => "\x03mma", + // &hArr + 'hA' => "\x02rr", + // &harr + 'ha' => "\x02rr", + // &hearts &hellip + 'he' => "\x04arts\x04llip", + // í + 'ia' => "\x04cute", + // î + 'ic' => "\x03irc", + // ¡ + 'ie' => "\x03xcl", + // ì + 'ig' => "\x04rave", + // &image + 'im' => "\x03age", + // &infin &int + 'in' => "\x03fin\x01t", + // &iota + 'io' => "\x02ta", + // ¿ + 'iq' => "\x04uest", + // &isin + 'is' => "\x02in", + // ï + 'iu' => "\x02ml", + // &kappa + 'ka' => "\x03ppa", + // &lArr + 'lA' => "\x02rr", + // &lambda « &lang &larr + 'la' => "\x04mbda\x03quo\x02ng\x02rr", + // &lceil + 'lc' => "\x03eil", + // &ldquo + 'ld' => "\x03quo", + // &lfloor + 'lf' => "\x04loor", + // &lowast &loz + 'lo' => "\x04wast\x01z", + // &lrm + 'lr' => "\x01m", + // &lsaquo &lsquo + 'ls' => "\x04aquo\x03quo", + // ¯ + 'ma' => "\x02cr", + // &mdash + 'md' => "\x03ash", + // · µ &minus + 'mi' => "\x04ddot\x03cro\x03nus", + // &nabla + 'na' => "\x03bla", + //   + 'nb' => "\x02sp", + // &ndash + 'nd' => "\x03ash", + // ¬in ¬ + 'no' => "\x03tin\x01t", + // &nsub + 'ns' => "\x02ub", + // ñ + 'nt' => "\x04ilde", + // ó + 'oa' => "\x04cute", + // ô + 'oc' => "\x03irc", + // &oelig + 'oe' => "\x03lig", + // ò + 'og' => "\x04rave", + // &oline + 'ol' => "\x03ine", + // &omicron &omega + 'om' => "\x05icron\x03ega", + // &oplus + 'op' => "\x03lus", + // ª º + 'or' => "\x02df\x02dm", + // ø + 'os' => "\x04lash", + // õ &otimes + 'ot' => "\x04ilde\x04imes", + // ö + 'ou' => "\x02ml", + // ¶ &part + 'pa' => "\x02ra\x02rt", + // &permil &perp + 'pe' => "\x04rmil\x02rp", + // &phi + 'ph' => "\x01i", + // &piv + 'pi' => "\x01v", + // ± + 'pl' => "\x04usmn", + // £ + 'po' => "\x03und", + // &prime &prod &prop + 'pr' => "\x03ime\x02od\x02op", + // &psi + 'ps' => "\x01i", + // " + 'qu' => "\x02ot", + // &rArr + 'rA' => "\x02rr", + // &radic » &rang &rarr + 'ra' => "\x03dic\x03quo\x02ng\x02rr", + // &rceil + 'rc' => "\x03eil", + // &rdquo + 'rd' => "\x03quo", + // &real ® + 're' => "\x02al\x01g", + // &rfloor + 'rf' => "\x04loor", + // &rho + 'rh' => "\x01o", + // &rlm + 'rl' => "\x01m", + // &rsaquo &rsquo + 'rs' => "\x04aquo\x03quo", + // &sbquo + 'sb' => "\x03quo", + // &scaron + 'sc' => "\x04aron", + // &sdot + 'sd' => "\x02ot", + // § + 'se' => "\x02ct", + // ­ + 'sh' => "\x01y", + // &sigmaf &sigma &sim + 'si' => "\x04gmaf\x03gma\x01m", + // &spades + 'sp' => "\x04ades", + // &sube ¹ ² ³ &supe &sub &sum &sup + 'su' => "\x02be\x02p1\x02p2\x02p3\x02pe\x01b\x01m\x01p", + // ß + 'sz' => "\x03lig", + // &tau + 'ta' => "\x01u", + // &thetasym &there4 &thinsp &theta þ + 'th' => "\x06etasym\x04ere4\x04insp\x03eta\x03orn", + // &tilde × + 'ti' => "\x03lde\x03mes", + // &trade + 'tr' => "\x03ade", + // &uArr + 'uA' => "\x02rr", + // ú &uarr + 'ua' => "\x04cute\x02rr", + // û + 'uc' => "\x03irc", + // ù + 'ug' => "\x04rave", + // ¨ + 'um' => "\x01l", + // &upsilon &upsih + 'up' => "\x05silon\x03sih", + // ü + 'uu' => "\x02ml", + // &weierp + 'we' => "\x04ierp", + // ý + 'ya' => "\x04cute", + // ¥ + 'ye' => "\x01n", + // ÿ + 'yu' => "\x02ml", + // &zeta + 'ze' => "\x02ta", + // &zwnj &zwj + 'zw' => "\x02nj\x01j", + ), + 'MuNuPiXigegtleltmuneninuorpixi' +); diff --git a/src/wp-includes/kses.php b/src/wp-includes/kses.php index 27da1679e8779..f59e44b8bd50d 100644 --- a/src/wp-includes/kses.php +++ b/src/wp-includes/kses.php @@ -1921,14 +1921,14 @@ function wp_kses_normalize_entities( $content, $context = 'html' ) { * @return string Correctly encoded entity. */ function wp_kses_named_entities( $matches ) { - global $allowedentitynames; + global $html4wp_named_character_entity_set; if ( empty( $matches[1] ) ) { return ''; } $i = $matches[1]; - return ( ! in_array( $i, $allowedentitynames, true ) ) ? "&$i;" : "&$i;"; + return ! $html4wp_named_character_entity_set->contains( $i ) ? "&$i;" : "&$i;"; } /** diff --git a/src/wp-settings.php b/src/wp-settings.php index 528f335cb7c2a..bf039dd44647d 100644 --- a/src/wp-settings.php +++ b/src/wp-settings.php @@ -106,6 +106,7 @@ // Load early WordPress files. require ABSPATH . WPINC . '/class-wp-list-util.php'; +require ABSPATH . WPINC . '/class-wp-token-set.php'; require ABSPATH . WPINC . '/formatting.php'; require ABSPATH . WPINC . '/meta.php'; require ABSPATH . WPINC . '/functions.php'; @@ -217,6 +218,7 @@ require ABSPATH . WPINC . '/feed.php'; require ABSPATH . WPINC . '/bookmark.php'; require ABSPATH . WPINC . '/bookmark-template.php'; +require ABSPATH . WPINC . '/html-api/html4wp-named-character-entities.php'; require ABSPATH . WPINC . '/kses.php'; require ABSPATH . WPINC . '/cron.php'; require ABSPATH . WPINC . '/deprecated.php';