From a99fda866cfe8147f48e9db575e96fb999ceb448 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 1 May 2024 16:22:39 -0700 Subject: [PATCH] add docs, remove is_string --- src/wp-includes/class-wp-token-map.php | 49 +++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/src/wp-includes/class-wp-token-map.php b/src/wp-includes/class-wp-token-map.php index 497c0b6f554b7..1575572e170f6 100644 --- a/src/wp-includes/class-wp-token-map.php +++ b/src/wp-includes/class-wp-token-map.php @@ -43,6 +43,53 @@ * array( "😯", "🙂", "🙁", "😕" ) * ); * + * ## Determining Key Length. + * + * The choice of key length should be based on the data being stored in + * the token map. It should divide the data as evenly as possible, but should not create + * so many groups that a large fraction of the groups only contain a single token. + * + * For the HTML5 named character references, a key length of 2 was found to provide a + * sufficient spread and should be a good default for relatively large sets of tokens. + * + * However, for some data sets this might be too long. For example, a list of smilies + * may be too small for a key length of 2. Perhaps 1 would be more appropriate. It's + * best to experiment and determine empirically which values are appropriate. + * + * ## Generating Precomputed Source Code. + * + * Since the `WP_Token_Map` is designed for relatively static lookups, it can be + * advantageous to precompute the values and instantiate a table whose tokens have + * already been sorted and grouped and whose lookup strings have already been built. + * + * This can be done with `WP_Token_Map::precomputed_php_source_table()`. + * + * Note that if there is a leading character that all tokens need, such as `&` for + * HTML named character references, it can be beneficial to exclude this from the + * token map. Instead, find occurrences of the leading character and then use the + * token map to see if the following characters complete the token.
+ * + * Example: + * + * $map = WP_Token_Map::from_array( array( 'simple_smile:' => '🙂', 'sob:' => '😭' ) ); + * echo $map->precomputed_php_source_table(); + * // Output + * WP_Token_Map::from_precomputed_table( + * 2, + * "si\x00so\x00", + * array( + * // simple_smile:[🙂]. + * "\x0bmple_smile:\x04🙂", + * // sob:[😭]. + * "\x02b:\x04😭", + * ), + * "", + * array() + * ); + * + * This precomputed value can be stored directly in source code, which avoids the + * startup cost of generating the lookup strings. See `$html5_named_character_references`. + * * @since 6.6.0 */ class WP_Token_Map { @@ -164,7 +211,7 @@ public static function from_array( $mappings, $key_length = 2 ) { $groups = array(); $shorts = array(); foreach ( $mappings as $word => $mapping ) { - if ( ! is_string( $word ) || self::MAX_LENGTH <= strlen( $word ) ) { + if ( self::MAX_LENGTH <= strlen( $word ) ) { return null; }