From 0351b78bb9b6119f2b77d91e25d3ec7195a53c90 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sun, 28 Apr 2024 15:53:41 +0300 Subject: [PATCH] Try some optimizations --- .../html-api/class-wp-html-decoder.php | 182 ++++++++++-------- 1 file changed, 107 insertions(+), 75 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php index 3497595903778..112956bb70fc0 100644 --- a/src/wp-includes/html-api/class-wp-html-decoder.php +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -243,10 +243,8 @@ public static function decode( $context, $text, $at = 0, $length = null ) { * @return string|null Decoded character reference if found, otherwise `false`. */ public static function read_character_reference( $context, $text, $at, &$skip_bytes = null ) { - global $html5_named_character_entity_set; - $length = strlen( $text ); - if ( $at + 1 >= $length ) { + if ( $at + 2 >= $length ) { return null; } @@ -283,18 +281,18 @@ public static function read_character_reference( $context, $text, $at, &$skip_by $max_digits = 7; // &#1114111; } - // Cannot encode invalid Unicode code points. Max is to U+10FFFF. - $zero_count = strspn( $text, '0', $digits_at ); - $digit_count = strspn( $text, $numeric_digits, $digits_at + $zero_count ); - $after_digits = $digits_at + $zero_count + $digit_count; - $has_semicolon = $after_digits < $length && ';' === $text[ $after_digits ]; - $end_of_span = $has_semicolon ? $after_digits + 1 : $after_digits; + $zero_count = strspn( $text, '0', $digits_at ); + $digit_count = strspn( $text, $numeric_digits, $digits_at + $zero_count ); // `&#` or `&#x` without digits returns into plaintext. if ( 0 === $digit_count && 0 === $zero_count ) { return null; } + $after_digits = $digits_at + $zero_count + $digit_count; + $has_semicolon = $after_digits < $length && ';' === $text[ $after_digits ]; + $end_of_span = $has_semicolon ? 
$after_digits + 1 : $after_digits; + if ( 0 === $digit_count ) { $skip_bytes = $end_of_span - $at; return '�'; @@ -328,60 +326,12 @@ public static function read_character_reference( $context, $text, $at, &$skip_by * @see https://infra.spec.whatwg.org/#noncharacter */ - /* - * Code points in the C1 controls area need to be remapped as if they - * were stored in Windows-1252. Note! This transformation only happens - * for numeric character references. The raw code points in the byte - * stream are not translated. - * - * > If the number is one of the numbers in the first column of - * > the following table, then find the row with that number in - * > the first column, and set the character reference code to - * > the number in the second column of that row. - */ - if ( $code_point >= 0x80 && $code_point <= 0x9F ) { - $windows_1252_mapping = array( - 0x20AC, // 0x80 -> EURO SIGN (€). - 0x81, // 0x81 -> (no change). - 0x201A, // 0x82 -> SINGLE LOW-9 QUOTATION MARK (‚). - 0x0192, // 0x83 -> LATIN SMALL LETTER F WITH HOOK (ƒ). - 0x201E, // 0x84 -> DOUBLE LOW-9 QUOTATION MARK („). - 0x2026, // 0x85 -> HORIZONTAL ELLIPSIS (…). - 0x2020, // 0x86 -> DAGGER (†). - 0x2021, // 0x87 -> DOUBLE DAGGER (‡). - 0x02C6, // 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ). - 0x2030, // 0x89 -> PER MILLE SIGN (‰). - 0x0160, // 0x8A -> LATIN CAPITAL LETTER S WITH CARON (Š). - 0x2039, // 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹). - 0x0152, // 0x8C -> LATIN CAPITAL LIGATURE OE (Œ). - 0x8D, // 0x8D -> (no change). - 0x017D, // 0x8E -> LATIN CAPITAL LETTER Z WITH CARON (Ž). - 0x8F, // 0x8F -> (no change). - 0x90, // 0x90 -> (no change). - 0x2018, // 0x91 -> LEFT SINGLE QUOTATION MARK (‘). - 0x2019, // 0x92 -> RIGHT SINGLE QUOTATION MARK (’). - 0x201C, // 0x93 -> LEFT DOUBLE QUOTATION MARK (“). - 0x201D, // 0x94 -> RIGHT DOUBLE QUOTATION MARK (”). - 0x2022, // 0x95 -> BULLET (•). - 0x2013, // 0x96 -> EN DASH (–). - 0x2014, // 0x97 -> EM DASH (—). - 0x02DC, // 0x98 -> SMALL TILDE (˜). 
- 0x2122, // 0x99 -> TRADE MARK SIGN (™). - 0x0161, // 0x9A -> LATIN SMALL LETTER S WITH CARON (š). - 0x203A, // 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›). - 0x0153, // 0x9C -> LATIN SMALL LIGATURE OE (œ). - 0x9D, // 0x9D -> (no change). - 0x017E, // 0x9E -> LATIN SMALL LETTER Z WITH CARON (ž). - 0x0178, // 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ). - ); - - $code_point = $windows_1252_mapping[ $code_point - 0x80 ]; - } - $skip_bytes = $end_of_span - $at; return self::code_point_to_utf8_bytes( $code_point ); } + global $html5_named_character_entity_set; + /** Tracks inner parsing within the named character reference. */ $name_at = $at + 1; // Minimum named character reference is two characters. E.g. `GT`. @@ -460,32 +410,114 @@ public static function code_point_to_utf8_bytes( $code_point ) { return '�'; } - if ( $code_point <= 0x7F ) { - return chr( $code_point ); - } - - if ( $code_point <= 0x7FF ) { - $byte1 = ( $code_point >> 6 ) | 0xC0; - $byte2 = $code_point & 0x3F | 0x80; + if ( $code_point > 0xFFFF ) { + $byte1 = ( $code_point >> 18 ) | 0xF0; + $byte2 = ( $code_point >> 12 ) & 0x3F | 0x80; + $byte3 = ( $code_point >> 6 ) & 0x3F | 0x80; + $byte4 = $code_point & 0x3F | 0x80; - return pack( 'CC', $byte1, $byte2 ); + return chr( $byte1 ) . chr( $byte2 ) . chr( $byte3 ) . chr( $byte4 ); } - if ( $code_point <= 0xFFFF ) { + if ( $code_point > 0x7FF ) { $byte1 = ( $code_point >> 12 ) | 0xE0; $byte2 = ( $code_point >> 6 ) & 0x3F | 0x80; $byte3 = $code_point & 0x3F | 0x80; - return pack( 'CCC', $byte1, $byte2, $byte3 ); + return chr( $byte1 ) . chr( $byte2 ) . 
chr( $byte3 ); } - if ( $code_point <= 0x10FFFF ) { - $byte1 = ( $code_point >> 18 ) | 0xF0; - $byte2 = ( $code_point >> 12 ) & 0x3F | 0x80; - $byte3 = ( $code_point >> 6 ) & 0x3F | 0x80; - $byte4 = $code_point & 0x3F | 0x80; + if ( $code_point <= 0x7F ) { + return chr( $code_point ); + } + + if ( $code_point <= 0x7FF ) { + /* + * Code points in the C1 controls area need to be remapped as if they + * were stored in Windows-1252. Note! This transformation only happens + * for numeric character references. The raw code points in the byte + * stream are not translated. + * + * > If the number is one of the numbers in the first column of + * > the following table, then find the row with that number in + * > the first column, and set the character reference code to + * > the number in the second column of that row. + */ + if ( $code_point <= 0x9F ) { + $windows_1252_mapping = array( +// 0x20AC, // 0x80 -> EURO SIGN (€). +// 0x81, // 0x81 -> (no change). +// 0x201A, // 0x82 -> SINGLE LOW-9 QUOTATION MARK (‚). +// 0x0192, // 0x83 -> LATIN SMALL LETTER F WITH HOOK (ƒ). +// 0x201E, // 0x84 -> DOUBLE LOW-9 QUOTATION MARK („). +// 0x2026, // 0x85 -> HORIZONTAL ELLIPSIS (…). +// 0x2020, // 0x86 -> DAGGER (†). +// 0x2021, // 0x87 -> DOUBLE DAGGER (‡). +// 0x02C6, // 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ). +// 0x2030, // 0x89 -> PER MILLE SIGN (‰). +// 0x0160, // 0x8A -> LATIN CAPITAL LETTER S WITH CARON (Š). +// 0x2039, // 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹). +// 0x0152, // 0x8C -> LATIN CAPITAL LIGATURE OE (Œ). +// 0x8D, // 0x8D -> (no change). +// 0x017D, // 0x8E -> LATIN CAPITAL LETTER Z WITH CARON (Ž). +// 0x8F, // 0x8F -> (no change). +// 0x90, // 0x90 -> (no change). +// 0x2018, // 0x91 -> LEFT SINGLE QUOTATION MARK (‘). +// 0x2019, // 0x92 -> RIGHT SINGLE QUOTATION MARK (’). +// 0x201C, // 0x93 -> LEFT DOUBLE QUOTATION MARK (“). +// 0x201D, // 0x94 -> RIGHT DOUBLE QUOTATION MARK (”). +// 0x2022, // 0x95 -> BULLET (•). 
+// 0x2013, // 0x96 -> EN DASH (–). +// 0x2014, // 0x97 -> EM DASH (—). +// 0x02DC, // 0x98 -> SMALL TILDE (˜). +// 0x2122, // 0x99 -> TRADE MARK SIGN (™). +// 0x0161, // 0x9A -> LATIN SMALL LETTER S WITH CARON (š). +// 0x203A, // 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›). +// 0x0153, // 0x9C -> LATIN SMALL LIGATURE OE (œ). +// 0x9D, // 0x9D -> (no change). +// 0x017E, // 0x9E -> LATIN SMALL LETTER Z WITH CARON (ž). +// 0x0178, // 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ). + '€', + '', + '‚', + 'ƒ', + '„', + '…', + '†', + '‡', + 'ˆ', + '‰', + 'Š', + '‹', + 'Œ', + '', + 'Ž', + '', + '', + '‘', + '’', + '“', + '”', + '•', + '–', + '—', + '˜', + '™', + 'š', + '›', + 'œ', + '', + 'ž', + 'Ÿ', + ); + + return $windows_1252_mapping[ $code_point - 0x80 ]; + } + + $byte1 = ( $code_point >> 6 ) | 0xC0; + $byte2 = $code_point & 0x3F | 0x80; - return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 ); + return chr( $byte1 ) . chr( $byte2 ); } } }