From 3ff78cc501e5255f0db02d932a63f3e69a8be085 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 24 Apr 2024 16:45:31 +0300 Subject: [PATCH] HTML API: Add custom text decoder. Provide a custom decoder for strings coming from HTML attributes and markup. This custom decoder is necessary because of deficiencies in PHP's `html_entity_decode()` function: - It isn't aware of 720 of the possible named character references in HTML, leaving many out that should be translated. - It isn't able to decode character references in data segments where the final semicolon is missing, or when there are ambiguous characters after the reference name but before the semicolon. This one is complicated: refer to the HTML5 specification to clarify. This decoder will also provide some conveniences, such as making a single-pass and interruptible decode operation possible. This will provide a number of opportunities to optimize detection and decoding of things like value prefixes, and whether a value contains a given substring. 
--- .../html-api/class-wp-html-decoder.php | 435 ++++++++++++++++++ .../html-api/class-wp-html-tag-processor.php | 10 +- src/wp-settings.php | 1 + .../html-api/wpHtmlProcessorHtml5lib.php | 65 ++- 4 files changed, 465 insertions(+), 46 deletions(-) create mode 100644 src/wp-includes/html-api/class-wp-html-decoder.php diff --git a/src/wp-includes/html-api/class-wp-html-decoder.php b/src/wp-includes/html-api/class-wp-html-decoder.php new file mode 100644 index 0000000000000..7d446b2c08f30 --- /dev/null +++ b/src/wp-includes/html-api/class-wp-html-decoder.php @@ -0,0 +1,435 @@ += $end ) { + break; + } + + $character_reference = self::read_character_reference( $context, $text, $next_character_reference_at, $skip_bytes ); + if ( isset( $character_reference ) ) { + $at = $next_character_reference_at; + $decoded .= substr( $text, $was_at, $at - $was_at ); + $decoded .= $character_reference; + $at += $skip_bytes; + $was_at = $at; + continue; + } + + ++$at; + } + + if ( 0 === $was_at ) { + return $text; + } + + if ( $was_at < $end ) { + $decoded .= substr( $text, $was_at, $end - $was_at ); + } + + return $decoded; + } + + /** + * Attempt to read a character reference at the given location in a given string, + * depending on the context in which it's found. + * + * If a character reference is found, this function will return the translated value + * that the reference maps to. It will then set in `$skip_bytes` how many bytes of + * input it read while consuming the character reference. This gives calling code the + * opportunity to advance its cursor when traversing a string and decoding. It + * indicates how long the character reference was. 
+ * + * Example: + * + * null === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships&hellip;', 0 ); + * '…' === WP_HTML_Decoder::read_character_reference( 'attribute', 'Ships&hellip;', 5, $skip_bytes ); + * 8 === $skip_bytes; + * + * null === WP_HTML_Decoder::read_character_reference( 'attribute', '∉', 0 ); + * '¬' === WP_HTML_Decoder::read_character_reference( 'attribute', '∉', 0, $skip_bytes ); + * 4 === $skip_bytes; + * + * @since 6.6.0 + * + * @param string $context `attribute` for decoding attribute values, `data` otherwise. + * @param string $text Text document containing span of text to decode. + * @param ?int $at Byte offset into text where span begins. + * @param ?int $skip_bytes Optional. Set to the number of bytes spanned by the character + * reference when one is found. + * @return string|null Decoded character reference if found, otherwise `null`. + */ + public static function read_character_reference( $context, $text, $at, &$skip_bytes = null ) { + global $html5_named_character_references; + + $length = strlen( $text ); + if ( $at + 1 >= $length ) { + return null; + } + + if ( '&' !== $text[ $at ] ) { + return null; + } + + /* + * Numeric character references. + * + * When truncated, these will encode the code point found by parsing the + * digits that are available. For example, when `&#x1f170;` is truncated + * to `&#x1f1` it will encode `DZ`. It does not: + * - know how to parse the original `🅰`. + * - fail to parse and return plaintext `&#x1f1`. + * - fail to parse and return the replacement character `�` + */ + if ( '#' === $text[ $at + 1 ] ) { + if ( $at + 2 >= $length ) { + return null; + } + + /** Tracks inner parsing within the numeric character reference. 
*/ + $digits_at = $at + 2; + + if ( 'x' === $text[ $digits_at ] || 'X' === $text[ $digits_at ] ) { + $numeric_base = 16; + $numeric_digits = '0123456789abcdefABCDEF'; + $max_digits = 6; // 􏿿 + ++$digits_at; + } else { + $numeric_base = 10; + $numeric_digits = '0123456789'; + $max_digits = 7; // 􏿿 + } + + // Cannot encode invalid Unicode code points. Max is to U+10FFFF. + $zero_count = strspn( $text, '0', $digits_at ); + $digit_count = strspn( $text, $numeric_digits, $digits_at + $zero_count ); + $after_digits = $digits_at + $zero_count + $digit_count; + $has_semicolon = $after_digits < $length && ';' === $text[ $after_digits ]; + $end_of_span = $has_semicolon ? $after_digits + 1 : $after_digits; + + // `&#` or `&#x` without digits returns into plaintext. + if ( 0 === $digit_count && 0 === $zero_count ) { + return null; + } + + if ( 0 === $digit_count ) { + $skip_bytes = $end_of_span - $at; + return '�'; + } + + if ( $digit_count - $zero_count > $max_digits ) { + $skip_bytes = $end_of_span - $at; + return '�'; + } + + $digits = substr( $text, $digits_at + $zero_count, $digit_count ); + $code_point = intval( $digits, $numeric_base ); + + /* + * Noncharacters, 0x0D, and non-ASCII-whitespace control characters. + * + * > A noncharacter is a code point that is in the range U+FDD0 to U+FDEF, + * > inclusive, or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, + * > U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, + * > U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, + * > U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, + * > U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF. + * + * A C0 control is a code point that is in the range of U+00 to U+1F, + * but ASCII whitespace includes U+09, U+0A, U+0C, and U+0D. + * + * These characters are invalid but still decode as any valid character. + * This comment is here to note and explain why there's no check to + * remove these characters or replace them. 
+ * + * @see https://infra.spec.whatwg.org/#noncharacter + */ + + /* + * Code points in the C1 controls area need to be remapped as if they + * were stored in Windows-1252. Note! This transformation only happens + * for numeric character references. The raw code points in the byte + * stream are not translated. + * + * > If the number is one of the numbers in the first column of + * > the following table, then find the row with that number in + * > the first column, and set the character reference code to + * > the number in the second column of that row. + */ + if ( $code_point >= 0x80 && $code_point <= 0x9F ) { + $windows_1252_mapping = array( + 0x20AC, // 0x80 -> EURO SIGN (€). + 0x81, // 0x81 -> (no change). + 0x201A, // 0x82 -> SINGLE LOW-9 QUOTATION MARK (‚). + 0x0192, // 0x83 -> LATIN SMALL LETTER F WITH HOOK (ƒ). + 0x201E, // 0x84 -> DOUBLE LOW-9 QUOTATION MARK („). + 0x2026, // 0x85 -> HORIZONTAL ELLIPSIS (…). + 0x2020, // 0x86 -> DAGGER (†). + 0x2021, // 0x87 -> DOUBLE DAGGER (‡). + 0x02C6, // 0x88 -> MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ). + 0x2030, // 0x89 -> PER MILLE SIGN (‰). + 0x0160, // 0x8A -> LATIN CAPITAL LETTER S WITH CARON (Š). + 0x2039, // 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹). + 0x0152, // 0x8C -> LATIN CAPITAL LIGATURE OE (Œ). + 0x8D, // 0x8D -> (no change). + 0x017D, // 0x8E -> LATIN CAPITAL LETTER Z WITH CARON (Ž). + 0x8F, // 0x8F -> (no change). + 0x90, // 0x90 -> (no change). + 0x2018, // 0x91 -> LEFT SINGLE QUOTATION MARK (‘). + 0x2019, // 0x92 -> RIGHT SINGLE QUOTATION MARK (’). + 0x201C, // 0x93 -> LEFT DOUBLE QUOTATION MARK (“). + 0x201D, // 0x94 -> RIGHT DOUBLE QUOTATION MARK (”). + 0x2022, // 0x95 -> BULLET (•). + 0x2013, // 0x96 -> EN DASH (–). + 0x2014, // 0x97 -> EM DASH (—). + 0x02DC, // 0x98 -> SMALL TILDE (˜). + 0x2122, // 0x99 -> TRADE MARK SIGN (™). + 0x0161, // 0x9A -> LATIN SMALL LETTER S WITH CARON (š). + 0x203A, // 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›). 
+ 0x0153, // 0x9C -> LATIN SMALL LIGATURE OE (œ). + 0x9D, // 0x9D -> (no change). + 0x017E, // 0x9E -> LATIN SMALL LETTER Z WITH CARON (ž). + 0x0178, // 0x9F -> LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ). + ); + + $code_point = $windows_1252_mapping[ $code_point - 0x80 ]; + } + + $skip_bytes = $end_of_span - $at; + return self::code_point_to_utf8_bytes( $code_point ); + } + + /** Tracks inner parsing within the named character reference. */ + $name_at = $at + 1; + // Minimum named character reference is two characters. E.g. `GT`. + if ( $name_at + 2 > $length ) { + return null; + } + + $name_length = 0; + $replacement = $html5_named_character_references->read_token( $text, $name_at, $name_length ); + if ( false === $replacement ) { + return null; + } + + $after_name = $name_at + $name_length; + + // If the match ended with a semicolon then it should always be decoded. + if ( ';' === $text[ $name_at + $name_length - 1 ] ) { + $skip_bytes = $after_name - $at; + return $replacement; + } + + /* + * At this point though there's a match for an entry in the named + * character reference table but the match doesn't end in `;`. + * It may be allowed if it's followed by something unambiguous. + */ + $ambiguous_follower = ( + $after_name < $length && + $name_at < $length && + ( + ctype_alnum( $text[ $after_name ] ) || + '=' === $text[ $after_name ] + ) + ); + + // It's non-ambiguous, safe to leave it in. + if ( ! $ambiguous_follower ) { + $skip_bytes = $after_name - $at; + return $replacement; + } + + if ( 'attribute' === $context ) { + return null; + } + + $skip_bytes = $after_name - $at; + return $replacement; + } + + /** + * Encode a code point number into the UTF-8 encoding. + * + * This encoder implements the encoding algorithm for converting a number + * into a byte sequence, but if it receives an invalid code point it will + * return the Unicode Replacement Character U+FFFD `�`. 
+ * + * Example: + * + * '🅰' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0x1f170 ); + * + * // Half of a surrogate pair is an invalid code point. + * '�' === WP_HTML_Decoder::code_point_to_utf8_bytes( 0xd83c ); + * + * @since 6.6.0 + * + * @see https://www.rfc-editor.org/rfc/rfc3629 UTF-8 + * + * @param int $code_point Which code point to convert. + * @return string Converted code point, or `�` if invalid. + */ + public static function code_point_to_utf8_bytes( $code_point ) { + if ( + $code_point <= 0 || + ( $code_point >= 0xD800 && $code_point <= 0xDFFF ) || + $code_point > 0x10FFFF + ) { + return '�'; + } + + if ( $code_point <= 0x7F ) { + return chr( $code_point ); + } + + if ( $code_point <= 0x7FF ) { + $byte1 = ( $code_point >> 6 ) | 0xC0; + $byte2 = $code_point & 0x3F | 0x80; + + return pack( 'CC', $byte1, $byte2 ); + } + + if ( $code_point <= 0xFFFF ) { + $byte1 = ( $code_point >> 12 ) | 0xE0; + $byte2 = ( $code_point >> 6 ) & 0x3F | 0x80; + $byte3 = $code_point & 0x3F | 0x80; + + return pack( 'CCC', $byte1, $byte2, $byte3 ); + } + + if ( $code_point <= 0x10FFFF ) { + $byte1 = ( $code_point >> 18 ) | 0xF0; + $byte2 = ( $code_point >> 12 ) & 0x3F | 0x80; + $byte3 = ( $code_point >> 6 ) & 0x3F | 0x80; + $byte4 = $code_point & 0x3F | 0x80; + + return pack( 'CCCC', $byte1, $byte2, $byte3, $byte4 ); + } + } +} diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 4597a888b5efe..c4a5ffe6de7a8 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -15,10 +15,6 @@ * - Prune the whitespace when removing classes/attributes: e.g. "a b c" -> "c" not " c". * This would increase the size of the changes for some operations but leave more * natural-looking output HTML. - * - Properly decode HTML character references in `get_attribute()`. 
PHP's - * `html_entity_decode()` is wrong in a couple ways: it doesn't account for the - * no-ambiguous-ampersand rule, and it improperly handles the way semicolons may - * or may not terminate a character reference. * * @package WordPress * @subpackage HTML-API @@ -2499,7 +2495,7 @@ private function get_enqueued_attribute_value( $comparable_name ) { * 3. Double-quoting ends at the last character in the update. */ $enqueued_value = substr( $enqueued_text, $equals_at + 2, -1 ); - return html_entity_decode( $enqueued_value ); + return WP_HTML_Decoder::decode_attribute( $enqueued_value ); } /** @@ -2572,7 +2568,7 @@ public function get_attribute( $name ) { $raw_value = substr( $this->html, $attribute->value_starts_at, $attribute->value_length ); - return html_entity_decode( $raw_value ); + return WP_HTML_Decoder::decode_attribute( $raw_value ); } /** @@ -2872,7 +2868,7 @@ public function get_modifiable_text() { return $text; } - $decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE ); + $decoded = WP_HTML_Decoder::decode_text_node( $text ); /* * TEXTAREA skips a leading newline, but this newline may appear not only as the diff --git a/src/wp-settings.php b/src/wp-settings.php index 4d8a35ae8358f..c644e60605dbc 100644 --- a/src/wp-settings.php +++ b/src/wp-settings.php @@ -252,6 +252,7 @@ require ABSPATH . WPINC . '/html-api/class-wp-html-attribute-token.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-span.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-text-replacement.php'; +require ABSPATH . WPINC . '/html-api/class-wp-html-decoder.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-tag-processor.php'; require ABSPATH . WPINC . '/html-api/class-wp-html-unsupported-exception.php'; require ABSPATH . WPINC . 
'/html-api/class-wp-html-active-formatting-elements.php'; diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index c40481ac18e45..523966d412d25 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -31,41 +31,32 @@ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase { * Skip specific tests that may not be supported or have known issues. */ const SKIP_TESTS = array( - 'adoption01/line0046' => 'Unimplemented: Reconstruction of active formatting elements.', - 'adoption01/line0159' => 'Unimplemented: Reconstruction of active formatting elements.', - 'adoption01/line0318' => 'Unimplemented: Reconstruction of active formatting elements.', - 'entities02/line0100' => 'Encoded characters without semicolon termination in attribute values are not handled properly', - 'entities02/line0114' => 'Encoded characters without semicolon termination in attribute values are not handled properly', - 'entities02/line0128' => 'Encoded characters without semicolon termination in attribute values are not handled properly', - 'entities02/line0142' => 'Encoded characters without semicolon termination in attribute values are not handled properly', - 'entities02/line0156' => 'Encoded characters without semicolon termination in attribute values are not handled properly', - 'inbody01/line0001' => 'Bug.', - 'inbody01/line0014' => 'Bug.', - 'inbody01/line0029' => 'Bug.', - 'menuitem-element/line0012' => 'Bug.', - 'plain-text-unsafe/line0001' => 'HTML entities may be mishandled.', - 'plain-text-unsafe/line0105' => 'Binary.', - 'tests1/line0342' => "Closing P tag implicitly creates opener, which we don't visit.", - 'tests1/line0720' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests1/line0833' => 'Bug.', - 'tests15/line0001' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests15/line0022' => 
'Unimplemented: Reconstruction of active formatting elements.', - 'tests2/line0317' => 'HTML entities may be mishandled.', - 'tests2/line0408' => 'HTML entities may be mishandled.', - 'tests2/line0650' => 'Whitespace only test never enters "in body" parsing mode.', - 'tests20/line0497' => "Closing P tag implicitly creates opener, which we don't visit.", - 'tests23/line0001' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests23/line0041' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests23/line0069' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests23/line0101' => 'Unimplemented: Reconstruction of active formatting elements.', - 'tests25/line0169' => 'Bug.', - 'tests26/line0263' => 'Bug: An active formatting element should be created for a trailing text node.', - 'tests7/line0354' => 'Bug.', - 'tests8/line0001' => 'Bug.', - 'tests8/line0020' => 'Bug.', - 'tests8/line0037' => 'Bug.', - 'tests8/line0052' => 'Bug.', - 'webkit01/line0174' => 'Bug.', + 'adoption01/line0046' => 'Unimplemented: Reconstruction of active formatting elements.', + 'adoption01/line0159' => 'Unimplemented: Reconstruction of active formatting elements.', + 'adoption01/line0318' => 'Unimplemented: Reconstruction of active formatting elements.', + 'inbody01/line0001' => 'Bug.', + 'inbody01/line0014' => 'Bug.', + 'inbody01/line0029' => 'Bug.', + 'menuitem-element/line0012' => 'Bug.', + 'tests1/line0342' => "Closing P tag implicitly creates opener, which we don't visit.", + 'tests1/line0720' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests1/line0833' => 'Bug.', + 'tests15/line0001' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests15/line0022' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests2/line0650' => 'Whitespace only test never enters "in body" parsing mode.', + 'tests20/line0497' => "Closing P tag implicitly creates opener, which we don't 
visit.", + 'tests23/line0001' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests23/line0041' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests23/line0069' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests23/line0101' => 'Unimplemented: Reconstruction of active formatting elements.', + 'tests25/line0169' => 'Bug.', + 'tests26/line0263' => 'Bug: An active formatting element should be created for a trailing text node.', + 'tests7/line0354' => 'Bug.', + 'tests8/line0001' => 'Bug.', + 'tests8/line0020' => 'Bug.', + 'tests8/line0037' => 'Bug.', + 'tests8/line0052' => 'Bug.', + 'webkit01/line0174' => 'Bug.', ); @@ -107,10 +98,6 @@ public function data_external_html5lib_tests() { continue; } - if ( 'entities01.dat' === $entry || 'entities02.dat' === $entry ) { - continue; - } - foreach ( self::parse_html5_dat_testfile( $test_dir . $entry ) as $k => $test ) { // strip .dat extension from filename $test_suite = substr( $entry, 0, -4 );