From 15dca4effa8f1786176aa7acd6f3076c48f4a1fa Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Fri, 23 Aug 2024 15:40:15 +0000 Subject: [PATCH] HTML API: Add support for missing FRAMESET and "after" insertion modes. As part of work to add more spec support to the HTML API, this patch adds support for the FRAMESET-related insertion modes, as well as the set of missing after insertion modes. These modes run at the end of parsing a document, closing it and taking care of any lingering tags. Developed in https://github.com/wordpress/wordpress-develop/7165 Discussed in https://core.trac.wordpress.org/ticket/61576 Props dmsnell, jonsurrell. See #61576. git-svn-id: https://develop.svn.wordpress.org/trunk@58926 602fd350-edb4-49c9-b593-d223f7449a82 --- .../html-api/class-wp-html-processor.php | 334 +++++++++++++++++- .../html-api/wpHtmlProcessorHtml5lib.php | 3 + 2 files changed, 331 insertions(+), 6 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index ca7a4cf3e0e4f..168b8ace4c394 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -3972,7 +3972,71 @@ private function step_in_template(): bool { * @return bool Whether an element was found. */ private function step_after_body(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY . ' state.' ); + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * + * > Process the token using the rules for the "in body" insertion mode. + */ + case '#text': + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step_in_body(); + } + goto after_body_anything_else; + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->bail( 'Content outside of BODY is unsupported.' ); + break; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > An end tag whose tag name is "html" + * + * > If the parser was created as part of the HTML fragment parsing algorithm, + * > this is a parse error; ignore the token. (fragment case) + * > + * > Otherwise, switch the insertion mode to "after after body". + */ + case '-HTML': + if ( isset( $this->context_node ) ) { + return $this->step(); + } + + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY; + return true; + } + + /* + * > Parse error. Switch the insertion mode to "in body" and reprocess the token. + */ + after_body_anything_else: + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + return $this->step( self::REPROCESS_CURRENT_NODE ); } /** @@ -3991,7 +4055,109 @@ private function step_after_body(): bool { * @return bool Whether an element was found. */ private function step_in_frameset(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET . ' state.' ); + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * > + * > Insert the character. + * + * This algorithm effectively strips non-whitespace characters from text and inserts + * them under HTML. This is not supported at this time. + */ + case '#text': + $text = $this->get_modifiable_text(); + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step_in_body(); + } + $this->bail( 'Non-whitespace characters cannot be handled in frameset.' ); + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > A start tag whose tag name is "frameset" + */ + case '+FRAMESET': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > An end tag whose tag name is "frameset" + */ + case '-FRAMESET': + /* + * > If the current node is the root html element, then this is a parse error; + * > ignore the token. (fragment case) + */ + if ( $this->state->stack_of_open_elements->current_node_is( 'HTML' ) ) { + return $this->step(); + } + + /* + * > Otherwise, pop the current node from the stack of open elements. + */ + $this->state->stack_of_open_elements->pop(); + + /* + * > If the parser was not created as part of the HTML fragment parsing algorithm + * > (fragment case), and the current node is no longer a frameset element, then + * > switch the insertion mode to "after frameset". + */ + if ( ! isset( $this->context_node ) && ! $this->state->stack_of_open_elements->current_node_is( 'FRAMESET' ) ) { + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET; + } + + return true; + + /* + * > A start tag whose tag name is "frame" + * + * > Insert an HTML element for the token. Immediately pop the + * > current node off the stack of open elements. + * > + * > Acknowledge the token's self-closing flag, if it is set. + */ + case '+FRAME': + $this->insert_html_element( $this->state->current_token ); + $this->state->stack_of_open_elements->pop(); + return true; + + /* + * > A start tag whose tag name is "noframes" + */ + case '+NOFRAMES': + return $this->step_in_head(); + } + + // Parse error: ignore the token. + return $this->step(); } /** @@ -4010,7 +4176,67 @@ private function step_in_frameset(): bool { * @return bool Whether an element was found. */ private function step_after_frameset(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET . ' state.' ); + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + switch ( $op ) { + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * > + * > Insert the character. + * + * This algorithm effectively strips non-whitespace characters from text and inserts + * them under HTML. This is not supported at this time. + */ + case '#text': + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step_in_body(); + } + $this->bail( 'Non-whitespace characters cannot be handled in after frameset' ); + break; + + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->insert_html_element( $this->state->current_token ); + return true; + + /* + * > A DOCTYPE token + */ + case 'html': + // Parse error: ignore the token. + return $this->step(); + + /* + * > A start tag whose tag name is "html" + */ + case '+HTML': + return $this->step_in_body(); + + /* + * > An end tag whose tag name is "html" + */ + case '-HTML': + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET; + return true; + + /* + * > A start tag whose tag name is "noframes" + */ + case '+NOFRAMES': + return $this->step_in_head(); + } + + // Parse error: ignore the token. + return $this->step(); } /** @@ -4029,7 +4255,52 @@ private function step_after_frameset(): bool { * @return bool Whether an element was found. */ private function step_after_after_body(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY . ' state.' ); + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + switch ( $op ) { + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->bail( 'Content outside of HTML is unsupported.' ); + break; + + /* + * > A DOCTYPE token + * > A start tag whose tag name is "html" + * + * > Process the token using the rules for the "in body" insertion mode. + */ + case 'html': + case '+HTML': + return $this->step_in_body(); + + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * > + * > Process the token using the rules for the "in body" insertion mode. + */ + case '#text': + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step_in_body(); + } + goto after_after_body_anything_else; + break; + } + + /* + * > Parse error. Switch the insertion mode to "in body" and reprocess the token. + */ + after_after_body_anything_else: + $this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; + return $this->step( self::REPROCESS_CURRENT_NODE ); } /** @@ -4048,7 +4319,57 @@ private function step_after_after_body(): bool { * @return bool Whether an element was found. */ private function step_after_after_frameset(): bool { - $this->bail( 'No support for parsing in the ' . WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET . ' state.' ); + $tag_name = $this->get_token_name(); + $token_type = $this->get_token_type(); + $op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : ''; + $op = "{$op_sigil}{$tag_name}"; + + switch ( $op ) { + /* + * > A comment token + */ + case '#comment': + case '#funky-comment': + case '#presumptuous-tag': + $this->bail( 'Content outside of HTML is unsupported.' ); + break; + + /* + * > A DOCTYPE token + * > A start tag whose tag name is "html" + * + * > Process the token using the rules for the "in body" insertion mode. + */ + case 'html': + case '+HTML': + return $this->step_in_body(); + + /* + * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), + * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + * > + * > Process the token using the rules for the "in body" insertion mode. + * + * This algorithm effectively strips non-whitespace characters from text and inserts + * them under HTML. This is not supported at this time. + */ + case '#text': + $text = $this->get_modifiable_text(); + if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) { + return $this->step_in_body(); + } + $this->bail( 'Non-whitespace characters cannot be handled in after after frameset.' ); + break; + + /* + * > A start tag whose tag name is "noframes" + */ + case '+NOFRAMES': + return $this->step_in_head(); + } + + // Parse error: ignore the token. + return $this->step(); } /** @@ -4115,7 +4436,8 @@ private function step_in_foreign_content(): bool { */ case '#cdata-section': case '#comment': - case '#funky_comment': + case '#funky-comment': + case '#presumptuous-tag': $this->insert_foreign_element( $this->state->current_token, false ); return true; diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index e772ab39e6356..15d50d1934116 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -33,10 +33,13 @@ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase { 'tests1/line0692' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly', 'tests14/line0022' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 'tests14/line0055' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests19/line0488' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests19/line0500' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 'tests19/line0965' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.', 'tests19/line1079' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 'tests2/line0207' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 'tests2/line0686' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', + 'tests2/line0697' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 'tests2/line0709' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.', 'tests5/line0013' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.', 'tests5/line0077' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',