Skip to content

Commit

Permalink
HTML API: Add support for missing FRAMESET and "after" insertion modes.
Browse files Browse the repository at this point in the history
As part of the work to add more spec support to the HTML API, this patch adds support for the FRAMESET-related insertion modes, as well as the set of missing "after" insertion modes. These modes run at the end of parsing a document, closing it and taking care of any lingering tags.

Developed in https://github.com/wordpress/wordpress-develop/7165
Discussed in https://core.trac.wordpress.org/ticket/61576

Props dmsnell, jonsurrell.
See #61576.


git-svn-id: https://develop.svn.wordpress.org/trunk@58926 602fd350-edb4-49c9-b593-d223f7449a82
  • Loading branch information
dmsnell committed Aug 23, 2024
1 parent 1139a51 commit 15dca4e
Show file tree
Hide file tree
Showing 2 changed files with 331 additions and 6 deletions.
334 changes: 328 additions & 6 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -3972,7 +3972,71 @@ private function step_in_template(): bool {
* @return bool Whether an element was found.
*/
private function step_after_body(): bool {
	/*
	 * Handles the current token according to the "after body" insertion mode.
	 *
	 * The token is reduced to an operation key: tags are prefixed with "+" (opener)
	 * or "-" (closer), while every other token type is matched by its bare token
	 * name, e.g. "#text" or "#comment".
	 */
	$tag_name   = $this->get_token_name();
	$token_type = $this->get_token_type();
	$op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
	$op         = "{$op_sigil}{$tag_name}";

	switch ( $op ) {
		/*
		 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
		 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
		 *
		 * > Process the token using the rules for the "in body" insertion mode.
		 */
		case '#text':
			$text = $this->get_modifiable_text();

			// The text is whitespace-only when the whitespace span covers its full length.
			if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
				return $this->step_in_body();
			}

			// Any other text is handled by the "anything else" rule below.
			goto after_body_anything_else;

		/*
		 * > A comment token
		 */
		case '#comment':
		case '#funky-comment':
		case '#presumptuous-tag':
			$this->bail( 'Content outside of BODY is unsupported.' );
			break;

		/*
		 * > A DOCTYPE token
		 *
		 * Matched by the lowercase, sigil-free name "html", which keeps it
		 * distinct from the "+HTML" and "-HTML" tag operations below.
		 */
		case 'html':
			// Parse error: ignore the token.
			return $this->step();

		/*
		 * > A start tag whose tag name is "html"
		 */
		case '+HTML':
			return $this->step_in_body();

		/*
		 * > An end tag whose tag name is "html"
		 *
		 * > If the parser was created as part of the HTML fragment parsing algorithm,
		 * > this is a parse error; ignore the token. (fragment case)
		 * >
		 * > Otherwise, switch the insertion mode to "after after body".
		 */
		case '-HTML':
			// A set context node is used here as the marker for the fragment case.
			if ( isset( $this->context_node ) ) {
				return $this->step();
			}

			$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY;
			return true;
	}

	/*
	 * > Parse error. Switch the insertion mode to "in body" and reprocess the token.
	 */
	after_body_anything_else:
	$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
	return $this->step( self::REPROCESS_CURRENT_NODE );
}

/**
Expand All @@ -3991,7 +4055,109 @@ private function step_after_body(): bool {
* @return bool Whether an element was found.
*/
private function step_in_frameset(): bool {
	/*
	 * Handles the current token according to the "in frameset" insertion mode.
	 *
	 * The token is reduced to an operation key: tags are prefixed with "+" (opener)
	 * or "-" (closer), while every other token type is matched by its bare token
	 * name, e.g. "#text" or "#comment".
	 */
	$tag_name   = $this->get_token_name();
	$token_type = $this->get_token_type();
	$op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
	$op         = "{$op_sigil}{$tag_name}";

	switch ( $op ) {
		/*
		 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
		 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
		 * >
		 * > Insert the character.
		 *
		 * This algorithm effectively strips non-whitespace characters from text and inserts
		 * them under HTML. This is not supported at this time.
		 */
		case '#text':
			$text = $this->get_modifiable_text();

			// The text is whitespace-only when the whitespace span covers its full length.
			if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
				return $this->step_in_body();
			}
			$this->bail( 'Non-whitespace characters cannot be handled in frameset.' );
			break;

		/*
		 * > A comment token
		 */
		case '#comment':
		case '#funky-comment':
		case '#presumptuous-tag':
			$this->insert_html_element( $this->state->current_token );
			return true;

		/*
		 * > A DOCTYPE token
		 *
		 * Matched by the lowercase, sigil-free name "html", which keeps it
		 * distinct from the "+HTML" tag operation below.
		 */
		case 'html':
			// Parse error: ignore the token.
			return $this->step();

		/*
		 * > A start tag whose tag name is "html"
		 */
		case '+HTML':
			return $this->step_in_body();

		/*
		 * > A start tag whose tag name is "frameset"
		 */
		case '+FRAMESET':
			$this->insert_html_element( $this->state->current_token );
			return true;

		/*
		 * > An end tag whose tag name is "frameset"
		 */
		case '-FRAMESET':
			/*
			 * > If the current node is the root html element, then this is a parse error;
			 * > ignore the token. (fragment case)
			 */
			if ( $this->state->stack_of_open_elements->current_node_is( 'HTML' ) ) {
				return $this->step();
			}

			/*
			 * > Otherwise, pop the current node from the stack of open elements.
			 */
			$this->state->stack_of_open_elements->pop();

			/*
			 * > If the parser was not created as part of the HTML fragment parsing algorithm
			 * > (fragment case), and the current node is no longer a frameset element, then
			 * > switch the insertion mode to "after frameset".
			 */
			if ( ! isset( $this->context_node ) && ! $this->state->stack_of_open_elements->current_node_is( 'FRAMESET' ) ) {
				$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET;
			}

			return true;

		/*
		 * > A start tag whose tag name is "frame"
		 *
		 * > Insert an HTML element for the token. Immediately pop the
		 * > current node off the stack of open elements.
		 * >
		 * > Acknowledge the token's self-closing flag, if it is set.
		 */
		case '+FRAME':
			$this->insert_html_element( $this->state->current_token );
			$this->state->stack_of_open_elements->pop();
			return true;

		/*
		 * > A start tag whose tag name is "noframes"
		 */
		case '+NOFRAMES':
			return $this->step_in_head();
	}

	// Parse error: ignore the token.
	return $this->step();
}

/**
Expand All @@ -4010,7 +4176,67 @@ private function step_in_frameset(): bool {
* @return bool Whether an element was found.
*/
private function step_after_frameset(): bool {
	/*
	 * Handles the current token according to the "after frameset" insertion mode.
	 *
	 * The token is reduced to an operation key: tags are prefixed with "+" (opener)
	 * or "-" (closer), while every other token type is matched by its bare token
	 * name, e.g. "#text" or "#comment".
	 */
	$tag_name   = $this->get_token_name();
	$token_type = $this->get_token_type();
	$op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
	$op         = "{$op_sigil}{$tag_name}";

	switch ( $op ) {
		/*
		 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
		 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
		 * >
		 * > Insert the character.
		 *
		 * This algorithm effectively strips non-whitespace characters from text and inserts
		 * them under HTML. This is not supported at this time.
		 */
		case '#text':
			$text = $this->get_modifiable_text();

			// The text is whitespace-only when the whitespace span covers its full length.
			if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
				return $this->step_in_body();
			}
			$this->bail( 'Non-whitespace characters cannot be handled in after frameset' );
			break;

		/*
		 * > A comment token
		 */
		case '#comment':
		case '#funky-comment':
		case '#presumptuous-tag':
			$this->insert_html_element( $this->state->current_token );
			return true;

		/*
		 * > A DOCTYPE token
		 *
		 * Matched by the lowercase, sigil-free name "html", which keeps it
		 * distinct from the "+HTML" and "-HTML" tag operations below.
		 */
		case 'html':
			// Parse error: ignore the token.
			return $this->step();

		/*
		 * > A start tag whose tag name is "html"
		 */
		case '+HTML':
			return $this->step_in_body();

		/*
		 * > An end tag whose tag name is "html"
		 */
		case '-HTML':
			$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET;
			return true;

		/*
		 * > A start tag whose tag name is "noframes"
		 */
		case '+NOFRAMES':
			return $this->step_in_head();
	}

	// Parse error: ignore the token.
	return $this->step();
}

/**
Expand All @@ -4029,7 +4255,52 @@ private function step_after_frameset(): bool {
* @return bool Whether an element was found.
*/
private function step_after_after_body(): bool {
	/*
	 * Handles the current token according to the "after after body" insertion mode.
	 *
	 * The token is reduced to an operation key: tags are prefixed with "+" (opener)
	 * or "-" (closer), while every other token type is matched by its bare token
	 * name, e.g. "#text" or "#comment".
	 */
	$tag_name   = $this->get_token_name();
	$token_type = $this->get_token_type();
	$op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
	$op         = "{$op_sigil}{$tag_name}";

	switch ( $op ) {
		/*
		 * > A comment token
		 */
		case '#comment':
		case '#funky-comment':
		case '#presumptuous-tag':
			$this->bail( 'Content outside of HTML is unsupported.' );
			break;

		/*
		 * > A DOCTYPE token
		 * > A start tag whose tag name is "html"
		 *
		 * > Process the token using the rules for the "in body" insertion mode.
		 *
		 * The DOCTYPE is matched by the lowercase, sigil-free name "html".
		 */
		case 'html':
		case '+HTML':
			return $this->step_in_body();

		/*
		 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
		 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
		 * >
		 * > Process the token using the rules for the "in body" insertion mode.
		 */
		case '#text':
			$text = $this->get_modifiable_text();

			// The text is whitespace-only when the whitespace span covers its full length.
			if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
				return $this->step_in_body();
			}

			// Any other text is handled by the "anything else" rule below.
			goto after_after_body_anything_else;
	}

	/*
	 * > Parse error. Switch the insertion mode to "in body" and reprocess the token.
	 */
	after_after_body_anything_else:
	$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
	return $this->step( self::REPROCESS_CURRENT_NODE );
}

/**
Expand All @@ -4048,7 +4319,57 @@ private function step_after_after_body(): bool {
* @return bool Whether an element was found.
*/
private function step_after_after_frameset(): bool {
	/*
	 * Handles the current token according to the "after after frameset" insertion mode.
	 *
	 * The token is reduced to an operation key: tags are prefixed with "+" (opener)
	 * or "-" (closer), while every other token type is matched by its bare token
	 * name, e.g. "#text" or "#comment".
	 */
	$tag_name   = $this->get_token_name();
	$token_type = $this->get_token_type();
	$op_sigil   = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
	$op         = "{$op_sigil}{$tag_name}";

	switch ( $op ) {
		/*
		 * > A comment token
		 */
		case '#comment':
		case '#funky-comment':
		case '#presumptuous-tag':
			$this->bail( 'Content outside of HTML is unsupported.' );
			break;

		/*
		 * > A DOCTYPE token
		 * > A start tag whose tag name is "html"
		 *
		 * > Process the token using the rules for the "in body" insertion mode.
		 *
		 * The DOCTYPE is matched by the lowercase, sigil-free name "html".
		 */
		case 'html':
		case '+HTML':
			return $this->step_in_body();

		/*
		 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
		 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
		 * >
		 * > Process the token using the rules for the "in body" insertion mode.
		 *
		 * This algorithm effectively strips non-whitespace characters from text and inserts
		 * them under HTML. This is not supported at this time.
		 */
		case '#text':
			$text = $this->get_modifiable_text();

			// The text is whitespace-only when the whitespace span covers its full length.
			if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
				return $this->step_in_body();
			}
			$this->bail( 'Non-whitespace characters cannot be handled in after after frameset.' );
			break;

		/*
		 * > A start tag whose tag name is "noframes"
		 */
		case '+NOFRAMES':
			return $this->step_in_head();
	}

	// Parse error: ignore the token.
	return $this->step();
}

/**
Expand Down Expand Up @@ -4115,7 +4436,8 @@ private function step_in_foreign_content(): bool {
*/
case '#cdata-section':
case '#comment':
case '#funky_comment':
case '#funky-comment':
case '#presumptuous-tag':
$this->insert_foreign_element( $this->state->current_token, false );
return true;

Expand Down
3 changes: 3 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,13 @@ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase {
'tests1/line0692' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly',
'tests14/line0022' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
'tests14/line0055' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
'tests19/line0488' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
'tests19/line0500' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
'tests19/line0965' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',
'tests19/line1079' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
'tests2/line0207' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
'tests2/line0686' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
'tests2/line0697' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
'tests2/line0709' => 'Unimplemented: This parser does not add missing attributes to existing HTML or BODY tags.',
'tests5/line0013' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',
'tests5/line0077' => 'Bug: Mixed whitespace, non-whitespace text in head not split correctly.',
Expand Down

0 comments on commit 15dca4e

Please sign in to comment.