From c22cd4b5540911b32a6a8e8cdcf3389f2309244a Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Sun, 10 Dec 2023 15:17:01 +0100 Subject: [PATCH] HTML API: Avoid processing incomplete syntax elements. The HTML Tag Processor is able to know if it starts parsing a syntax element and reaches the end of the document before it reaches the end of the element. In these cases, after this patch, the processor will indicate this condition. For example, when processing `
get_node_text(); + * break; + * } + * } + * return trim( "# {$title}\n\n{$text_content}\n" ); + * + * ### Tokens and _modifiable text_. + * + * #### Special "atomic" HTML elements. + * + * Not all HTML elements are able to contain other elements inside of them. + * For instance, the contents inside a TITLE element are plaintext (except + * that character references like & will be decoded). This means that + * if the string `` appears inside a TITLE element, then it's not an + * image tag, but rather it's text describing an image tag. Likewise, the + * contents of a SCRIPT or STYLE element are handled entirely separately in + * a browser than the contents of other elements because they represent a + * different language than HTML. + * + * For these elements the Tag Processor treats the entire sequence as one, + * from the opening tag, including its contents, through its closing tag. + * This means that the it's not possible to match the closing tag for a + * SCRIPT element unless it's unexpected; the Tag Processor already matched + * it when it found the opening tag. + * + * The inner contents of these elements are that element's _modifiable text_. + * + * The special elements are: + * - `SCRIPT` whose contents are treated as raw plaintext but supports a legacy + * style of including Javascript inside of HTML comments to avoid accidentally + * closing the SCRIPT from inside a Javascript string. E.g. `console.log( '' )`. + * - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any + * character references are decoded. E.g. "1 &lt; 2 < 3" becomes "1 < 2 < 3". + * - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAME`, `STYLE` whose contents are treated as + * raw plaintext and left as-si. E.g. "1 &lt; 2 < 3" remains "1 &lt; 2 < 3". + * + * #### Other tokens with modifiable text. + * + * There are also non-elements which are atomic in nature and contain modifiable text. + * + * - `#text` nodes, whose entire token _is_ the modifiable text. + * - Comment nodes and nodes that became comments because of some syntax error. The + * text for these nodes is the portion of the comment inside of the syntax. E.g. for + * "<!-- comment -->" the text is " comment " (note that the spaces are part of it). + * - `CDATA` sections, whose text is the content inside of the section itself. E.g. for + * "<![CDATA[some content]]>" the text is "some content". + * - "Funky comments," which are a special case of invalid closing tags whose name is + * invalid. The text for these nodes is the text that a browser would transform into + * an HTML when parsing. E.g. for "</%post_author>" the text is "%post_author". + * + * And there are non-elements which are atomic in nature but have no modifiable text. + * - `DOCTYPE` nodes like "<DOCTYPE html>" which have no closing tag. + * - XML Processing instruction nodes like "<". + * - The empty end tag "<" which is ignored in the browser and DOM but exposed + * to the HTML API. + * * ## Design and limitations * * The Tag Processor is designed to linearly scan HTML documents and tokenize @@ -320,7 +400,8 @@ * @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive. * @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE. * @since 6.5.0 Pauses processor when input ends in an incomplete syntax token. - * Introduces "special" elements which act like void elements, e.g. STYLE. + * Introduces "special" elements which act like void elements, e.g. TITLE, STYLE. + * Allows scanning through all tokens and processing modifiable text, where applicable. */ class WP_HTML_Tag_Processor { /** @@ -396,12 +477,18 @@ class WP_HTML_Tag_Processor { /** * Specifies mode of operation of the parser at any given time. * - * | State | Meaning | - * | --------------|----------------------------------------------------------------------| - * | *Ready* | The parser is ready to run. | - * | *Complete* | There is nothing left to parse. | - * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. | - * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. | + * | State | Meaning | + * | ----------------|----------------------------------------------------------------------| + * | *Ready* | The parser is ready to run. | + * | *Complete* | There is nothing left to parse. | + * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. | + * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. | + * | *Text node* | Found a #text node; this is plaintext and modifiable. | + * | *CDATA node* | Found a CDATA section; this is modifiable. | + * | *PI node* | Found a Processing Instruction; this is modifiable. | + * | *Comment* | Found a comment or bogus comment; this is modifiable. | + * | *Presumptuous* | Found an empty tag closer: ``. | + * | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable. | * * @since 6.5.0 * @@ -409,6 +496,13 @@ class WP_HTML_Tag_Processor { * @see WP_HTML_Tag_Processor::STATE_COMPLETE * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE * @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG + * @see WP_HTML_Tag_Processor::STATE_TEXT_NODE + * @see WP_HTML_Tag_Processor::STATE_CDATA_NODE + * @see WP_HTML_Tag_Processor::STATE_PI_NODE + * @see WP_HTML_Tag_Processor::STATE_COMMENT + * @see WP_HTML_Tag_Processor::STATE_DOCTYPE + * @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG + * @see WP_HTML_Tag_Processor::STATE_FUNKY_COMMENT * * @var string */ @@ -490,6 +584,24 @@ class WP_HTML_Tag_Processor { */ private $tag_name_length; + /** + * Byte offset into input document where current modifiable text starts. + * + * @since 6.5.0 + * + * @var int + */ + private $text_starts_at; + + /** + * Byte length of modifiable text. + * + * @since 6.5.0 + * + * @var string + */ + private $text_length; + /** * Whether the current tag is an opening tag, e.g.
, or a closing tag, e.g.
. * @@ -705,8 +817,8 @@ public function next_tag( $query = null ) { * @return bool Whether a token was parsed. */ public function next_token() { - $this->get_updated_html(); $was_at = $this->bytes_already_parsed; + $this->get_updated_html(); // Don't proceed if there's nothing more to scan. if ( @@ -736,6 +848,14 @@ public function next_token() { return false; } + if ( + self::STATE_INCOMPLETE !== $this->parser_state && + self::STATE_COMPLETE !== $this->parser_state && + self::STATE_MATCHED_TAG !== $this->parser_state + ) { + return true; + } + // Parse all of its attributes. while ( $this->parse_next_attribute() ) { continue; @@ -762,7 +882,7 @@ public function next_token() { } $this->parser_state = self::STATE_MATCHED_TAG; $this->token_length = $tag_ends_at - $this->token_starts_at; - $this->bytes_already_parsed = $tag_ends_at; + $this->bytes_already_parsed = $tag_ends_at + 1; /* * For non-DATA sections which might contain text that looks like HTML tags but @@ -771,8 +891,8 @@ public function next_token() { */ $t = $this->html[ $this->tag_name_starts_at ]; if ( - ! $this->is_closing_tag && - ( + $this->is_closing_tag || + ! ( 'i' === $t || 'I' === $t || 'n' === $t || 'N' === $t || 's' === $t || 'S' === $t || @@ -780,38 +900,62 @@ public function next_token() { 'x' === $t || 'X' === $t ) ) { - $tag_name = $this->get_tag(); + return true; + } - if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) { - $this->parser_state = self::STATE_INCOMPLETE; - $this->bytes_already_parsed = $was_at; + $tag_name = $this->get_tag(); + if ( + 'SCRIPT' !== $tag_name && + 'TEXTAREA' !== $tag_name && + 'TITLE' !== $tag_name && + 'IFRAME' !== $tag_name && + 'NOEMBED' !== $tag_name && + 'NOFRAMES' !== $tag_name && + 'STYLE' !== $tag_name && + 'XMP' !== $tag_name + ) { + return true; + } - return false; - } elseif ( - ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) && - ! $this->skip_rcdata( $tag_name ) - ) { - $this->parser_state = self::STATE_INCOMPLETE; - $this->bytes_already_parsed = $was_at; + // Preserve the opening tag pointers. + $tag_name_starts_at = $this->tag_name_starts_at; + $tag_name_length = $this->tag_name_length; + $tag_ends_at = $this->token_starts_at + $this->token_length; - return false; - } elseif ( - ( - 'IFRAME' === $tag_name || - 'NOEMBED' === $tag_name || - 'NOFRAMES' === $tag_name || - 'STYLE' === $tag_name || - 'XMP' === $tag_name - ) && - ! $this->skip_rawtext( $tag_name ) - ) { - $this->parser_state = self::STATE_INCOMPLETE; - $this->bytes_already_parsed = $was_at; + // Find the closing tag. + $found_closer = false; + switch ( $tag_name ) { + case 'SCRIPT': + $found_closer = $this->skip_script_data(); + break; - return false; - } + case 'TEXTAREA': + case 'TITLE': + $found_closer = $this->skip_rcdata( $tag_name ); + break; + + case 'IFRAME': + case 'NOEMBED': + case 'NOFRAMES': + case 'STYLE': + case 'XMP': + $found_closer = $this->skip_rawtext( $tag_name ); + break; } + if ( ! $found_closer ) { + $this->parser_state = self::STATE_INCOMPLETE; + $this->bytes_already_parsed = $was_at; + return false; + } + + $this->token_starts_at = $was_at; + $this->token_length = $this->bytes_already_parsed - $this->token_starts_at; + $this->text_starts_at = $tag_ends_at + 1; + $this->text_length = $this->tag_name_starts_at - $this->text_starts_at; + $this->tag_name_starts_at = $tag_name_starts_at; + $this->tag_name_length = $tag_name_length; + return true; } @@ -1007,7 +1151,7 @@ public function has_class( $wanted_class ) { */ public function set_bookmark( $name ) { // It only makes sense to set a bookmark if the parser has paused on a concrete token. - if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { + if ( self::STATE_INCOMPLETE === $this->parser_state ) { return false; } @@ -1082,15 +1226,15 @@ private function skip_rcdata( $tag_name ) { $at = $this->bytes_already_parsed; while ( false !== $at && $at < $doc_length ) { - $at = strpos( $this->html, 'html, 'tag_name_starts_at = $at; // Fail if there is no possible tag closer. if ( false === $at || ( $at + $tag_length ) >= $doc_length ) { return false; } - $closer_potentially_starts_at = $at; - $at += 2; + $at += 2; /* * Find a case-insensitive match to the tag name. @@ -1136,8 +1280,17 @@ private function skip_rcdata( $tag_name ) { return false; } - if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) { - $this->bytes_already_parsed = $closer_potentially_starts_at; + if ( '>' === $html[ $at ] ) { + $this->bytes_already_parsed = $at + 1; + return true; + } + + if ( $at + 1 >= strlen( $this->html ) ) { + return false; + } + + if ( '/' === $html[ $at ] && '>' === $html[ $at + 1 ] ) { + $this->bytes_already_parsed = $at + 2; return true; } } @@ -1259,6 +1412,7 @@ private function skip_script_data() { if ( $is_closing ) { $this->bytes_already_parsed = $closer_potentially_starts_at; + $this->tag_name_starts_at = $closer_potentially_starts_at; if ( $this->bytes_already_parsed >= $doc_length ) { return false; } @@ -1274,7 +1428,7 @@ private function skip_script_data() { } if ( '>' === $html[ $this->bytes_already_parsed ] ) { - $this->bytes_already_parsed = $closer_potentially_starts_at; + ++$this->bytes_already_parsed; return true; } } @@ -1303,17 +1457,34 @@ private function parse_next_tag() { $html = $this->html; $doc_length = strlen( $html ); - $at = $this->bytes_already_parsed; + $was_at = $this->bytes_already_parsed; + $at = $was_at; - while ( false !== $at && $at < $doc_length ) { + while ( false !== $at && $at <= $doc_length ) { $at = strpos( $html, '<', $at ); + if ( $at > $was_at ) { + $this->parser_state = self::STATE_TEXT_NODE; + $this->token_starts_at = $was_at; + $this->token_length = $at - $was_at; + $this->text_starts_at = $was_at; + $this->text_length = $this->token_length; + $this->bytes_already_parsed = $at; + return true; + } + /* * This does not imply an incomplete parse; it indicates that there * can be nothing left in the document other than a #text node. */ if ( false === $at ) { - return false; + $this->parser_state = self::STATE_TEXT_NODE; + $this->token_starts_at = $was_at; + $this->token_length = strlen( $html ) - $was_at; + $this->text_starts_at = $was_at; + $this->text_length = $this->token_length; + $this->bytes_already_parsed = strlen( $html ); + return true; } $this->token_starts_at = $at; @@ -1342,8 +1513,9 @@ private function parse_next_tag() { $tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 ); if ( $tag_name_prefix_length > 0 ) { ++$at; - $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length ); + $this->parser_state = self::STATE_MATCHED_TAG; $this->tag_name_starts_at = $at; + $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length ); $this->bytes_already_parsed = $at + $this->tag_name_length; return true; } @@ -1383,8 +1555,13 @@ private function parse_next_tag() { // Abruptly-closed empty comments are a sequence of dashes followed by `>`. $span_of_dashes = strspn( $html, '-', $closer_at ); if ( '>' === $html[ $closer_at + $span_of_dashes ] ) { - $at = $closer_at + $span_of_dashes + 1; - continue; + // @todo This could go wrong if the closer is shorter than `` because there's no inside. + $this->parser_state = self::STATE_COMMENT; + $this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 4; + $this->text_length = max( 0, $closer_at - $this->text_starts_at ); + $this->bytes_already_parsed = $closer_at + $span_of_dashes + 1; + return true; } /* @@ -1403,13 +1580,25 @@ private function parse_next_tag() { } if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) { - $at = $closer_at + 3; - continue 2; + $this->parser_state = self::STATE_COMMENT; + $this->token_length = $closer_at + 3 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 4; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 3; + return true; } - if ( $closer_at + 3 < $doc_length && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) { - $at = $closer_at + 4; - continue 2; + if ( + $closer_at + 3 < $doc_length && + '!' === $html[ $closer_at + 2 ] && + '>' === $html[ $closer_at + 3 ] + ) { + $this->parser_state = self::STATE_COMMENT; + $this->token_length = $closer_at + 4 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 4; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 4; + return true; } } } @@ -1436,8 +1625,12 @@ private function parse_next_tag() { return false; } - $at = $closer_at + 3; - continue; + $this->parser_state = self::STATE_CDATA_NODE; + $this->token_length = $closer_at + 4 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 9; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 3; + return true; } /* @@ -1462,8 +1655,12 @@ private function parse_next_tag() { return false; } - $at = $closer_at + 1; - continue; + $this->parser_state = self::STATE_DOCTYPE; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 9; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 1; + return true; } /* @@ -1477,8 +1674,6 @@ private function parse_next_tag() { return false; } - - continue; } /* @@ -1491,8 +1686,10 @@ private function parse_next_tag() { * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name */ if ( '>' === $html[ $at + 1 ] ) { - ++$at; - continue; + $this->parser_state = self::STATE_PRESUMPTUOUS_TAG; + $this->token_length = $at + 2 - $this->token_starts_at; + $this->bytes_already_parsed = $at + 2; + return true; } /* @@ -1507,8 +1704,12 @@ private function parse_next_tag() { return false; } - $at = $closer_at + 1; - continue; + $this->parser_state = self::STATE_DOCTYPE; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 2; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 1; + return true; } /* @@ -1530,8 +1731,12 @@ private function parse_next_tag() { return false; } - $at = $closer_at + 1; - continue; + $this->parser_state = self::STATE_FUNKY_COMMENT; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->text_starts_at = $this->token_starts_at + 2; + $this->text_length = $closer_at - $this->text_starts_at; + $this->bytes_already_parsed = $closer_at + 1; + return true; } ++$at; @@ -1692,6 +1897,8 @@ private function after_tag() { $this->token_length = null; $this->tag_name_starts_at = null; $this->tag_name_length = null; + $this->text_starts_at = 0; + $this->text_length = 0; $this->is_closing_tag = null; $this->attributes = array(); $this->duplicate_attributes = null; @@ -1895,7 +2102,7 @@ private function apply_attributes_updates( $shift_this_point = 0 ) { * replacements adjust offsets in the input document. */ foreach ( $this->bookmarks as $bookmark_name => $bookmark ) { - $bookmark_end = $bookmark->start + $bookmark->length; + $bookmark_end = $bookmark->start + $bookmark->length; /* * Each lexical update which appears before the bookmark's endpoints @@ -2281,6 +2488,70 @@ public function is_tag_closer() { ); } + public function get_node_type() { + switch ( $this->parser_state ) { + case self::STATE_MATCHED_TAG: + return '#tag'; + + case self::STATE_DOCTYPE: + return '#doctype'; + + case self::STATE_PI_NODE: + return '#processing-instruction'; + + default: + return $this->get_node_name(); + } + } + + public function get_node_name() { + switch ( $this->parser_state ) { + case self::STATE_MATCHED_TAG: + return $this->get_tag(); + + case self::STATE_TEXT_NODE: + return '#text'; + + case self::STATE_CDATA_NODE: + return '#cdata-section'; + + case self::STATE_PI_NODE: + // @todo add the PI tag. + return '?'; + + case self::STATE_COMMENT: + return '#comment'; + + case self::STATE_DOCTYPE: + return 'html'; + + case self::STATE_PRESUMPTUOUS_TAG: + return '#presumptuous-tag'; + + case self::STATE_FUNKY_COMMENT: + return '#funky-comment'; + } + } + + public function get_node_text() { + $at = $this->text_starts_at; + $length = $this->text_length; + + if ( self::STATE_MATCHED_TAG === $this->parser_state ) { + switch ( $this->get_tag() ) { + case 'PRE': + case 'TEXTAREA': + if ( "\n" === $this->html[ $at ] ) { + ++$at; + --$length; + } + break; + } + } + + return substr( $this->html, $at, $length ); + } + /** * Updates or creates a new attribute on the currently matched tag with the passed value. * @@ -2797,4 +3068,12 @@ private function matches() { * @access private */ const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG'; + + const STATE_TEXT_NODE = 'STATE_TEXT_NODE'; + const STATE_CDATA_NODE = 'STATE_CDATA_NODE'; + const STATE_PI_NODE = 'STATE_PI_NODE'; + const STATE_COMMENT = 'STATE_COMMENT'; + const STATE_DOCTYPE = 'STATE_DOCTYPE'; + const STATE_PRESUMPTUOUS_TAG = 'STATE_PRESUMPTUOUS_TAG'; + const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY'; } diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php index 8a681d2cb0042..355c03d941183 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php @@ -557,8 +557,10 @@ public function test_next_tag_should_stop_on_rcdata_and_script_tag_closers_when_ $p = new WP_HTML_Tag_Processor( '' ); $p->next_tag(); - $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the tag closer' ); - $this->assertTrue( $p->is_tag_closer(), 'Indicated a ' ); $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the tag closer when there was no tag opener' ); @@ -566,8 +568,10 @@ public function test_next_tag_should_stop_on_rcdata_and_script_tag_closers_when_ $p = new WP_HTML_Tag_Processor( '' ); $p->next_tag(); - $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the tag closer' ); - $this->assertTrue( $p->is_tag_closer(), 'Indicated a ' ); $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the tag closer when there was no tag opener' ); @@ -575,8 +579,10 @@ public function test_next_tag_should_stop_on_rcdata_and_script_tag_closers_when_ $p = new WP_HTML_Tag_Processor( 'abc' ); $p->next_tag(); - $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the tag closer' ); - $this->assertTrue( $p->is_tag_closer(), 'Indicated a tag opener is a tag closer' ); + $this->assertFalse( + $p->next_tag( array( 'tag_closers' => 'visit' ) ), + 'Should not have found closing TITLE when closing an opener.' + ); $p = new WP_HTML_Tag_Processor( 'abc' ); $this->assertTrue( $p->next_tag( array( 'tag_closers' => 'visit' ) ), 'Did not find the tag closer when there was no tag opener' );