diff --git a/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-processor-6-5.php b/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-processor-6-5.php
index eae2b1eda815ca..7e83a54e708093 100644
--- a/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-processor-6-5.php
+++ b/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-processor-6-5.php
@@ -101,18 +101,18 @@
*
* - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY.
* - Custom elements: All custom elements are supported. :)
- * - Form elements: BUTTON, DATALIST, FIELDSET, LABEL, LEGEND, METER, PROGRESS, SEARCH.
- * - Formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U.
+ * - Form elements: BUTTON, DATALIST, FIELDSET, INPUT, LABEL, LEGEND, METER, PROGRESS, SEARCH.
+ * - Formatting elements: B, BIG, CODE, EM, FONT, I, PRE, SMALL, STRIKE, STRONG, TT, U, WBR.
* - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP.
* - Links: A.
- * - Lists: DD, DL, DT, LI, OL, LI.
- * - Media elements: AUDIO, CANVAS, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, VIDEO.
- * - Paragraph: P.
- * - Phrasing elements: ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR.
- * - Sectioning elements: ARTICLE, ASIDE, NAV, SECTION.
+ * - Lists: DD, DL, DT, LI, OL, UL.
+ * - Media elements: AUDIO, CANVAS, EMBED, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, SOURCE, TRACK, VIDEO.
+ * - Paragraph: BR, P.
+ * - Phrasing elements: ABBR, AREA, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR.
+ * - Sectioning elements: ARTICLE, ASIDE, HR, NAV, SECTION.
* - Templating elements: SLOT.
* - Text decoration: RUBY.
- * - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, MULTICOL, NEXTID, SPACER.
+ * - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, KEYGEN, LISTING, MULTICOL, NEXTID, PARAM, SPACER.
*
* ### Supported markup
*
@@ -149,17 +149,6 @@ class Gutenberg_HTML_Processor_6_5 extends Gutenberg_HTML_Tag_Processor_6_5 {
*/
const MAX_BOOKMARKS = 100;
- /**
- * Static query for instructing the Tag Processor to visit every token.
- *
- * @access private
- *
- * @since 6.4.0
- *
- * @var array
- */
- const VISIT_EVERYTHING = array( 'tag_closers' => 'visit' );
-
/**
* Holds the working state of the parser, including the stack of
* open elements and the stack of active formatting elements.
@@ -424,6 +413,30 @@ public function next_tag( $query = null ) {
return false;
}
+ /**
+ * Ensures internal accounting is maintained for HTML semantic rules while
+ * the underlying Tag Processor class is seeking to a bookmark.
+ *
+ * This doesn't currently have a way to represent non-tags and doesn't process
+ * semantic rules for text nodes. For access to the raw tokens consider using
+ * WP_HTML_Tag_Processor instead.
+ *
+ * @since 6.5.0 Added for internal support; do not use.
+ *
+ * @access private
+ *
+ * @return bool
+ */
+ public function next_token() {
+ $found_a_token = parent::next_token();
+
+ if ( '#tag' === $this->get_token_type() ) {
+ $this->step( self::REPROCESS_CURRENT_NODE );
+ }
+
+ return $found_a_token;
+ }
+
/**
* Indicates if the currently-matched tag matches the given breadcrumbs.
*
@@ -520,7 +533,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
$this->state->stack_of_open_elements->pop();
}
- parent::next_tag( self::VISIT_EVERYTHING );
+ while ( parent::next_token() && '#tag' !== $this->get_token_type() ) {
+ continue;
+ }
}
// Finish stepping when there are no more tokens in the document.
@@ -684,10 +699,12 @@ private function step_in_body() {
case '-FOOTER':
case '-HEADER':
case '-HGROUP':
+ case '-LISTING':
case '-MAIN':
case '-MENU':
case '-NAV':
case '-OL':
+ case '-PRE':
case '-SEARCH':
case '-SECTION':
case '-SUMMARY':
@@ -732,6 +749,18 @@ private function step_in_body() {
$this->insert_html_element( $this->state->current_token );
return true;
+ /*
+ * > A start tag whose tag name is one of: "pre", "listing"
+ */
+ case '+PRE':
+ case '+LISTING':
+ if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+ $this->close_a_p_element();
+ }
+ $this->insert_html_element( $this->state->current_token );
+ $this->state->frameset_ok = false;
+ return true;
+
/*
* > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
*/
@@ -934,11 +963,64 @@ private function step_in_body() {
$this->run_adoption_agency_algorithm();
return true;
+ /*
+ * > An end tag whose tag name is "br"
+ * > Parse error. Drop the attributes from the token, and act as described in the next
+ * > entry; i.e. act as if this was a "br" start tag token with no attributes, rather
+ * > than the end tag token that it actually is.
+ */
+ case '-BR':
+ $this->last_error = self::ERROR_UNSUPPORTED;
+ throw new WP_HTML_Unsupported_Exception( 'Closing BR tags require unimplemented special handling.' );
+
/*
* > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr"
*/
+ case '+AREA':
+ case '+BR':
+ case '+EMBED':
case '+IMG':
+ case '+KEYGEN':
+ case '+WBR':
$this->reconstruct_active_formatting_elements();
+ $this->insert_html_element( $this->state->current_token );
+ $this->state->frameset_ok = false;
+ return true;
+
+ /*
+ * > A start tag whose tag name is "input"
+ */
+ case '+INPUT':
+ $this->reconstruct_active_formatting_elements();
+ $this->insert_html_element( $this->state->current_token );
+ $type_attribute = $this->get_attribute( 'type' );
+ /*
+ * > If the token does not have an attribute with the name "type", or if it does,
+ * > but that attribute's value is not an ASCII case-insensitive match for the
+ * > string "hidden", then: set the frameset-ok flag to "not ok".
+ */
+ if ( ! is_string( $type_attribute ) || 'hidden' !== strtolower( $type_attribute ) ) {
+ $this->state->frameset_ok = false;
+ }
+ return true;
+
+ /*
+ * > A start tag whose tag name is "hr"
+ */
+ case '+HR':
+ if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+ $this->close_a_p_element();
+ }
+ $this->insert_html_element( $this->state->current_token );
+ $this->state->frameset_ok = false;
+ return true;
+
+ /*
+ * > A start tag whose tag name is one of: "param", "source", "track"
+ */
+ case '+PARAM':
+ case '+SOURCE':
+ case '+TRACK':
$this->insert_html_element( $this->state->current_token );
return true;
}
@@ -961,30 +1043,20 @@ private function step_in_body() {
*/
switch ( $tag_name ) {
case 'APPLET':
- case 'AREA':
case 'BASE':
case 'BASEFONT':
case 'BGSOUND':
case 'BODY':
- case 'BR':
case 'CAPTION':
case 'COL':
case 'COLGROUP':
- case 'DD':
- case 'DT':
- case 'EMBED':
case 'FORM':
case 'FRAME':
case 'FRAMESET':
case 'HEAD':
- case 'HR':
case 'HTML':
case 'IFRAME':
- case 'INPUT':
- case 'KEYGEN':
- case 'LI':
case 'LINK':
- case 'LISTING':
case 'MARQUEE':
case 'MATH':
case 'META':
@@ -993,12 +1065,9 @@ private function step_in_body() {
case 'NOFRAMES':
case 'NOSCRIPT':
case 'OBJECT':
- case 'OL':
case 'OPTGROUP':
case 'OPTION':
- case 'PARAM':
case 'PLAINTEXT':
- case 'PRE':
case 'RB':
case 'RP':
case 'RT':
@@ -1006,7 +1075,6 @@ private function step_in_body() {
case 'SARCASM':
case 'SCRIPT':
case 'SELECT':
- case 'SOURCE':
case 'STYLE':
case 'SVG':
case 'TABLE':
@@ -1019,9 +1087,6 @@ private function step_in_body() {
case 'THEAD':
case 'TITLE':
case 'TR':
- case 'TRACK':
- case 'UL':
- case 'WBR':
case 'XMP':
$this->last_error = self::ERROR_UNSUPPORTED;
throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
@@ -1675,14 +1740,19 @@ public static function is_void( $tag_name ) {
return (
'AREA' === $tag_name ||
'BASE' === $tag_name ||
+ 'BASEFONT' === $tag_name || // Obsolete but still treated as void.
+ 'BGSOUND' === $tag_name || // Obsolete but still treated as void.
'BR' === $tag_name ||
'COL' === $tag_name ||
'EMBED' === $tag_name ||
+ 'FRAME' === $tag_name ||
'HR' === $tag_name ||
'IMG' === $tag_name ||
'INPUT' === $tag_name ||
+ 'KEYGEN' === $tag_name || // Obsolete but still treated as void.
'LINK' === $tag_name ||
'META' === $tag_name ||
+ 'PARAM' === $tag_name || // Obsolete but still treated as void.
'SOURCE' === $tag_name ||
'TRACK' === $tag_name ||
'WBR' === $tag_name
diff --git a/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-tag-processor-6-5.php b/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-tag-processor-6-5.php
index de3823d2b2703b..1d2430cd455784 100644
--- a/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-tag-processor-6-5.php
+++ b/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-tag-processor-6-5.php
@@ -247,6 +247,95 @@
* }
* }
*
+ * ## Tokens and finer-grained processing.
+ *
+ * It's possible to scan through every lexical token in the
+ * HTML document using the `next_token()` function. This
+ * alternative form takes no argument and provides no built-in
+ * query syntax.
+ *
+ * Example:
+ *
+ * $title = '(untitled)';
+ * $text = '';
+ * while ( $processor->next_token() ) {
+ * switch ( $processor->get_token_name() ) {
+ * case '#text':
+ * $text .= $processor->get_modifiable_text();
+ * break;
+ *
+ * case 'BR':
+ * $text .= "\n";
+ * break;
+ *
+ * case 'TITLE':
+ * $title = $processor->get_modifiable_text();
+ * break;
+ * }
+ * }
+ * return trim( "# {$title}\n\n{$text}" );
+ *
+ * ### Tokens and _modifiable text_.
+ *
+ * #### Special "atomic" HTML elements.
+ *
+ * Not all HTML elements are able to contain other elements inside of them.
+ * For instance, the contents inside a TITLE element are plaintext (except
+ * that character references like & will be decoded). This means that
+ * if the string `` appears inside a TITLE element, then it's not an
+ * image tag, but rather it's text describing an image tag. Likewise, the
+ * contents of a SCRIPT or STYLE element are handled entirely separately in
+ * a browser than the contents of other elements because they represent a
+ * different language than HTML.
+ *
+ * For these elements the Tag Processor treats the entire sequence as one,
+ * from the opening tag, including its contents, through its closing tag.
+ * This means that the it's not possible to match the closing tag for a
+ * SCRIPT element unless it's unexpected; the Tag Processor already matched
+ * it when it found the opening tag.
+ *
+ * The inner contents of these elements are that element's _modifiable text_.
+ *
+ * The special elements are:
+ * - `SCRIPT` whose contents are treated as raw plaintext but supports a legacy
+ * style of including Javascript inside of HTML comments to avoid accidentally
+ * closing the SCRIPT from inside a Javascript string. E.g. `console.log( '' )`.
+ * - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any
+ * character references are decoded. E.g. `1 < 2 < 3` becomes `1 < 2 < 3`.
+ * - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAME`, `STYLE` whose contents are treated as
+ * raw plaintext and left as-is. E.g. `1 < 2 < 3` remains `1 < 2 < 3`.
+ *
+ * #### Other tokens with modifiable text.
+ *
+ * There are also non-elements which are void/self-closing in nature and contain
+ * modifiable text that is part of that individual syntax token itself.
+ *
+ * - `#text` nodes, whose entire token _is_ the modifiable text.
+ * - HTML comments and tokens that become comments due to some syntax error. The
+ * text for these tokens is the portion of the comment inside of the syntax.
+ * E.g. for `` the text is `" comment "` (note the spaces are included).
+ * - `CDATA` sections, whose text is the content inside of the section itself. E.g. for
+ * `` the text is `"some content"` (with restrictions [1]).
+ * - "Funky comments," which are a special case of invalid closing tags whose name is
+ * invalid. The text for these nodes is the text that a browser would transform into
+ * an HTML comment when parsing. E.g. for `%post_author>` the text is `%post_author`.
+ * - `DOCTYPE` declarations like `` which have no closing tag.
+ * - XML Processing instruction nodes like `` (with restrictions [2]).
+ * - The empty end tag `>` which is ignored in the browser and DOM.
+ *
+ * [1]: There are no CDATA sections in HTML. When encountering `` becomes a bogus HTML comment, meaning there can be no CDATA
+ * section in an HTML document containing `>`. The Tag Processor will first find
+ * all valid and bogus HTML comments, and then if the comment _would_ have been a
+ * CDATA section _were they to exist_, it will indicate this as the type of comment.
+ *
+ * [2]: XML allows a broader range of characters in a processing instruction's target name
+ * and disallows "xml" as a name, since it's special. The Tag Processor only recognizes
+ * target names with an ASCII-representable subset of characters. It also exhibits the
+ * same constraint as with CDATA sections, in that `>` cannot exist within the token
+ * since Processing Instructions do no exist within HTML and their syntax transforms
+ * into a bogus comment in the DOM.
+ *
* ## Design and limitations
*
* The Tag Processor is designed to linearly scan HTML documents and tokenize
@@ -320,7 +409,8 @@
* @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive.
* @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE.
* @since 6.5.0 Pauses processor when input ends in an incomplete syntax token.
- * Introduces "special" elements which act like void elements, e.g. STYLE.
+ * Introduces "special" elements which act like void elements, e.g. TITLE, STYLE.
+ * Allows scanning through all tokens and processing modifiable text, where applicable.
*/
class Gutenberg_HTML_Tag_Processor_6_5 {
/**
@@ -396,23 +486,47 @@ class Gutenberg_HTML_Tag_Processor_6_5 {
/**
* Specifies mode of operation of the parser at any given time.
*
- * | State | Meaning |
- * | --------------|----------------------------------------------------------------------|
- * | *Ready* | The parser is ready to run. |
- * | *Complete* | There is nothing left to parse. |
- * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. |
- * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. |
+ * | State | Meaning |
+ * | ----------------|----------------------------------------------------------------------|
+ * | *Ready* | The parser is ready to run. |
+ * | *Complete* | There is nothing left to parse. |
+ * | *Incomplete* | The HTML ended in the middle of a token; nothing more can be parsed. |
+ * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes. |
+ * | *Text node* | Found a #text node; this is plaintext and modifiable. |
+ * | *CDATA node* | Found a CDATA section; this is modifiable. |
+ * | *Comment* | Found a comment or bogus comment; this is modifiable. |
+ * | *Presumptuous* | Found an empty tag closer: `>`. |
+ * | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable. |
*
* @since 6.5.0
*
* @see WP_HTML_Tag_Processor::STATE_READY
* @see WP_HTML_Tag_Processor::STATE_COMPLETE
- * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE
+ * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT
* @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG
+ * @see WP_HTML_Tag_Processor::STATE_TEXT_NODE
+ * @see WP_HTML_Tag_Processor::STATE_CDATA_NODE
+ * @see WP_HTML_Tag_Processor::STATE_COMMENT
+ * @see WP_HTML_Tag_Processor::STATE_DOCTYPE
+ * @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG
+ * @see WP_HTML_Tag_Processor::STATE_FUNKY_COMMENT
*
* @var string
*/
- private $parser_state = self::STATE_READY;
+ protected $parser_state = self::STATE_READY;
+
+ /**
+ * What kind of syntax token became an HTML comment.
+ *
+ * Since there are many ways in which HTML syntax can create an HTML comment,
+ * this indicates which of those caused it. This allows the Tag Processor to
+ * represent more from the original input document than would appear in the DOM.
+ *
+ * @since 6.5.0
+ *
+ * @var string|null
+ */
+ protected $comment_type = null;
/**
* How many bytes from the original HTML document have been read and parsed.
@@ -490,6 +604,24 @@ class Gutenberg_HTML_Tag_Processor_6_5 {
*/
private $tag_name_length;
+ /**
+ * Byte offset into input document where current modifiable text starts.
+ *
+ * @since 6.5.0
+ *
+ * @var int
+ */
+ private $text_starts_at;
+
+ /**
+ * Byte length of modifiable text.
+ *
+ * @since 6.5.0
+ *
+ * @var string
+ */
+ private $text_length;
+
/**
* Whether the current tag is an opening tag, e.g. , or a closing tag, e.g.
.
*
@@ -705,13 +837,13 @@ public function next_tag( $query = null ) {
* @return bool Whether a token was parsed.
*/
public function next_token() {
- $this->get_updated_html();
$was_at = $this->bytes_already_parsed;
+ $this->get_updated_html();
// Don't proceed if there's nothing more to scan.
if (
self::STATE_COMPLETE === $this->parser_state ||
- self::STATE_INCOMPLETE === $this->parser_state
+ self::STATE_INCOMPLETE_INPUT === $this->parser_state
) {
return false;
}
@@ -729,13 +861,27 @@ public function next_token() {
// Find the next tag if it exists.
if ( false === $this->parse_next_tag() ) {
- if ( self::STATE_INCOMPLETE === $this->parser_state ) {
+ if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) {
$this->bytes_already_parsed = $was_at;
}
return false;
}
+ /*
+ * For legacy reasons the rest of this function handles tags and their
+ * attributes. If the processor has reached the end of the document
+ * or if it matched any other token then it should return here to avoid
+ * attempting to process tag-specific syntax.
+ */
+ if (
+ self::STATE_INCOMPLETE_INPUT !== $this->parser_state &&
+ self::STATE_COMPLETE !== $this->parser_state &&
+ self::STATE_MATCHED_TAG !== $this->parser_state
+ ) {
+ return true;
+ }
+
// Parse all of its attributes.
while ( $this->parse_next_attribute() ) {
continue;
@@ -743,11 +889,11 @@ public function next_token() {
// Ensure that the tag closes before the end of the document.
if (
- self::STATE_INCOMPLETE === $this->parser_state ||
+ self::STATE_INCOMPLETE_INPUT === $this->parser_state ||
$this->bytes_already_parsed >= strlen( $this->html )
) {
// Does this appropriately clear state (parsed attributes)?
- $this->parser_state = self::STATE_INCOMPLETE;
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
$this->bytes_already_parsed = $was_at;
return false;
@@ -755,14 +901,14 @@ public function next_token() {
$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
if ( false === $tag_ends_at ) {
- $this->parser_state = self::STATE_INCOMPLETE;
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
$this->bytes_already_parsed = $was_at;
return false;
}
$this->parser_state = self::STATE_MATCHED_TAG;
$this->token_length = $tag_ends_at - $this->token_starts_at;
- $this->bytes_already_parsed = $tag_ends_at;
+ $this->bytes_already_parsed = $tag_ends_at + 1;
/*
* For non-DATA sections which might contain text that looks like HTML tags but
@@ -771,8 +917,8 @@ public function next_token() {
*/
$t = $this->html[ $this->tag_name_starts_at ];
if (
- ! $this->is_closing_tag &&
- (
+ $this->is_closing_tag ||
+ ! (
'i' === $t || 'I' === $t ||
'n' === $t || 'N' === $t ||
's' === $t || 'S' === $t ||
@@ -780,38 +926,81 @@ public function next_token() {
'x' === $t || 'X' === $t
)
) {
- $tag_name = $this->get_tag();
+ return true;
+ }
- if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) {
- $this->parser_state = self::STATE_INCOMPLETE;
- $this->bytes_already_parsed = $was_at;
+ $tag_name = $this->get_tag();
- return false;
- } elseif (
- ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) &&
- ! $this->skip_rcdata( $tag_name )
- ) {
- $this->parser_state = self::STATE_INCOMPLETE;
- $this->bytes_already_parsed = $was_at;
+ /*
+ * Preserve the opening tag pointers, as these will be overwritten
+ * when finding the closing tag. They will be reset after finding
+ * the closing to tag to point to the opening of the special atomic
+ * tag sequence.
+ */
+ $tag_name_starts_at = $this->tag_name_starts_at;
+ $tag_name_length = $this->tag_name_length;
+ $tag_ends_at = $this->token_starts_at + $this->token_length;
+ $attributes = $this->attributes;
+ $duplicate_attributes = $this->duplicate_attributes;
+
+ // Find the closing tag if necessary.
+ $found_closer = false;
+ switch ( $tag_name ) {
+ case 'SCRIPT':
+ $found_closer = $this->skip_script_data();
+ break;
- return false;
- } elseif (
- (
- 'IFRAME' === $tag_name ||
- 'NOEMBED' === $tag_name ||
- 'NOFRAMES' === $tag_name ||
- 'STYLE' === $tag_name ||
- 'XMP' === $tag_name
- ) &&
- ! $this->skip_rawtext( $tag_name )
- ) {
- $this->parser_state = self::STATE_INCOMPLETE;
- $this->bytes_already_parsed = $was_at;
+ case 'TEXTAREA':
+ case 'TITLE':
+ $found_closer = $this->skip_rcdata( $tag_name );
+ break;
- return false;
- }
+ /*
+ * In the browser this list would include the NOSCRIPT element,
+ * but the Tag Processor is an environment with the scripting
+ * flag disabled, meaning that it needs to descend into the
+ * NOSCRIPT element to be able to properly process what will be
+ * sent to a browser.
+ *
+ * Note that this rule makes HTML5 syntax incompatible with XML,
+ * because the parsing of this token depends on client application.
+ * The NOSCRIPT element cannot be represented in the XHTML syntax.
+ */
+ case 'IFRAME':
+ case 'NOEMBED':
+ case 'NOFRAMES':
+ case 'STYLE':
+ case 'XMP':
+ $found_closer = $this->skip_rawtext( $tag_name );
+ break;
+
+ // No other tags should be treated in their entirety here.
+ default:
+ return true;
}
+ if ( ! $found_closer ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
+ $this->bytes_already_parsed = $was_at;
+ return false;
+ }
+
+ /*
+ * The values here look like they reference the opening tag but they reference
+ * the closing tag instead. This is why the opening tag values were stored
+ * above in a variable. It reads confusingly here, but that's because the
+ * functions that skip the contents have moved all the internal cursors past
+ * the inner content of the tag.
+ */
+ $this->token_starts_at = $was_at;
+ $this->token_length = $this->bytes_already_parsed - $this->token_starts_at;
+ $this->text_starts_at = $tag_ends_at + 1;
+ $this->text_length = $this->tag_name_starts_at - $this->text_starts_at;
+ $this->tag_name_starts_at = $tag_name_starts_at;
+ $this->tag_name_length = $tag_name_length;
+ $this->attributes = $attributes;
+ $this->duplicate_attributes = $duplicate_attributes;
+
return true;
}
@@ -830,7 +1019,7 @@ public function next_token() {
* @return bool Whether the parse paused at the start of an incomplete token.
*/
public function paused_at_incomplete_token() {
- return self::STATE_INCOMPLETE === $this->parser_state;
+ return self::STATE_INCOMPLETE_INPUT === $this->parser_state;
}
/**
@@ -1007,7 +1196,10 @@ public function has_class( $wanted_class ) {
*/
public function set_bookmark( $name ) {
// It only makes sense to set a bookmark if the parser has paused on a concrete token.
- if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
+ if (
+ self::STATE_COMPLETE === $this->parser_state ||
+ self::STATE_INCOMPLETE_INPUT === $this->parser_state
+ ) {
return false;
}
@@ -1082,15 +1274,15 @@ private function skip_rcdata( $tag_name ) {
$at = $this->bytes_already_parsed;
while ( false !== $at && $at < $doc_length ) {
- $at = strpos( $this->html, '', $at );
+ $at = strpos( $this->html, '', $at );
+ $this->tag_name_starts_at = $at;
// Fail if there is no possible tag closer.
if ( false === $at || ( $at + $tag_length ) >= $doc_length ) {
return false;
}
- $closer_potentially_starts_at = $at;
- $at += 2;
+ $at += 2;
/*
* Find a case-insensitive match to the tag name.
@@ -1131,13 +1323,23 @@ private function skip_rcdata( $tag_name ) {
while ( $this->parse_next_attribute() ) {
continue;
}
+
$at = $this->bytes_already_parsed;
if ( $at >= strlen( $this->html ) ) {
return false;
}
- if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) {
- $this->bytes_already_parsed = $closer_potentially_starts_at;
+ if ( '>' === $html[ $at ] ) {
+ $this->bytes_already_parsed = $at + 1;
+ return true;
+ }
+
+ if ( $at + 1 >= strlen( $this->html ) ) {
+ return false;
+ }
+
+ if ( '/' === $html[ $at ] && '>' === $html[ $at + 1 ] ) {
+ $this->bytes_already_parsed = $at + 2;
return true;
}
}
@@ -1259,6 +1461,7 @@ private function skip_script_data() {
if ( $is_closing ) {
$this->bytes_already_parsed = $closer_potentially_starts_at;
+ $this->tag_name_starts_at = $closer_potentially_starts_at;
if ( $this->bytes_already_parsed >= $doc_length ) {
return false;
}
@@ -1268,13 +1471,13 @@ private function skip_script_data() {
}
if ( $this->bytes_already_parsed >= $doc_length ) {
- $this->parser_state = self::STATE_INCOMPLETE;
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
if ( '>' === $html[ $this->bytes_already_parsed ] ) {
- $this->bytes_already_parsed = $closer_potentially_starts_at;
+ ++$this->bytes_already_parsed;
return true;
}
}
@@ -1303,17 +1506,34 @@ private function parse_next_tag() {
$html = $this->html;
$doc_length = strlen( $html );
- $at = $this->bytes_already_parsed;
+ $was_at = $this->bytes_already_parsed;
+ $at = $was_at;
while ( false !== $at && $at < $doc_length ) {
$at = strpos( $html, '<', $at );
+ if ( $at > $was_at ) {
+ $this->parser_state = self::STATE_TEXT_NODE;
+ $this->token_starts_at = $was_at;
+ $this->token_length = $at - $was_at;
+ $this->text_starts_at = $was_at;
+ $this->text_length = $this->token_length;
+ $this->bytes_already_parsed = $at;
+ return true;
+ }
+
/*
* This does not imply an incomplete parse; it indicates that there
* can be nothing left in the document other than a #text node.
*/
if ( false === $at ) {
- return false;
+ $this->parser_state = self::STATE_TEXT_NODE;
+ $this->token_starts_at = $was_at;
+ $this->token_length = strlen( $html ) - $was_at;
+ $this->text_starts_at = $was_at;
+ $this->text_length = $this->token_length;
+ $this->bytes_already_parsed = strlen( $html );
+ return true;
}
$this->token_starts_at = $at;
@@ -1342,8 +1562,9 @@ private function parse_next_tag() {
$tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 );
if ( $tag_name_prefix_length > 0 ) {
++$at;
- $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
+ $this->parser_state = self::STATE_MATCHED_TAG;
$this->tag_name_starts_at = $at;
+ $this->tag_name_length = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
$this->bytes_already_parsed = $at + $this->tag_name_length;
return true;
}
@@ -1353,18 +1574,18 @@ private function parse_next_tag() {
* the document. There is nothing left to parse.
*/
if ( $at + 1 >= $doc_length ) {
- $this->parser_state = self::STATE_INCOMPLETE;
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
/*
- *
+ * ``. Unlike other comment
+ * and bogus comment syntax, these leave no clear insertion point for text and
+ * they need to be modified specially in order to contain text. E.g. to store
+ * `?` as the modifiable text, the `` needs to become ``, which
+ * involves inserting an additional `-` into the token after the modifiable text.
+ */
+ $this->parser_state = self::STATE_COMMENT;
+ $this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT;
+ $this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at;
+
+ // Only provide modifiable text if the token is long enough to contain it.
+ if ( $span_of_dashes >= 2 ) {
+ $this->comment_type = self::COMMENT_AS_HTML_COMMENT;
+ $this->text_starts_at = $this->token_starts_at + 4;
+ $this->text_length = $span_of_dashes - 2;
+ }
+
+ $this->bytes_already_parsed = $closer_at + $span_of_dashes + 1;
+ return true;
}
/*
@@ -1397,51 +1637,39 @@ private function parse_next_tag() {
while ( ++$closer_at < $doc_length ) {
$closer_at = strpos( $html, '--', $closer_at );
if ( false === $closer_at ) {
- $this->parser_state = self::STATE_INCOMPLETE;
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) {
- $at = $closer_at + 3;
- continue 2;
+ $this->parser_state = self::STATE_COMMENT;
+ $this->comment_type = self::COMMENT_AS_HTML_COMMENT;
+ $this->token_length = $closer_at + 3 - $this->token_starts_at;
+ $this->text_starts_at = $this->token_starts_at + 4;
+ $this->text_length = $closer_at - $this->text_starts_at;
+ $this->bytes_already_parsed = $closer_at + 3;
+ return true;
}
- if ( $closer_at + 3 < $doc_length && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) {
- $at = $closer_at + 4;
- continue 2;
+ if (
+ $closer_at + 3 < $doc_length &&
+ '!' === $html[ $closer_at + 2 ] &&
+ '>' === $html[ $closer_at + 3 ]
+ ) {
+ $this->parser_state = self::STATE_COMMENT;
+ $this->comment_type = self::COMMENT_AS_HTML_COMMENT;
+ $this->token_length = $closer_at + 4 - $this->token_starts_at;
+ $this->text_starts_at = $this->token_starts_at + 4;
+ $this->text_length = $closer_at - $this->text_starts_at;
+ $this->bytes_already_parsed = $closer_at + 4;
+ return true;
}
}
}
/*
- *
- * The CDATA is case-sensitive.
- * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
- */
- if (
- $doc_length > $at + 8 &&
- '[' === $html[ $at + 2 ] &&
- 'C' === $html[ $at + 3 ] &&
- 'D' === $html[ $at + 4 ] &&
- 'A' === $html[ $at + 5 ] &&
- 'T' === $html[ $at + 6 ] &&
- 'A' === $html[ $at + 7 ] &&
- '[' === $html[ $at + 8 ]
- ) {
- $closer_at = strpos( $html, ']]>', $at + 9 );
- if ( false === $closer_at ) {
- $this->parser_state = self::STATE_INCOMPLETE;
-
- return false;
- }
-
- $at = $closer_at + 3;
- continue;
- }
-
- /*
- *
+ * `
* These are ASCII-case-insensitive.
* https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
*/
@@ -1457,13 +1685,17 @@ private function parse_next_tag() {
) {
$closer_at = strpos( $html, '>', $at + 9 );
if ( false === $closer_at ) {
- $this->parser_state = self::STATE_INCOMPLETE;
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
- $at = $closer_at + 1;
- continue;
+ $this->parser_state = self::STATE_DOCTYPE;
+ $this->token_length = $closer_at + 1 - $this->token_starts_at;
+ $this->text_starts_at = $this->token_starts_at + 9;
+ $this->text_length = $closer_at - $this->text_starts_at;
+ $this->bytes_already_parsed = $closer_at + 1;
+ return true;
}
/*
@@ -1471,14 +1703,53 @@ private function parse_next_tag() {
* to the bogus comment state - skip to the nearest >. If no closer is
* found then the HTML was truncated inside the markup declaration.
*/
- $at = strpos( $html, '>', $at + 1 );
- if ( false === $at ) {
- $this->parser_state = self::STATE_INCOMPLETE;
+ $closer_at = strpos( $html, '>', $at + 1 );
+ if ( false === $closer_at ) {
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
- continue;
+ $this->parser_state = self::STATE_COMMENT;
+ $this->comment_type = self::COMMENT_AS_INVALID_HTML;
+ $this->token_length = $closer_at + 1 - $this->token_starts_at;
+ $this->text_starts_at = $this->token_starts_at + 2;
+ $this->text_length = $closer_at - $this->text_starts_at;
+ $this->bytes_already_parsed = $closer_at + 1;
+
+ /*
+ * Identify nodes that would be CDATA if HTML had CDATA sections.
+ *
+ * This section must occur after identifying the bogus comment end
+ * because in an HTML parser it will span to the nearest `>`, even
+ * if there's no `]]>` as would be required in an XML document. It
+ * is therefore not possible to parse a CDATA section containing
+ * a `>` in the HTML syntax.
+ *
+ * Inside foreign elements there is a discrepancy between browsers
+ * and the specification on this.
+ *
+ * @todo Track whether the Tag Processor is inside a foreign element
+ * and require the proper closing `]]>` in those cases.
+ */
+ if (
+ $this->token_length >= 10 &&
+ '[' === $html[ $this->token_starts_at + 2 ] &&
+ 'C' === $html[ $this->token_starts_at + 3 ] &&
+ 'D' === $html[ $this->token_starts_at + 4 ] &&
+ 'A' === $html[ $this->token_starts_at + 5 ] &&
+ 'T' === $html[ $this->token_starts_at + 6 ] &&
+ 'A' === $html[ $this->token_starts_at + 7 ] &&
+ '[' === $html[ $this->token_starts_at + 8 ] &&
+ ']' === $html[ $closer_at - 1 ]
+ ) {
+ $this->parser_state = self::STATE_COMMENT;
+ $this->comment_type = self::COMMENT_AS_CDATA_LOOKALIKE;
+ $this->text_starts_at += 7;
+ $this->text_length -= 9;
+ }
+
+ return true;
}
/*
@@ -1491,30 +1762,80 @@ private function parse_next_tag() {
* See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
*/
if ( '>' === $html[ $at + 1 ] ) {
- ++$at;
- continue;
+ $this->parser_state = self::STATE_PRESUMPTUOUS_TAG;
+ $this->token_length = $at + 2 - $this->token_starts_at;
+ $this->bytes_already_parsed = $at + 2;
+ return true;
}
/*
- * transitions to a bogus comment state – skip to the nearest >
+ * `` transitions to a bogus comment state – skip to the nearest >
* See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
*/
if ( '?' === $html[ $at + 1 ] ) {
$closer_at = strpos( $html, '>', $at + 2 );
if ( false === $closer_at ) {
- $this->parser_state = self::STATE_INCOMPLETE;
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
- $at = $closer_at + 1;
- continue;
+ $this->parser_state = self::STATE_COMMENT;
+ $this->comment_type = self::COMMENT_AS_INVALID_HTML;
+ $this->token_length = $closer_at + 1 - $this->token_starts_at;
+ $this->text_starts_at = $this->token_starts_at + 2;
+ $this->text_length = $closer_at - $this->text_starts_at;
+ $this->bytes_already_parsed = $closer_at + 1;
+
+ /*
+ * Identify a Processing Instruction node were HTML to have them.
+ *
+ * This section must occur after identifying the bogus comment end
+ * because in an HTML parser it will span to the nearest `>`, even
+ * if there's no `?>` as would be required in an XML document. It
+ * is therefore not possible to parse a Processing Instruction node
+ * containing a `>` in the HTML syntax.
+ *
+ * XML allows for more target names, but this code only identifies
+ * those with ASCII-representable target names. This means that it
+ * may identify some Processing Instruction nodes as bogus comments,
+ * but it will not misinterpret the HTML structure. By limiting the
+ * identification to these target names the Tag Processor can avoid
+ * the need to start parsing UTF-8 sequences.
+ *
+ * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
+ * [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
+ * [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
+ * [#x10000-#xEFFFF]
+ * > NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+ *
+ * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
+ */
+ if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
+ $comment_text = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 );
+ $pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );
+
+ if ( 0 < $pi_target_length ) {
+ $pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );
+
+ $this->comment_type = self::COMMENT_AS_PI_NODE_LOOKALIKE;
+ $this->tag_name_starts_at = $this->token_starts_at + 2;
+ $this->tag_name_length = $pi_target_length;
+ $this->text_starts_at += $pi_target_length;
+ $this->text_length -= $pi_target_length + 1;
+ }
+ }
+
+ return true;
}
/*
* If a non-alpha starts the tag name in a tag closer it's a comment.
* Find the first `>`, which closes the comment.
*
+ * This parser classifies these particular comments as special "funky comments"
+ * which are made available for further processing.
+ *
* See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name
*/
if ( $this->is_closing_tag ) {
@@ -1525,13 +1846,17 @@ private function parse_next_tag() {
$closer_at = strpos( $html, '>', $at + 3 );
if ( false === $closer_at ) {
- $this->parser_state = self::STATE_INCOMPLETE;
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
- $at = $closer_at + 1;
- continue;
+ $this->parser_state = self::STATE_FUNKY_COMMENT;
+ $this->token_length = $closer_at + 1 - $this->token_starts_at;
+ $this->text_starts_at = $this->token_starts_at + 2;
+ $this->text_length = $closer_at - $this->text_starts_at;
+ $this->bytes_already_parsed = $closer_at + 1;
+ return true;
}
++$at;
@@ -1551,7 +1876,7 @@ private function parse_next_attribute() {
// Skip whitespace and slashes.
$this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed );
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
- $this->parser_state = self::STATE_INCOMPLETE;
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
@@ -1575,14 +1900,14 @@ private function parse_next_attribute() {
$attribute_name = substr( $this->html, $attribute_start, $name_length );
$this->bytes_already_parsed += $name_length;
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
- $this->parser_state = self::STATE_INCOMPLETE;
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
$this->skip_whitespace();
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
- $this->parser_state = self::STATE_INCOMPLETE;
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
@@ -1592,7 +1917,7 @@ private function parse_next_attribute() {
++$this->bytes_already_parsed;
$this->skip_whitespace();
if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
- $this->parser_state = self::STATE_INCOMPLETE;
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
@@ -1620,7 +1945,7 @@ private function parse_next_attribute() {
}
if ( $attribute_end >= strlen( $this->html ) ) {
- $this->parser_state = self::STATE_INCOMPLETE;
+ $this->parser_state = self::STATE_INCOMPLETE_INPUT;
return false;
}
@@ -1692,8 +2017,11 @@ private function after_tag() {
$this->token_length = null;
$this->tag_name_starts_at = null;
$this->tag_name_length = null;
+ $this->text_starts_at = 0;
+ $this->text_length = 0;
$this->is_closing_tag = null;
$this->attributes = array();
+ $this->comment_type = null;
$this->duplicate_attributes = null;
}
@@ -1985,7 +2313,7 @@ public function seek( $bookmark_name ) {
// Point this tag processor before the sought tag opener and consume it.
$this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start;
- return $this->next_tag( array( 'tag_closers' => 'visit' ) );
+ return $this->next_token();
}
/**
@@ -2216,13 +2544,24 @@ public function get_attribute_names_with_prefix( $prefix ) {
* @return string|null Name of currently matched tag in input HTML, or `null` if none found.
*/
public function get_tag() {
- if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
+ if ( null === $this->tag_name_starts_at ) {
return null;
}
$tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );
- return strtoupper( $tag_name );
+ if ( self::STATE_MATCHED_TAG === $this->parser_state ) {
+ return strtoupper( $tag_name );
+ }
+
+ if (
+ self::STATE_COMMENT === $this->parser_state &&
+ self::COMMENT_AS_PI_NODE_LOOKALIKE === $this->get_comment_type()
+ ) {
+ return $tag_name;
+ }
+
+ return null;
}
/**
@@ -2281,6 +2620,191 @@ public function is_tag_closer() {
);
}
+ /**
+ * Indicates the kind of matched token, if any.
+ *
+ * This differs from `get_token_name()` in that it always
+ * returns a static string indicating the type, whereas
+ * `get_token_name()` may return values derived from the
+ * token itself, such as a tag name or processing
+ * instruction tag.
+ *
+ * Possible values:
+ * - `#tag` when matched on a tag.
+ * - `#text` when matched on a text node.
+ * - `#cdata-section` when matched on a CDATA node.
+ * - `#comment` when matched on a comment.
+ * - `#doctype` when matched on a DOCTYPE declaration.
+ * - `#presumptuous-tag` when matched on an empty tag closer.
+ * - `#funky-comment` when matched on a funky comment.
+ *
+ * @since 6.5.0
+ *
+ * @return string|null What kind of token is matched, or null.
+ */
+ public function get_token_type() {
+ switch ( $this->parser_state ) {
+ case self::STATE_MATCHED_TAG:
+ return '#tag';
+
+ case self::STATE_DOCTYPE:
+ return '#doctype';
+
+ default:
+ return $this->get_token_name();
+ }
+ }
+
+ /**
+ * Returns the node name represented by the token.
+ *
+ * This matches the DOM API value `nodeName`. Some values
+ * are static, such as `#text` for a text node, while others
+ * are dynamically generated from the token itself.
+ *
+ * Dynamic names:
+ * - Uppercase tag name for tag matches.
+ * - `html` for DOCTYPE declarations.
+ *
+ * Note that if the Tag Processor is not matched on a token
+ * then this function will return `null`, either because it
+ * hasn't yet found a token or because it reached the end
+ * of the document without matching a token.
+ *
+ * @since 6.5.0
+ *
+ * @return string|null Name of the matched token.
+ */
+ public function get_token_name() {
+ switch ( $this->parser_state ) {
+ case self::STATE_MATCHED_TAG:
+ return $this->get_tag();
+
+ case self::STATE_TEXT_NODE:
+ return '#text';
+
+ case self::STATE_CDATA_NODE:
+ return '#cdata-section';
+
+ case self::STATE_COMMENT:
+ return '#comment';
+
+ case self::STATE_DOCTYPE:
+ return 'html';
+
+ case self::STATE_PRESUMPTUOUS_TAG:
+ return '#presumptuous-tag';
+
+ case self::STATE_FUNKY_COMMENT:
+ return '#funky-comment';
+ }
+ }
+
+ /**
+ * Indicates what kind of comment produced the comment node.
+ *
+ * Because there are different kinds of HTML syntax which produce
+ * comments, the Tag Processor tracks and exposes this as a type
+ * for the comment. Nominally only regular HTML comments exist as
+ * they are commonly known, but a number of unrelated syntax errors
+ * also produce comments.
+ *
+ * @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT
+ * @see self::COMMENT_AS_CDATA_LOOKALIKE
+ * @see self::COMMENT_AS_INVALID_HTML
+ * @see self::COMMENT_AS_HTML_COMMENT
+ * @see self::COMMENT_AS_PI_NODE_LOOKALIKE
+ *
+ * @since 6.5.0
+ *
+ * @return string|null
+ */
+ public function get_comment_type() {
+ if ( self::STATE_COMMENT !== $this->parser_state ) {
+ return null;
+ }
+
+ return $this->comment_type;
+ }
+
+ /**
+ * Returns the modifiable text for a matched token, or an empty string.
+ *
+ * Modifiable text is text content that may be read and changed without
+ * changing the HTML structure of the document around it. This includes
+ * the contents of `#text` nodes in the HTML as well as the inner
+ * contents of HTML comments, Processing Instructions, and others, even
+ * though these nodes aren't part of a parsed DOM tree. They also contain
+ * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
+ * other section in an HTML document which cannot contain HTML markup (DATA).
+ *
+ * If a token has no modifiable text then an empty string is returned to
+ * avoid needless crashing or type errors. An empty string does not mean
+ * that a token has modifiable text, and a token with modifiable text may
+ * have an empty string (e.g. a comment with no contents).
+ *
+ * @since 6.5.0
+ *
+ * @return string
+ */
+ public function get_modifiable_text() {
+ if ( null === $this->text_starts_at ) {
+ return '';
+ }
+
+ $text = substr( $this->html, $this->text_starts_at, $this->text_length );
+
+ // Comment data is not decoded.
+ if (
+ self::STATE_CDATA_NODE === $this->parser_state ||
+ self::STATE_COMMENT === $this->parser_state ||
+ self::STATE_DOCTYPE === $this->parser_state ||
+ self::STATE_FUNKY_COMMENT === $this->parser_state
+ ) {
+ return $text;
+ }
+
+ $tag_name = $this->get_tag();
+ if (
+ // Script data is not decoded.
+ 'SCRIPT' === $tag_name ||
+
+ // RAWTEXT data is not decoded.
+ 'IFRAME' === $tag_name ||
+ 'NOEMBED' === $tag_name ||
+ 'NOFRAMES' === $tag_name ||
+ 'STYLE' === $tag_name ||
+ 'XMP' === $tag_name
+ ) {
+ return $text;
+ }
+
+ $decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
+
+ if ( empty( $decoded ) ) {
+ return '';
+ }
+
+ /*
+ * TEXTAREA skips a leading newline, but this newline may appear not only as the
+ * literal character `\n`, but also as a character reference, such as in the
+ * following markup: ``.
+ *
+ * For these cases it's important to first decode the text content before checking
+ * for a leading newline and removing it.
+ */
+ if (
+ self::STATE_MATCHED_TAG === $this->parser_state &&
+ 'TEXTAREA' === $tag_name &&
+ strlen( $decoded ) > 0 &&
+ "\n" === $decoded[0]
+ ) {
+ return substr( $decoded, 1 );
+ }
+
+ return $decoded;
+ }
+
/**
* Updates or creates a new attribute on the currently matched tag with the passed value.
*
@@ -2746,7 +3270,7 @@ private function matches() {
}
/**
- * Parser Ready State
+ * Parser Ready State.
*
* Indicates that the parser is ready to run and waiting for a state transition.
* It may not have started yet, or it may have just finished parsing a token and
@@ -2759,7 +3283,7 @@ private function matches() {
const STATE_READY = 'STATE_READY';
/**
- * Parser Complete State
+ * Parser Complete State.
*
* Indicates that the parser has reached the end of the document and there is
* nothing left to scan. It finished parsing the last token completely.
@@ -2771,7 +3295,7 @@ private function matches() {
const STATE_COMPLETE = 'STATE_COMPLETE';
/**
- * Parser Incomplete State
+ * Parser Incomplete Input State.
*
* Indicates that the parser has reached the end of the document before finishing
* a token. It started parsing a token but there is a possibility that the input
@@ -2784,10 +3308,10 @@ private function matches() {
*
* @access private
*/
- const STATE_INCOMPLETE = 'STATE_INCOMPLETE';
+ const STATE_INCOMPLETE_INPUT = 'STATE_INCOMPLETE_INPUT';
/**
- * Parser Matched Tag State
+ * Parser Matched Tag State.
*
* Indicates that the parser has found an HTML tag and it's possible to get
* the tag name and read or modify its attributes (if it's not a closing tag).
@@ -2797,4 +3321,153 @@ private function matches() {
* @access private
*/
const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG';
+
+ /**
+ * Parser Text Node State.
+ *
+ * Indicates that the parser has found a text node and it's possible
+ * to read and modify that text.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_TEXT_NODE = 'STATE_TEXT_NODE';
+
+ /**
+ * Parser CDATA Node State.
+ *
+ * Indicates that the parser has found a CDATA node and it's possible
+ * to read and modify its modifiable text. Note that in HTML there are
+ * no CDATA nodes outside of foreign content (SVG and MathML). Outside
+ * of foreign content, they are treated as HTML comments.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_CDATA_NODE = 'STATE_CDATA_NODE';
+
+ /**
+ * Indicates that the parser has found an HTML comment and it's
+ * possible to read and modify its modifiable text.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_COMMENT = 'STATE_COMMENT';
+
+ /**
+ * Indicates that the parser has found a DOCTYPE node and it's
+ * possible to read and modify its modifiable text.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_DOCTYPE = 'STATE_DOCTYPE';
+
+ /**
+ * Indicates that the parser has found an empty tag closer `>`.
+ *
+ * Note that in HTML there are no empty tag closers, and they
+ * are ignored. Nonetheless, the Tag Processor still
+ * recognizes them as they appear in the HTML stream.
+ *
+ * These were historically discussed as a "presumptuous tag
+ * closer," which would close the nearest open tag, but were
+ * dismissed in favor of explicitly-closing tags.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_PRESUMPTUOUS_TAG = 'STATE_PRESUMPTUOUS_TAG';
+
+ /**
+ * Indicates that the parser has found a "funky comment"
+ * and it's possible to read and modify its modifiable text.
+ *
+ * Example:
+ *
+ * %url>
+ * {"wp-bit":"query/post-author"}>
+ * 2>
+ *
+ * Funky comments are tag closers with invalid tag names. Note
+ * that in HTML these are turn into bogus comments. Nonetheless,
+ * the Tag Processor recognizes them in a stream of HTML and
+ * exposes them for inspection and modification.
+ *
+ * @since 6.5.0
+ *
+ * @access private
+ */
+ const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY';
+
+ /**
+ * Indicates that a comment was created when encountering abruptly-closed HTML comment.
+ *
+ * Example:
+ *
+ *
+ *
+ *
+ * @since 6.5.0
+ */
+ const COMMENT_AS_ABRUPTLY_CLOSED_COMMENT = 'COMMENT_AS_ABRUPTLY_CLOSED_COMMENT';
+
+ /**
+ * Indicates that a comment would be parsed as a CDATA node,
+ * were HTML to allow CDATA nodes outside of foreign content.
+ *
+ * Example:
+ *
+ *
+ *
+ * This is an HTML comment, but it looks like a CDATA node.
+ *
+ * @since 6.5.0
+ */
+ const COMMENT_AS_CDATA_LOOKALIKE = 'COMMENT_AS_CDATA_LOOKALIKE';
+
+ /**
+ * Indicates that a comment was created when encountering
+ * normative HTML comment syntax.
+ *
+ * Example:
+ *
+ *
+ *
+ * @since 6.5.0
+ */
+ const COMMENT_AS_HTML_COMMENT = 'COMMENT_AS_HTML_COMMENT';
+
+ /**
+ * Indicates that a comment would be parsed as a Processing
+ * Instruction node, were they to exist within HTML.
+ *
+ * Example:
+ *
+ *
+ *
+ * This is an HTML comment, but it looks like a CDATA node.
+ *
+ * @since 6.5.0
+ */
+ const COMMENT_AS_PI_NODE_LOOKALIKE = 'COMMENT_AS_PI_NODE_LOOKALIKE';
+
+ /**
+ * Indicates that a comment was created when encountering invalid
+ * HTML input, a so-called "bogus comment."
+ *
+ * Example:
+ *
+ *
+ *
+ *
+ * @since 6.5.0
+ */
+ const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';
}