diff --git a/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-processor-6-5.php b/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-processor-6-5.php
index eae2b1eda815ca..7e83a54e708093 100644
--- a/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-processor-6-5.php
+++ b/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-processor-6-5.php
@@ -101,18 +101,18 @@
  *
  *  - Containers: ADDRESS, BLOCKQUOTE, DETAILS, DIALOG, DIV, FOOTER, HEADER, MAIN, MENU, SPAN, SUMMARY.
  *  - Custom elements: All custom elements are supported. :)
- *  - Form elements: BUTTON, DATALIST, FIELDSET, LABEL, LEGEND, METER, PROGRESS, SEARCH.
- *  - Formatting elements: B, BIG, CODE, EM, FONT, I, SMALL, STRIKE, STRONG, TT, U.
+ *  - Form elements: BUTTON, DATALIST, FIELDSET, INPUT, LABEL, LEGEND, METER, PROGRESS, SEARCH.
+ *  - Formatting elements: B, BIG, CODE, EM, FONT, I, PRE, SMALL, STRIKE, STRONG, TT, U, WBR.
  *  - Heading elements: H1, H2, H3, H4, H5, H6, HGROUP.
  *  - Links: A.
- *  - Lists: DD, DL, DT, LI, OL, LI.
- *  - Media elements: AUDIO, CANVAS, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, VIDEO.
- *  - Paragraph: P.
- *  - Phrasing elements: ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR.
- *  - Sectioning elements: ARTICLE, ASIDE, NAV, SECTION.
+ *  - Lists: DD, DL, DT, LI, OL, UL.
+ *  - Media elements: AUDIO, CANVAS, EMBED, FIGCAPTION, FIGURE, IMG, MAP, PICTURE, SOURCE, TRACK, VIDEO.
+ *  - Paragraph: BR, P.
+ *  - Phrasing elements: ABBR, AREA, BDI, BDO, CITE, DATA, DEL, DFN, INS, MARK, OUTPUT, Q, SAMP, SUB, SUP, TIME, VAR.
+ *  - Sectioning elements: ARTICLE, ASIDE, HR, NAV, SECTION.
  *  - Templating elements: SLOT.
  *  - Text decoration: RUBY.
- *  - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, MULTICOL, NEXTID, SPACER.
+ *  - Deprecated elements: ACRONYM, BLINK, CENTER, DIR, ISINDEX, KEYGEN, LISTING, MULTICOL, NEXTID, PARAM, SPACER.
  *
  * ### Supported markup
  *
@@ -149,17 +149,6 @@ class Gutenberg_HTML_Processor_6_5 extends Gutenberg_HTML_Tag_Processor_6_5 {
 	 */
 	const MAX_BOOKMARKS = 100;
 
-	/**
-	 * Static query for instructing the Tag Processor to visit every token.
-	 *
-	 * @access private
-	 *
-	 * @since 6.4.0
-	 *
-	 * @var array
-	 */
-	const VISIT_EVERYTHING = array( 'tag_closers' => 'visit' );
-
 	/**
 	 * Holds the working state of the parser, including the stack of
 	 * open elements and the stack of active formatting elements.
@@ -424,6 +413,30 @@ public function next_tag( $query = null ) {
 		return false;
 	}
 
+	/**
+	 * Advances to the next token and keeps the HTML semantic-rule accounting
+	 * in sync by stepping the HTML Processor whenever that token is a tag.
+	 *
+	 * This doesn't currently have a way to represent non-tags and doesn't process
+	 * semantic rules for text nodes. For access to the raw tokens consider using
+	 * WP_HTML_Tag_Processor instead.
+	 *
+	 * @since 6.5.0 Added for internal support; do not use.
+	 *
+	 * @access private
+	 *
+	 * @return bool Whether the underlying Tag Processor found a token.
+	 */
+	public function next_token() {
+		$found_a_token = parent::next_token();
+
+		if ( '#tag' === $this->get_token_type() ) {
+			$this->step( self::REPROCESS_CURRENT_NODE );
+		}
+
+		return $found_a_token;
+	}
+
 	/**
 	 * Indicates if the currently-matched tag matches the given breadcrumbs.
 	 *
@@ -520,7 +533,9 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ) {
 				$this->state->stack_of_open_elements->pop();
 			}
 
-			parent::next_tag( self::VISIT_EVERYTHING );
+			while ( parent::next_token() && '#tag' !== $this->get_token_type() ) {
+				continue;
+			}
 		}
 
 		// Finish stepping when there are no more tokens in the document.
@@ -684,10 +699,12 @@ private function step_in_body() {
 			case '-FOOTER':
 			case '-HEADER':
 			case '-HGROUP':
+			case '-LISTING':
 			case '-MAIN':
 			case '-MENU':
 			case '-NAV':
 			case '-OL':
+			case '-PRE':
 			case '-SEARCH':
 			case '-SECTION':
 			case '-SUMMARY':
@@ -732,6 +749,18 @@ private function step_in_body() {
 				$this->insert_html_element( $this->state->current_token );
 				return true;
 
+			/*
+			 * > A start tag whose tag name is one of: "pre", "listing"
+			 */
+			case '+PRE':
+			case '+LISTING':
+				if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+					$this->close_a_p_element();
+				}
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->frameset_ok = false;
+				return true;
+
 			/*
 			 * > An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", "h6"
 			 */
@@ -934,11 +963,64 @@ private function step_in_body() {
 				$this->run_adoption_agency_algorithm();
 				return true;
 
+			/*
+			 * > An end tag whose tag name is "br"
+			 * >   Parse error. Drop the attributes from the token, and act as described in the next
+			 * >   entry; i.e. act as if this was a "br" start tag token with no attributes, rather
+			 * >   than the end tag token that it actually is.
+			 */
+			case '-BR':
+				$this->last_error = self::ERROR_UNSUPPORTED;
+				throw new WP_HTML_Unsupported_Exception( 'Closing BR tags require unimplemented special handling.' );
+
 			/*
 			 * > A start tag whose tag name is one of: "area", "br", "embed", "img", "keygen", "wbr"
 			 */
+			case '+AREA':
+			case '+BR':
+			case '+EMBED':
 			case '+IMG':
+			case '+KEYGEN':
+			case '+WBR':
 				$this->reconstruct_active_formatting_elements();
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->frameset_ok = false;
+				return true;
+
+			/*
+			 * > A start tag whose tag name is "input"
+			 */
+			case '+INPUT':
+				$this->reconstruct_active_formatting_elements();
+				$this->insert_html_element( $this->state->current_token );
+				$type_attribute = $this->get_attribute( 'type' );
+				/*
+				 * > If the token does not have an attribute with the name "type", or if it does,
+				 * > but that attribute's value is not an ASCII case-insensitive match for the
+				 * > string "hidden", then: set the frameset-ok flag to "not ok".
+				 */
+				if ( ! is_string( $type_attribute ) || 'hidden' !== strtolower( $type_attribute ) ) {
+					$this->state->frameset_ok = false;
+				}
+				return true;
+
+			/*
+			 * > A start tag whose tag name is "hr"
+			 */
+			case '+HR':
+				if ( $this->state->stack_of_open_elements->has_p_in_button_scope() ) {
+					$this->close_a_p_element();
+				}
+				$this->insert_html_element( $this->state->current_token );
+				$this->state->frameset_ok = false;
+				return true;
+
+			/*
+			 * > A start tag whose tag name is one of: "param", "source", "track"
+			 */
+			case '+PARAM':
+			case '+SOURCE':
+			case '+TRACK':
 				$this->insert_html_element( $this->state->current_token );
 				return true;
 		}
@@ -961,30 +1043,20 @@ private function step_in_body() {
 		 */
 		switch ( $tag_name ) {
 			case 'APPLET':
-			case 'AREA':
 			case 'BASE':
 			case 'BASEFONT':
 			case 'BGSOUND':
 			case 'BODY':
-			case 'BR':
 			case 'CAPTION':
 			case 'COL':
 			case 'COLGROUP':
-			case 'DD':
-			case 'DT':
-			case 'EMBED':
 			case 'FORM':
 			case 'FRAME':
 			case 'FRAMESET':
 			case 'HEAD':
-			case 'HR':
 			case 'HTML':
 			case 'IFRAME':
-			case 'INPUT':
-			case 'KEYGEN':
-			case 'LI':
 			case 'LINK':
-			case 'LISTING':
 			case 'MARQUEE':
 			case 'MATH':
 			case 'META':
@@ -993,12 +1065,9 @@ private function step_in_body() {
 			case 'NOFRAMES':
 			case 'NOSCRIPT':
 			case 'OBJECT':
-			case 'OL':
 			case 'OPTGROUP':
 			case 'OPTION':
-			case 'PARAM':
 			case 'PLAINTEXT':
-			case 'PRE':
 			case 'RB':
 			case 'RP':
 			case 'RT':
@@ -1006,7 +1075,6 @@ private function step_in_body() {
 			case 'SARCASM':
 			case 'SCRIPT':
 			case 'SELECT':
-			case 'SOURCE':
 			case 'STYLE':
 			case 'SVG':
 			case 'TABLE':
@@ -1019,9 +1087,6 @@ private function step_in_body() {
 			case 'THEAD':
 			case 'TITLE':
 			case 'TR':
-			case 'TRACK':
-			case 'UL':
-			case 'WBR':
 			case 'XMP':
 				$this->last_error = self::ERROR_UNSUPPORTED;
 				throw new WP_HTML_Unsupported_Exception( "Cannot process {$tag_name} element." );
@@ -1675,14 +1740,19 @@ public static function is_void( $tag_name ) {
 		return (
 			'AREA' === $tag_name ||
 			'BASE' === $tag_name ||
+			'BASEFONT' === $tag_name || // Obsolete but still treated as void.
+			'BGSOUND' === $tag_name || // Obsolete but still treated as void.
 			'BR' === $tag_name ||
 			'COL' === $tag_name ||
 			'EMBED' === $tag_name ||
+			'FRAME' === $tag_name ||
 			'HR' === $tag_name ||
 			'IMG' === $tag_name ||
 			'INPUT' === $tag_name ||
+			'KEYGEN' === $tag_name || // Obsolete but still treated as void.
 			'LINK' === $tag_name ||
 			'META' === $tag_name ||
+			'PARAM' === $tag_name || // Obsolete but still treated as void.
 			'SOURCE' === $tag_name ||
 			'TRACK' === $tag_name ||
 			'WBR' === $tag_name
diff --git a/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-tag-processor-6-5.php b/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-tag-processor-6-5.php
index de3823d2b2703b..1d2430cd455784 100644
--- a/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-tag-processor-6-5.php
+++ b/lib/compat/wordpress-6.5/html-api/class-gutenberg-html-tag-processor-6-5.php
@@ -247,6 +247,95 @@
  *         }
  *     }
  *
+ * ## Tokens and finer-grained processing.
+ *
+ * It's possible to scan through every lexical token in the
+ * HTML document using the `next_token()` function. This
+ * alternative form takes no argument and provides no built-in
+ * query syntax.
+ *
+ * Example:
+ *
+ *      $title = '(untitled)';
+ *      $text  = '';
+ *      while ( $processor->next_token() ) {
+ *          switch ( $processor->get_token_name() ) {
+ *              case '#text':
+ *                  $text .= $processor->get_modifiable_text();
+ *                  break;
+ *
+ *              case 'BR':
+ *                  $text .= "\n";
+ *                  break;
+ *
+ *              case 'TITLE':
+ *                  $title = $processor->get_modifiable_text();
+ *                  break;
+ *          }
+ *      }
+ *      return trim( "# {$title}\n\n{$text}" );
+ *
+ * ### Tokens and _modifiable text_.
+ *
+ * #### Special "atomic" HTML elements.
+ *
+ * Not all HTML elements are able to contain other elements inside of them.
+ * For instance, the contents inside a TITLE element are plaintext (except
+ * that character references like &amp; will be decoded). This means that
+ * if the string `<img>` appears inside a TITLE element, then it's not an
+ * image tag, but rather it's text describing an image tag. Likewise, the
+ * contents of a SCRIPT or STYLE element are handled entirely separately in
+ * a browser than the contents of other elements because they represent a
+ * different language than HTML.
+ *
+ * For these elements the Tag Processor treats the entire sequence as one,
+ * from the opening tag, including its contents, through its closing tag.
+ * This means that it's not possible to match the closing tag for a
+ * SCRIPT element unless it's unexpected; the Tag Processor already matched
+ * it when it found the opening tag.
+ *
+ * The inner contents of these elements are that element's _modifiable text_.
+ *
+ * The special elements are:
+ *  - `SCRIPT` whose contents are treated as raw plaintext but supports a legacy
+ *    style of including Javascript inside of HTML comments to avoid accidentally
+ *    closing the SCRIPT from inside a Javascript string. E.g. `console.log( '</script>' )`.
+ *  - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any
+ *    character references are decoded. E.g. `1 &lt; 2 < 3` becomes `1 < 2 < 3`.
+ *  - `IFRAME`, `NOEMBED`, `NOFRAMES`, `STYLE`, `XMP` whose contents are treated as
+ *    raw plaintext and left as-is. E.g. `1 &lt; 2 < 3` remains `1 &lt; 2 < 3`.
+ *
+ * #### Other tokens with modifiable text.
+ *
+ * There are also non-elements which are void/self-closing in nature and contain
+ * modifiable text that is part of that individual syntax token itself.
+ *
+ *  - `#text` nodes, whose entire token _is_ the modifiable text.
+ *  - HTML comments and tokens that become comments due to some syntax error. The
+ *    text for these tokens is the portion of the comment inside of the syntax.
+ *    E.g. for `<!-- comment -->` the text is `" comment "` (note the spaces are included).
+ *  - `CDATA` sections, whose text is the content inside of the section itself. E.g. for
+ *    `<![CDATA[some content]]>` the text is `"some content"` (with restrictions [1]).
+ *  - "Funky comments," which are a special case of invalid closing tags whose name is
+ *    invalid. The text for these nodes is the text that a browser would transform into
+ *    an HTML comment when parsing. E.g. for `</%post_author>` the text is `%post_author`.
+ *  - `DOCTYPE` declarations like `<!DOCTYPE html>` which have no closing tag.
+ *  - XML Processing instruction nodes like `<?wp __( "Like" ); ?>` (with restrictions [2]).
+ *  - The empty end tag `</>` which is ignored in the browser and DOM.
+ *
+ * [1]: There are no CDATA sections in HTML. When encountering `<![CDATA[`, everything
+ *      until the next `>` becomes a bogus HTML comment, meaning there can be no CDATA
+ *      section in an HTML document containing `>`. The Tag Processor will first find
+ *      all valid and bogus HTML comments, and then if the comment _would_ have been a
+ *      CDATA section _were it to exist_, it will indicate this as the type of comment.
+ *
+ * [2]: XML allows a broader range of characters in a processing instruction's target name
+ *      and disallows "xml" as a name, since it's special. The Tag Processor only recognizes
+ *      target names with an ASCII-representable subset of characters. It also exhibits the
+ *      same constraint as with CDATA sections, in that `>` cannot exist within the token
+ *      since Processing Instructions do not exist within HTML and their syntax transforms
+ *      into a bogus comment in the DOM.
+ *
  * ## Design and limitations
  *
  * The Tag Processor is designed to linearly scan HTML documents and tokenize
@@ -320,7 +409,8 @@
  * @since 6.2.1 Fix: Support for various invalid comments; attribute updates are case-insensitive.
  * @since 6.3.2 Fix: Skip HTML-like content inside rawtext elements such as STYLE.
  * @since 6.5.0 Pauses processor when input ends in an incomplete syntax token.
- *              Introduces "special" elements which act like void elements, e.g. STYLE.
+ *              Introduces "special" elements which act like void elements, e.g. TITLE, STYLE.
+ *              Allows scanning through all tokens and processing modifiable text, where applicable.
  */
 class Gutenberg_HTML_Tag_Processor_6_5 {
 	/**
@@ -396,23 +486,47 @@ class Gutenberg_HTML_Tag_Processor_6_5 {
 	/**
 	 * Specifies mode of operation of the parser at any given time.
 	 *
-	 * | State         | Meaning                                                              |
-	 * | --------------|----------------------------------------------------------------------|
-	 * | *Ready*       | The parser is ready to run.                                          |
-	 * | *Complete*    | There is nothing left to parse.                                      |
-	 * | *Incomplete*  | The HTML ended in the middle of a token; nothing more can be parsed. |
-	 * | *Matched tag* | Found an HTML tag; it's possible to modify its attributes.           |
+	 * | State           | Meaning                                                              |
+	 * | ----------------|----------------------------------------------------------------------|
+	 * | *Ready*         | The parser is ready to run.                                          |
+	 * | *Complete*      | There is nothing left to parse.                                      |
+	 * | *Incomplete*    | The HTML ended in the middle of a token; nothing more can be parsed. |
+	 * | *Matched tag*   | Found an HTML tag; it's possible to modify its attributes.           |
+	 * | *Text node*     | Found a #text node; this is plaintext and modifiable.                |
+	 * | *CDATA node*    | Found a CDATA section; this is modifiable.                           |
+	 * | *Comment*       | Found a comment or bogus comment; this is modifiable.                |
+	 * | *Presumptuous*  | Found an empty tag closer: `</>`.                                    |
+	 * | *Funky comment* | Found a tag closer with an invalid tag name; this is modifiable.     |
 	 *
 	 * @since 6.5.0
 	 *
 	 * @see WP_HTML_Tag_Processor::STATE_READY
 	 * @see WP_HTML_Tag_Processor::STATE_COMPLETE
-	 * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE
+	 * @see WP_HTML_Tag_Processor::STATE_INCOMPLETE_INPUT
 	 * @see WP_HTML_Tag_Processor::STATE_MATCHED_TAG
+	 * @see WP_HTML_Tag_Processor::STATE_TEXT_NODE
+	 * @see WP_HTML_Tag_Processor::STATE_CDATA_NODE
+	 * @see WP_HTML_Tag_Processor::STATE_COMMENT
+	 * @see WP_HTML_Tag_Processor::STATE_DOCTYPE
+	 * @see WP_HTML_Tag_Processor::STATE_PRESUMPTUOUS_TAG
+	 * @see WP_HTML_Tag_Processor::STATE_FUNKY_COMMENT
 	 *
 	 * @var string
 	 */
-	private $parser_state = self::STATE_READY;
+	protected $parser_state = self::STATE_READY;
+
+	/**
+	 * What kind of syntax token became an HTML comment.
+	 *
+	 * Since there are many ways in which HTML syntax can create an HTML comment,
+	 * this indicates which of those caused it. This allows the Tag Processor to
+	 * represent more from the original input document than would appear in the DOM.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @var string|null
+	 */
+	protected $comment_type = null;
 
 	/**
 	 * How many bytes from the original HTML document have been read and parsed.
@@ -490,6 +604,24 @@ class Gutenberg_HTML_Tag_Processor_6_5 {
 	 */
 	private $tag_name_length;
 
+	/**
+	 * Byte offset into input document where current modifiable text starts.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @var int
+	 */
+	private $text_starts_at;
+
+	/**
+	 * Byte length of modifiable text.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @var int
+	 */
+	private $text_length;
+
 	/**
 	 * Whether the current tag is an opening tag, e.g. <div>, or a closing tag, e.g. </div>.
 	 *
@@ -705,13 +837,13 @@ public function next_tag( $query = null ) {
 	 * @return bool Whether a token was parsed.
 	 */
 	public function next_token() {
-		$this->get_updated_html();
 		$was_at = $this->bytes_already_parsed;
+		$this->get_updated_html();
 
 		// Don't proceed if there's nothing more to scan.
 		if (
 			self::STATE_COMPLETE === $this->parser_state ||
-			self::STATE_INCOMPLETE === $this->parser_state
+			self::STATE_INCOMPLETE_INPUT === $this->parser_state
 		) {
 			return false;
 		}
@@ -729,13 +861,27 @@ public function next_token() {
 
 		// Find the next tag if it exists.
 		if ( false === $this->parse_next_tag() ) {
-			if ( self::STATE_INCOMPLETE === $this->parser_state ) {
+			if ( self::STATE_INCOMPLETE_INPUT === $this->parser_state ) {
 				$this->bytes_already_parsed = $was_at;
 			}
 
 			return false;
 		}
 
+		/*
+		 * For legacy reasons the rest of this function handles tags and their
+		 * attributes. If the processor has reached the end of the document
+		 * or if it matched any other token then it should return here to avoid
+		 * attempting to process tag-specific syntax.
+		 */
+		if (
+			self::STATE_INCOMPLETE_INPUT !== $this->parser_state &&
+			self::STATE_COMPLETE !== $this->parser_state &&
+			self::STATE_MATCHED_TAG !== $this->parser_state
+		) {
+			return true;
+		}
+
 		// Parse all of its attributes.
 		while ( $this->parse_next_attribute() ) {
 			continue;
@@ -743,11 +889,11 @@ public function next_token() {
 
 		// Ensure that the tag closes before the end of the document.
 		if (
-			self::STATE_INCOMPLETE === $this->parser_state ||
+			self::STATE_INCOMPLETE_INPUT === $this->parser_state ||
 			$this->bytes_already_parsed >= strlen( $this->html )
 		) {
 			// Does this appropriately clear state (parsed attributes)?
-			$this->parser_state         = self::STATE_INCOMPLETE;
+			$this->parser_state         = self::STATE_INCOMPLETE_INPUT;
 			$this->bytes_already_parsed = $was_at;
 
 			return false;
@@ -755,14 +901,14 @@ public function next_token() {
 
 		$tag_ends_at = strpos( $this->html, '>', $this->bytes_already_parsed );
 		if ( false === $tag_ends_at ) {
-			$this->parser_state         = self::STATE_INCOMPLETE;
+			$this->parser_state         = self::STATE_INCOMPLETE_INPUT;
 			$this->bytes_already_parsed = $was_at;
 
 			return false;
 		}
 		$this->parser_state         = self::STATE_MATCHED_TAG;
 		$this->token_length         = $tag_ends_at - $this->token_starts_at;
-		$this->bytes_already_parsed = $tag_ends_at;
+		$this->bytes_already_parsed = $tag_ends_at + 1;
 
 		/*
 		 * For non-DATA sections which might contain text that looks like HTML tags but
@@ -771,8 +917,8 @@ public function next_token() {
 		 */
 		$t = $this->html[ $this->tag_name_starts_at ];
 		if (
-			! $this->is_closing_tag &&
-			(
+			$this->is_closing_tag ||
+			! (
 				'i' === $t || 'I' === $t ||
 				'n' === $t || 'N' === $t ||
 				's' === $t || 'S' === $t ||
@@ -780,38 +926,81 @@ public function next_token() {
 				'x' === $t || 'X' === $t
 			)
 		) {
-			$tag_name = $this->get_tag();
+			return true;
+		}
 
-			if ( 'SCRIPT' === $tag_name && ! $this->skip_script_data() ) {
-				$this->parser_state         = self::STATE_INCOMPLETE;
-				$this->bytes_already_parsed = $was_at;
+		$tag_name = $this->get_tag();
 
-				return false;
-			} elseif (
-				( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) &&
-				! $this->skip_rcdata( $tag_name )
-			) {
-				$this->parser_state         = self::STATE_INCOMPLETE;
-				$this->bytes_already_parsed = $was_at;
+		/*
+		 * Preserve the opening tag pointers, as these will be overwritten
+		 * when finding the closing tag. They will be reset after finding
+		 * the closing tag to point to the opening of the special atomic
+		 * tag sequence.
+		 */
+		$tag_name_starts_at   = $this->tag_name_starts_at;
+		$tag_name_length      = $this->tag_name_length;
+		$tag_ends_at          = $this->token_starts_at + $this->token_length;
+		$attributes           = $this->attributes;
+		$duplicate_attributes = $this->duplicate_attributes;
+
+		// Find the closing tag if necessary.
+		$found_closer = false;
+		switch ( $tag_name ) {
+			case 'SCRIPT':
+				$found_closer = $this->skip_script_data();
+				break;
 
-				return false;
-			} elseif (
-				(
-					'IFRAME' === $tag_name ||
-					'NOEMBED' === $tag_name ||
-					'NOFRAMES' === $tag_name ||
-					'STYLE' === $tag_name ||
-					'XMP' === $tag_name
-				) &&
-				! $this->skip_rawtext( $tag_name )
-			) {
-				$this->parser_state         = self::STATE_INCOMPLETE;
-				$this->bytes_already_parsed = $was_at;
+			case 'TEXTAREA':
+			case 'TITLE':
+				$found_closer = $this->skip_rcdata( $tag_name );
+				break;
 
-				return false;
-			}
+			/*
+			 * In the browser this list would include the NOSCRIPT element,
+			 * but the Tag Processor is an environment with the scripting
+			 * flag disabled, meaning that it needs to descend into the
+			 * NOSCRIPT element to be able to properly process what will be
+			 * sent to a browser.
+			 *
+			 * Note that this rule makes HTML5 syntax incompatible with XML,
+			 * because the parsing of this token depends on client application.
+			 * The NOSCRIPT element cannot be represented in the XHTML syntax.
+			 */
+			case 'IFRAME':
+			case 'NOEMBED':
+			case 'NOFRAMES':
+			case 'STYLE':
+			case 'XMP':
+				$found_closer = $this->skip_rawtext( $tag_name );
+				break;
+
+			// No other tags should be treated in their entirety here.
+			default:
+				return true;
 		}
 
+		if ( ! $found_closer ) {
+			$this->parser_state         = self::STATE_INCOMPLETE_INPUT;
+			$this->bytes_already_parsed = $was_at;
+			return false;
+		}
+
+		/*
+		 * The values here look like they reference the opening tag but they reference
+		 * the closing tag instead. This is why the opening tag values were stored
+		 * above in a variable. It reads confusingly here, but that's because the
+		 * functions that skip the contents have moved all the internal cursors past
+		 * the inner content of the tag.
+		 */
+		$this->token_starts_at      = $was_at;
+		$this->token_length         = $this->bytes_already_parsed - $this->token_starts_at;
+		$this->text_starts_at       = $tag_ends_at + 1;
+		$this->text_length          = $this->tag_name_starts_at - $this->text_starts_at;
+		$this->tag_name_starts_at   = $tag_name_starts_at;
+		$this->tag_name_length      = $tag_name_length;
+		$this->attributes           = $attributes;
+		$this->duplicate_attributes = $duplicate_attributes;
+
 		return true;
 	}
 
@@ -830,7 +1019,7 @@ public function next_token() {
 	 * @return bool Whether the parse paused at the start of an incomplete token.
 	 */
 	public function paused_at_incomplete_token() {
-		return self::STATE_INCOMPLETE === $this->parser_state;
+		return self::STATE_INCOMPLETE_INPUT === $this->parser_state;
 	}
 
 	/**
@@ -1007,7 +1196,10 @@ public function has_class( $wanted_class ) {
 	 */
 	public function set_bookmark( $name ) {
 		// It only makes sense to set a bookmark if the parser has paused on a concrete token.
-		if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
+		if (
+			self::STATE_COMPLETE === $this->parser_state ||
+			self::STATE_INCOMPLETE_INPUT === $this->parser_state
+		) {
 			return false;
 		}
 
@@ -1082,15 +1274,15 @@ private function skip_rcdata( $tag_name ) {
 		$at = $this->bytes_already_parsed;
 
 		while ( false !== $at && $at < $doc_length ) {
-			$at = strpos( $this->html, '</', $at );
+			$at                       = strpos( $this->html, '</', $at );
+			$this->tag_name_starts_at = $at;
 
 			// Fail if there is no possible tag closer.
 			if ( false === $at || ( $at + $tag_length ) >= $doc_length ) {
 				return false;
 			}
 
-			$closer_potentially_starts_at = $at;
-			$at                          += 2;
+			$at += 2;
 
 			/*
 			 * Find a case-insensitive match to the tag name.
@@ -1131,13 +1323,23 @@ private function skip_rcdata( $tag_name ) {
 			while ( $this->parse_next_attribute() ) {
 				continue;
 			}
+
 			$at = $this->bytes_already_parsed;
 			if ( $at >= strlen( $this->html ) ) {
 				return false;
 			}
 
-			if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) {
-				$this->bytes_already_parsed = $closer_potentially_starts_at;
+			if ( '>' === $html[ $at ] ) {
+				$this->bytes_already_parsed = $at + 1;
+				return true;
+			}
+
+			if ( $at + 1 >= strlen( $this->html ) ) {
+				return false;
+			}
+
+			if ( '/' === $html[ $at ] && '>' === $html[ $at + 1 ] ) {
+				$this->bytes_already_parsed = $at + 2;
 				return true;
 			}
 		}
@@ -1259,6 +1461,7 @@ private function skip_script_data() {
 
 			if ( $is_closing ) {
 				$this->bytes_already_parsed = $closer_potentially_starts_at;
+				$this->tag_name_starts_at   = $closer_potentially_starts_at;
 				if ( $this->bytes_already_parsed >= $doc_length ) {
 					return false;
 				}
@@ -1268,13 +1471,13 @@ private function skip_script_data() {
 				}
 
 				if ( $this->bytes_already_parsed >= $doc_length ) {
-					$this->parser_state = self::STATE_INCOMPLETE;
+					$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 					return false;
 				}
 
 				if ( '>' === $html[ $this->bytes_already_parsed ] ) {
-					$this->bytes_already_parsed = $closer_potentially_starts_at;
+					++$this->bytes_already_parsed;
 					return true;
 				}
 			}
@@ -1303,17 +1506,34 @@ private function parse_next_tag() {
 
 		$html       = $this->html;
 		$doc_length = strlen( $html );
-		$at         = $this->bytes_already_parsed;
+		$was_at     = $this->bytes_already_parsed;
+		$at         = $was_at;
 
 		while ( false !== $at && $at < $doc_length ) {
 			$at = strpos( $html, '<', $at );
 
+			if ( $at > $was_at ) {
+				$this->parser_state         = self::STATE_TEXT_NODE;
+				$this->token_starts_at      = $was_at;
+				$this->token_length         = $at - $was_at;
+				$this->text_starts_at       = $was_at;
+				$this->text_length          = $this->token_length;
+				$this->bytes_already_parsed = $at;
+				return true;
+			}
+
 			/*
 			 * This does not imply an incomplete parse; it indicates that there
 			 * can be nothing left in the document other than a #text node.
 			 */
 			if ( false === $at ) {
-				return false;
+				$this->parser_state         = self::STATE_TEXT_NODE;
+				$this->token_starts_at      = $was_at;
+				$this->token_length         = strlen( $html ) - $was_at;
+				$this->text_starts_at       = $was_at;
+				$this->text_length          = $this->token_length;
+				$this->bytes_already_parsed = strlen( $html );
+				return true;
 			}
 
 			$this->token_starts_at = $at;
@@ -1342,8 +1562,9 @@ private function parse_next_tag() {
 			$tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 );
 			if ( $tag_name_prefix_length > 0 ) {
 				++$at;
-				$this->tag_name_length      = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
+				$this->parser_state         = self::STATE_MATCHED_TAG;
 				$this->tag_name_starts_at   = $at;
+				$this->tag_name_length      = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
 				$this->bytes_already_parsed = $at + $this->tag_name_length;
 				return true;
 			}
@@ -1353,18 +1574,18 @@ private function parse_next_tag() {
 			 * the document. There is nothing left to parse.
 			 */
 			if ( $at + 1 >= $doc_length ) {
-				$this->parser_state = self::STATE_INCOMPLETE;
+				$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 				return false;
 			}
 
 			/*
-			 * <! transitions to markup declaration open state
+			 * `<!` transitions to markup declaration open state
 			 * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
 			 */
 			if ( '!' === $html[ $at + 1 ] ) {
 				/*
-				 * <!-- transitions to a bogus comment state – skip to the nearest -->
+				 * `<!--` transitions to a comment state – apply further comment rules.
 				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
 				 */
 				if (
@@ -1375,7 +1596,7 @@ private function parse_next_tag() {
 					$closer_at = $at + 4;
 					// If it's not possible to close the comment then there is nothing more to scan.
 					if ( $doc_length <= $closer_at ) {
-						$this->parser_state = self::STATE_INCOMPLETE;
+						$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 						return false;
 					}
@@ -1383,8 +1604,27 @@ private function parse_next_tag() {
 					// Abruptly-closed empty comments are a sequence of dashes followed by `>`.
 					$span_of_dashes = strspn( $html, '-', $closer_at );
 					if ( '>' === $html[ $closer_at + $span_of_dashes ] ) {
-						$at = $closer_at + $span_of_dashes + 1;
-						continue;
+						/*
+						 * @todo When implementing `set_modifiable_text()` ensure that updates to this token
+						 *       don't break the syntax for short comments, e.g. `<!--->`. Unlike other comment
+						 *       and bogus comment syntax, these leave no clear insertion point for text and
+						 *       they need to be modified specially in order to contain text. E.g. to store
+						 *       `?` as the modifiable text, the `<!--->` needs to become `<!--?-->`, which
+						 *       involves inserting an additional `-` into the token after the modifiable text.
+						 */
+						$this->parser_state = self::STATE_COMMENT;
+						$this->comment_type = self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT;
+						$this->token_length = $closer_at + $span_of_dashes + 1 - $this->token_starts_at;
+
+						// Only provide modifiable text if the token is long enough to contain it.
+						if ( $span_of_dashes >= 2 ) {
+							$this->comment_type   = self::COMMENT_AS_HTML_COMMENT;
+							$this->text_starts_at = $this->token_starts_at + 4;
+							$this->text_length    = $span_of_dashes - 2;
+						}
+
+						$this->bytes_already_parsed = $closer_at + $span_of_dashes + 1;
+						return true;
 					}
 
 					/*
@@ -1397,51 +1637,39 @@ private function parse_next_tag() {
 					while ( ++$closer_at < $doc_length ) {
 						$closer_at = strpos( $html, '--', $closer_at );
 						if ( false === $closer_at ) {
-							$this->parser_state = self::STATE_INCOMPLETE;
+							$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 							return false;
 						}
 
 						if ( $closer_at + 2 < $doc_length && '>' === $html[ $closer_at + 2 ] ) {
-							$at = $closer_at + 3;
-							continue 2;
+							$this->parser_state         = self::STATE_COMMENT;
+							$this->comment_type         = self::COMMENT_AS_HTML_COMMENT;
+							$this->token_length         = $closer_at + 3 - $this->token_starts_at;
+							$this->text_starts_at       = $this->token_starts_at + 4;
+							$this->text_length          = $closer_at - $this->text_starts_at;
+							$this->bytes_already_parsed = $closer_at + 3;
+							return true;
 						}
 
-						if ( $closer_at + 3 < $doc_length && '!' === $html[ $closer_at + 2 ] && '>' === $html[ $closer_at + 3 ] ) {
-							$at = $closer_at + 4;
-							continue 2;
+						if (
+							$closer_at + 3 < $doc_length &&
+							'!' === $html[ $closer_at + 2 ] &&
+							'>' === $html[ $closer_at + 3 ]
+						) {
+							$this->parser_state         = self::STATE_COMMENT;
+							$this->comment_type         = self::COMMENT_AS_HTML_COMMENT;
+							$this->token_length         = $closer_at + 4 - $this->token_starts_at;
+							$this->text_starts_at       = $this->token_starts_at + 4;
+							$this->text_length          = $closer_at - $this->text_starts_at;
+							$this->bytes_already_parsed = $closer_at + 4;
+							return true;
 						}
 					}
 				}
 
 				/*
-				 * <![CDATA[ transitions to CDATA section state – skip to the nearest ]]>
-				 * The CDATA is case-sensitive.
-				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
-				 */
-				if (
-					$doc_length > $at + 8 &&
-					'[' === $html[ $at + 2 ] &&
-					'C' === $html[ $at + 3 ] &&
-					'D' === $html[ $at + 4 ] &&
-					'A' === $html[ $at + 5 ] &&
-					'T' === $html[ $at + 6 ] &&
-					'A' === $html[ $at + 7 ] &&
-					'[' === $html[ $at + 8 ]
-				) {
-					$closer_at = strpos( $html, ']]>', $at + 9 );
-					if ( false === $closer_at ) {
-						$this->parser_state = self::STATE_INCOMPLETE;
-
-						return false;
-					}
-
-					$at = $closer_at + 3;
-					continue;
-				}
-
-				/*
-				 * <!DOCTYPE transitions to DOCTYPE state – skip to the nearest >
+				 * `<!DOCTYPE` transitions to DOCTYPE state – skip to the nearest >
 				 * These are ASCII-case-insensitive.
 				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
 				 */
@@ -1457,13 +1685,17 @@ private function parse_next_tag() {
 				) {
 					$closer_at = strpos( $html, '>', $at + 9 );
 					if ( false === $closer_at ) {
-						$this->parser_state = self::STATE_INCOMPLETE;
+						$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 						return false;
 					}
 
-					$at = $closer_at + 1;
-					continue;
+					$this->parser_state         = self::STATE_DOCTYPE;
+					$this->token_length         = $closer_at + 1 - $this->token_starts_at;
+					$this->text_starts_at       = $this->token_starts_at + 9;
+					$this->text_length          = $closer_at - $this->text_starts_at;
+					$this->bytes_already_parsed = $closer_at + 1;
+					return true;
 				}
 
 				/*
@@ -1471,14 +1703,53 @@ private function parse_next_tag() {
 				 * to the bogus comment state - skip to the nearest >. If no closer is
 				 * found then the HTML was truncated inside the markup declaration.
 				 */
-				$at = strpos( $html, '>', $at + 1 );
-				if ( false === $at ) {
-					$this->parser_state = self::STATE_INCOMPLETE;
+				$closer_at = strpos( $html, '>', $at + 1 );
+				if ( false === $closer_at ) {
+					$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 					return false;
 				}
 
-				continue;
+				$this->parser_state         = self::STATE_COMMENT;
+				$this->comment_type         = self::COMMENT_AS_INVALID_HTML;
+				$this->token_length         = $closer_at + 1 - $this->token_starts_at;
+				$this->text_starts_at       = $this->token_starts_at + 2;
+				$this->text_length          = $closer_at - $this->text_starts_at;
+				$this->bytes_already_parsed = $closer_at + 1;
+
+				/*
+				 * Identify nodes that would be CDATA if HTML had CDATA sections.
+				 *
+				 * This section must occur after identifying the bogus comment end
+				 * because in an HTML parser it will span to the nearest `>`, even
+				 * if there's no `]]>` as would be required in an XML document. It
+				 * is therefore not possible to parse a CDATA section containing
+				 * a `>` in the HTML syntax.
+				 *
+				 * Inside foreign elements there is a discrepancy between browsers
+				 * and the specification on this.
+				 *
+				 * @todo Track whether the Tag Processor is inside a foreign element
+				 *       and require the proper closing `]]>` in those cases.
+				 */
+				if (
+					$this->token_length >= 10 &&
+					'[' === $html[ $this->token_starts_at + 2 ] &&
+					'C' === $html[ $this->token_starts_at + 3 ] &&
+					'D' === $html[ $this->token_starts_at + 4 ] &&
+					'A' === $html[ $this->token_starts_at + 5 ] &&
+					'T' === $html[ $this->token_starts_at + 6 ] &&
+					'A' === $html[ $this->token_starts_at + 7 ] &&
+					'[' === $html[ $this->token_starts_at + 8 ] &&
+					']' === $html[ $closer_at - 1 ]
+				) {
+					$this->parser_state    = self::STATE_COMMENT;
+					$this->comment_type    = self::COMMENT_AS_CDATA_LOOKALIKE;
+					$this->text_starts_at += 7;
+					$this->text_length    -= 9;
+				}
+
+				return true;
 			}
 
 			/*
@@ -1491,30 +1762,80 @@ private function parse_next_tag() {
 			 * See https://html.spec.whatwg.org/#parse-error-missing-end-tag-name
 			 */
 			if ( '>' === $html[ $at + 1 ] ) {
-				++$at;
-				continue;
+				$this->parser_state         = self::STATE_PRESUMPTUOUS_TAG;
+				$this->token_length         = $at + 2 - $this->token_starts_at;
+				$this->bytes_already_parsed = $at + 2;
+				return true;
 			}
 
 			/*
-			 * <? transitions to a bogus comment state – skip to the nearest >
+			 * `<?` transitions to a bogus comment state – skip to the nearest >
 			 * See https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
 			 */
 			if ( '?' === $html[ $at + 1 ] ) {
 				$closer_at = strpos( $html, '>', $at + 2 );
 				if ( false === $closer_at ) {
-					$this->parser_state = self::STATE_INCOMPLETE;
+					$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 					return false;
 				}
 
-				$at = $closer_at + 1;
-				continue;
+				$this->parser_state         = self::STATE_COMMENT;
+				$this->comment_type         = self::COMMENT_AS_INVALID_HTML;
+				$this->token_length         = $closer_at + 1 - $this->token_starts_at;
+				$this->text_starts_at       = $this->token_starts_at + 2;
+				$this->text_length          = $closer_at - $this->text_starts_at;
+				$this->bytes_already_parsed = $closer_at + 1;
+
+				/*
+				 * Identify a Processing Instruction node were HTML to have them.
+				 *
+				 * This section must occur after identifying the bogus comment end
+				 * because in an HTML parser it will span to the nearest `>`, even
+				 * if there's no `?>` as would be required in an XML document. It
+				 * is therefore not possible to parse a Processing Instruction node
+				 * containing a `>` in the HTML syntax.
+				 *
+				 * XML allows for more target names, but this code only identifies
+				 * those with ASCII-representable target names. This means that it
+				 * may identify some Processing Instruction nodes as bogus comments,
+				 * but it will not misinterpret the HTML structure. By limiting the
+				 * identification to these target names the Tag Processor can avoid
+				 * the need to start parsing UTF-8 sequences.
+				 *
+				 * > NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] |
+				 *                     [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] |
+				 *                     [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
+				 *                     [#x10000-#xEFFFF]
+				 * > NameChar      ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+				 *
+				 * @see https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
+				 */
+				if ( $this->token_length >= 5 && '?' === $html[ $closer_at - 1 ] ) {
+					$comment_text     = substr( $html, $this->token_starts_at + 2, $this->token_length - 4 );
+					$pi_target_length = strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ:_' );
+
+					if ( 0 < $pi_target_length ) {
+						$pi_target_length += strspn( $comment_text, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789:_-.', $pi_target_length );
+
+						$this->comment_type       = self::COMMENT_AS_PI_NODE_LOOKALIKE;
+						$this->tag_name_starts_at = $this->token_starts_at + 2;
+						$this->tag_name_length    = $pi_target_length;
+						$this->text_starts_at    += $pi_target_length;
+						$this->text_length       -= $pi_target_length + 1;
+					}
+				}
+
+				return true;
 			}
 
 			/*
 			 * If a non-alpha starts the tag name in a tag closer it's a comment.
 			 * Find the first `>`, which closes the comment.
 			 *
+			 * This parser classifies these particular comments as special "funky comments"
+			 * which are made available for further processing.
+			 *
 			 * See https://html.spec.whatwg.org/#parse-error-invalid-first-character-of-tag-name
 			 */
 			if ( $this->is_closing_tag ) {
@@ -1525,13 +1846,17 @@ private function parse_next_tag() {
 
 				$closer_at = strpos( $html, '>', $at + 3 );
 				if ( false === $closer_at ) {
-					$this->parser_state = self::STATE_INCOMPLETE;
+					$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 					return false;
 				}
 
-				$at = $closer_at + 1;
-				continue;
+				$this->parser_state         = self::STATE_FUNKY_COMMENT;
+				$this->token_length         = $closer_at + 1 - $this->token_starts_at;
+				$this->text_starts_at       = $this->token_starts_at + 2;
+				$this->text_length          = $closer_at - $this->text_starts_at;
+				$this->bytes_already_parsed = $closer_at + 1;
+				return true;
 			}
 
 			++$at;
@@ -1551,7 +1876,7 @@ private function parse_next_attribute() {
 		// Skip whitespace and slashes.
 		$this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed );
 		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
-			$this->parser_state = self::STATE_INCOMPLETE;
+			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 			return false;
 		}
@@ -1575,14 +1900,14 @@ private function parse_next_attribute() {
 		$attribute_name              = substr( $this->html, $attribute_start, $name_length );
 		$this->bytes_already_parsed += $name_length;
 		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
-			$this->parser_state = self::STATE_INCOMPLETE;
+			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 			return false;
 		}
 
 		$this->skip_whitespace();
 		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
-			$this->parser_state = self::STATE_INCOMPLETE;
+			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 			return false;
 		}
@@ -1592,7 +1917,7 @@ private function parse_next_attribute() {
 			++$this->bytes_already_parsed;
 			$this->skip_whitespace();
 			if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
-				$this->parser_state = self::STATE_INCOMPLETE;
+				$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 				return false;
 			}
@@ -1620,7 +1945,7 @@ private function parse_next_attribute() {
 		}
 
 		if ( $attribute_end >= strlen( $this->html ) ) {
-			$this->parser_state = self::STATE_INCOMPLETE;
+			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 			return false;
 		}
@@ -1692,8 +2017,11 @@ private function after_tag() {
 		$this->token_length         = null;
 		$this->tag_name_starts_at   = null;
 		$this->tag_name_length      = null;
+		$this->text_starts_at       = 0;
+		$this->text_length          = 0;
 		$this->is_closing_tag       = null;
 		$this->attributes           = array();
+		$this->comment_type         = null;
 		$this->duplicate_attributes = null;
 	}
 
@@ -1985,7 +2313,7 @@ public function seek( $bookmark_name ) {
 
 		// Point this tag processor before the sought tag opener and consume it.
 		$this->bytes_already_parsed = $this->bookmarks[ $bookmark_name ]->start;
-		return $this->next_tag( array( 'tag_closers' => 'visit' ) );
+		return $this->next_token();
 	}
 
 	/**
@@ -2216,13 +2544,24 @@ public function get_attribute_names_with_prefix( $prefix ) {
 	 * @return string|null Name of currently matched tag in input HTML, or `null` if none found.
 	 */
 	public function get_tag() {
-		if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
+		if ( null === $this->tag_name_starts_at ) {
 			return null;
 		}
 
 		$tag_name = substr( $this->html, $this->tag_name_starts_at, $this->tag_name_length );
 
-		return strtoupper( $tag_name );
+		if ( self::STATE_MATCHED_TAG === $this->parser_state ) {
+			return strtoupper( $tag_name );
+		}
+
+		if (
+			self::STATE_COMMENT === $this->parser_state &&
+			self::COMMENT_AS_PI_NODE_LOOKALIKE === $this->get_comment_type()
+		) {
+			return $tag_name;
+		}
+
+		return null;
 	}
 
 	/**
@@ -2281,6 +2620,191 @@ public function is_tag_closer() {
 		);
 	}
 
+	/**
+	 * Indicates the kind of matched token, if any.
+	 *
+	 * This differs from `get_token_name()` in that it always
+	 * returns a static string indicating the type, whereas
+	 * `get_token_name()` may return values derived from the
+	 * token itself, such as a tag name or processing
+	 * instruction tag.
+	 *
+	 * Possible values:
+	 *  - `#tag` when matched on a tag.
+	 *  - `#text` when matched on a text node.
+	 *  - `#cdata-section` when matched on a CDATA node.
+	 *  - `#comment` when matched on a comment.
+	 *  - `#doctype` when matched on a DOCTYPE declaration.
+	 *  - `#presumptuous-tag` when matched on an empty tag closer.
+	 *  - `#funky-comment` when matched on a funky comment.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @return string|null What kind of token is matched, or null.
+	 */
+	public function get_token_type() {
+		$state = $this->parser_state;
+
+		// Tags and DOCTYPEs have dynamic names, so their static types are given here.
+		if ( self::STATE_MATCHED_TAG === $state ) {
+			return '#tag';
+		}
+		if ( self::STATE_DOCTYPE === $state ) {
+			return '#doctype';
+		}
+		return $this->get_token_name();
+	}
+
+	/**
+	 * Returns the node name represented by the token.
+	 *
+	 * This matches the DOM API value `nodeName`. Some values
+	 * are static, such as `#text` for a text node, while others
+	 * are dynamically generated from the token itself.
+	 *
+	 * Dynamic names:
+	 *  - Uppercase tag name for tag matches.
+	 *  - `html` for DOCTYPE declarations.
+	 *
+	 * Note that if the Tag Processor is not matched on a token
+	 * then this function will return `null`, either because it
+	 * hasn't yet found a token or because it reached the end
+	 * of the document without matching a token.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @return string|null Name of the matched token.
+	 */
+	public function get_token_name() {
+		$state = $this->parser_state;
+
+		// A matched tag is the only token whose name is read from the document.
+		if ( self::STATE_MATCHED_TAG === $state ) {
+			return $this->get_tag();
+		}
+
+		// Every other token kind maps onto a fixed node name.
+		$static_names = array(
+			self::STATE_TEXT_NODE        => '#text',
+			self::STATE_CDATA_NODE       => '#cdata-section',
+			self::STATE_COMMENT          => '#comment',
+			self::STATE_DOCTYPE          => 'html',
+			self::STATE_PRESUMPTUOUS_TAG => '#presumptuous-tag',
+			self::STATE_FUNKY_COMMENT    => '#funky-comment',
+		);
+
+		// Parser states without a matched token have no entry and yield null.
+		if ( isset( $static_names[ $state ] ) ) {
+			return $static_names[ $state ];
+		}
+		return null;
+	}
+
+	/**
+	 * Indicates what kind of comment produced the comment node.
+	 *
+	 * Because there are different kinds of HTML syntax which produce
+	 * comments, the Tag Processor tracks and exposes this as a type
+	 * for the comment. Nominally only regular HTML comments exist as
+	 * they are commonly known, but a number of unrelated syntax errors
+	 * also produce comments.
+	 *
+	 * @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT
+	 * @see self::COMMENT_AS_CDATA_LOOKALIKE
+	 * @see self::COMMENT_AS_INVALID_HTML
+	 * @see self::COMMENT_AS_HTML_COMMENT
+	 * @see self::COMMENT_AS_PI_NODE_LOOKALIKE
+	 *
+	 * @since 6.5.0
+	 *
+	 * @return string|null
+	 */
+	public function get_comment_type() {
+		$is_comment = self::STATE_COMMENT === $this->parser_state;
+
+		// Only report a comment type while the parser is matched on a comment
+		// token; every other parser state yields null.
+		return $is_comment ? $this->comment_type : null;
+	}
+
+	/**
+	 * Returns the modifiable text for a matched token, or an empty string.
+	 *
+	 * Modifiable text is text content that may be read and changed without
+	 * changing the HTML structure of the document around it. This includes
+	 * the contents of `#text` nodes in the HTML as well as the inner
+	 * contents of HTML comments, Processing Instructions, and others, even
+	 * though these nodes aren't part of a parsed DOM tree. It also includes
+	 * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
+	 * other section in an HTML document which cannot contain HTML markup (DATA).
+	 *
+	 * If a token has no modifiable text then an empty string is returned to
+	 * avoid needless crashing or type errors. An empty string does not mean
+	 * that a token has modifiable text, and a token with modifiable text may
+	 * have an empty string (e.g. a comment with no contents).
+	 *
+	 * @since 6.5.0
+	 *
+	 * @return string
+	 */
+	public function get_modifiable_text() {
+		if ( null === $this->text_starts_at ) {
+			return '';
+		}
+
+		$text = substr( $this->html, $this->text_starts_at, $this->text_length );
+
+		// CDATA, comment, DOCTYPE, and funky comment data is returned as-is, never decoded.
+		if (
+			self::STATE_CDATA_NODE === $this->parser_state ||
+			self::STATE_COMMENT === $this->parser_state ||
+			self::STATE_DOCTYPE === $this->parser_state ||
+			self::STATE_FUNKY_COMMENT === $this->parser_state
+		) {
+			return $text;
+		}
+
+		$tag_name = $this->get_tag();
+		if (
+			// Script data is not decoded.
+			'SCRIPT' === $tag_name ||
+
+			// RAWTEXT data is not decoded.
+			'IFRAME' === $tag_name ||
+			'NOEMBED' === $tag_name ||
+			'NOFRAMES' === $tag_name ||
+			'STYLE' === $tag_name ||
+			'XMP' === $tag_name
+		) {
+			return $text;
+		}
+
+		$decoded = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5 | ENT_SUBSTITUTE );
+		// Compare strictly: `empty()` is also true for "0" and would discard that text.
+		if ( '' === $decoded ) {
+			return '';
+		}
+
+		/*
+		 * TEXTAREA skips a leading newline, but this newline may appear not only as the
+		 * literal character `\n`, but also as a character reference, such as in the
+		 * following markup: `<textarea>&#x0a;Content</textarea>`.
+		 *
+		 * For these cases it's important to first decode the text content before checking
+		 * for a leading newline and removing it.
+		 */
+		if (
+			self::STATE_MATCHED_TAG === $this->parser_state &&
+			'TEXTAREA' === $tag_name &&
+			strlen( $decoded ) > 0 &&
+			"\n" === $decoded[0]
+		) {
+			return substr( $decoded, 1 );
+		}
+
+		return $decoded;
+	}
+
 	/**
 	 * Updates or creates a new attribute on the currently matched tag with the passed value.
 	 *
@@ -2746,7 +3270,7 @@ private function matches() {
 	}
 
 	/**
-	 * Parser Ready State
+	 * Parser Ready State.
 	 *
 	 * Indicates that the parser is ready to run and waiting for a state transition.
 	 * It may not have started yet, or it may have just finished parsing a token and
@@ -2759,7 +3283,7 @@ private function matches() {
 	const STATE_READY = 'STATE_READY';
 
 	/**
-	 * Parser Complete State
+	 * Parser Complete State.
 	 *
 	 * Indicates that the parser has reached the end of the document and there is
 	 * nothing left to scan. It finished parsing the last token completely.
@@ -2771,7 +3295,7 @@ private function matches() {
 	const STATE_COMPLETE = 'STATE_COMPLETE';
 
 	/**
-	 * Parser Incomplete State
+	 * Parser Incomplete Input State.
 	 *
 	 * Indicates that the parser has reached the end of the document before finishing
 	 * a token. It started parsing a token but there is a possibility that the input
@@ -2784,10 +3308,10 @@ private function matches() {
 	 *
 	 * @access private
 	 */
-	const STATE_INCOMPLETE = 'STATE_INCOMPLETE';
+	const STATE_INCOMPLETE_INPUT = 'STATE_INCOMPLETE_INPUT';
 
 	/**
-	 * Parser Matched Tag State
+	 * Parser Matched Tag State.
 	 *
 	 * Indicates that the parser has found an HTML tag and it's possible to get
 	 * the tag name and read or modify its attributes (if it's not a closing tag).
@@ -2797,4 +3321,153 @@ private function matches() {
 	 * @access private
 	 */
 	const STATE_MATCHED_TAG = 'STATE_MATCHED_TAG';
+
+	/**
+	 * Parser Text Node State.
+	 *
+	 * Indicates that the parser has found a text node and it's possible
+	 * to read and modify that text.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_TEXT_NODE = 'STATE_TEXT_NODE';
+
+	/**
+	 * Parser CDATA Node State.
+	 *
+	 * Indicates that the parser has found a CDATA node and it's possible
+	 * to read and modify its modifiable text. Note that in HTML there are
+	 * no CDATA nodes outside of foreign content (SVG and MathML). Outside
+	 * of foreign content, they are treated as HTML comments.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_CDATA_NODE = 'STATE_CDATA_NODE';
+
+	/**
+	 * Indicates that the parser has found an HTML comment and it's
+	 * possible to read and modify its modifiable text.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_COMMENT = 'STATE_COMMENT';
+
+	/**
+	 * Indicates that the parser has found a DOCTYPE node and it's
+	 * possible to read and modify its modifiable text.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_DOCTYPE = 'STATE_DOCTYPE';
+
+	/**
+	 * Indicates that the parser has found an empty tag closer `</>`.
+	 *
+	 * Note that in HTML there are no empty tag closers, and they
+	 * are ignored. Nonetheless, the Tag Processor still
+	 * recognizes them as they appear in the HTML stream.
+	 *
+	 * These were historically discussed as a "presumptuous tag
+	 * closer," which would close the nearest open tag, but were
+	 * dismissed in favor of explicitly-closing tags.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_PRESUMPTUOUS_TAG = 'STATE_PRESUMPTUOUS_TAG';
+
+	/**
+	 * Indicates that the parser has found a "funky comment"
+	 * and it's possible to read and modify its modifiable text.
+	 *
+	 * Example:
+	 *
+	 *     </%url>
+	 *     </{"wp-bit":"query/post-author"}>
+	 *     </2>
+	 *
+	 * Funky comments are tag closers with invalid tag names. Note
+	 * that in HTML these are turned into bogus comments. Nonetheless,
+	 * the Tag Processor recognizes them in a stream of HTML and
+	 * exposes them for inspection and modification.
+	 *
+	 * @since 6.5.0
+	 *
+	 * @access private
+	 */
+	const STATE_FUNKY_COMMENT = 'STATE_WP_FUNKY';
+
+	/**
+	 * Indicates that a comment was created when encountering an abruptly-closed HTML comment.
+	 *
+	 * Example:
+	 *
+	 *     <!-->
+	 *     <!--->
+	 *
+	 * @since 6.5.0
+	 */
+	const COMMENT_AS_ABRUPTLY_CLOSED_COMMENT = 'COMMENT_AS_ABRUPTLY_CLOSED_COMMENT';
+
+	/**
+	 * Indicates that a comment would be parsed as a CDATA node,
+	 * were HTML to allow CDATA nodes outside of foreign content.
+	 *
+	 * Example:
+	 *
+	 *     <![CDATA[This is a CDATA node.]]>
+	 *
+	 * This is an HTML comment, but it looks like a CDATA node.
+	 *
+	 * @since 6.5.0
+	 */
+	const COMMENT_AS_CDATA_LOOKALIKE = 'COMMENT_AS_CDATA_LOOKALIKE';
+
+	/**
+	 * Indicates that a comment was created when encountering
+	 * normative HTML comment syntax.
+	 *
+	 * Example:
+	 *
+	 *     <!-- this is a comment -->
+	 *
+	 * @since 6.5.0
+	 */
+	const COMMENT_AS_HTML_COMMENT = 'COMMENT_AS_HTML_COMMENT';
+
+	/**
+	 * Indicates that a comment would be parsed as a Processing
+	 * Instruction node, were they to exist within HTML.
+	 *
+	 * Example:
+	 *
+	 *     <?wp __( 'Like' ) ?>
+	 *
+	 * This is an HTML comment, but it looks like a Processing Instruction node.
+	 *
+	 * @since 6.5.0
+	 */
+	const COMMENT_AS_PI_NODE_LOOKALIKE = 'COMMENT_AS_PI_NODE_LOOKALIKE';
+
+	/**
+	 * Indicates that a comment was created when encountering invalid
+	 * HTML input, a so-called "bogus comment."
+	 *
+	 * Example:
+	 *
+	 *     <?nothing special>
+	 *     <!{nothing special}>
+	 *
+	 * @since 6.5.0
+	 */
+	const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';
 }