diff --git a/bootstrap.php b/bootstrap.php
new file mode 100644
index 0000000..1764edc
--- /dev/null
+++ b/bootstrap.php
@@ -0,0 +1,53 @@
+<?php
+
+// Where to find the streaming WP_XML_Processor.
+// Use a version from this PR: https://github.com/adamziel/wordpress-develop/pull/43
+define('WP_XML_API_PATH', __DIR__ );
+define('SITE_TRANSFER_PROTOCOL_PATH', __DIR__ . '/site-transfer-protocol' );
+define('BLUEPRINTS_LIB_PATH', __DIR__ . '/blueprints-library/src/WordPress' );
+if(!file_exists(WP_XML_API_PATH . '/class-wp-token-map.php')) {
+    copy(WP_XML_API_PATH.'/../class-wp-token-map.php', WP_XML_API_PATH . '/class-wp-token-map.php');
+}
+
+$requires = array();
+$requires[] = WP_XML_API_PATH . "/class-wp-html-token.php";
+$requires[] = WP_XML_API_PATH . "/class-wp-html-span.php";
+$requires[] = WP_XML_API_PATH . "/class-wp-html-text-replacement.php";
+$requires[] = WP_XML_API_PATH . "/class-wp-html-decoder.php";
+$requires[] = WP_XML_API_PATH . "/class-wp-html-attribute-token.php";
+
+$requires[] = WP_XML_API_PATH . "/class-wp-html-tag-processor.php";
+$requires[] = WP_XML_API_PATH . "/class-wp-html-open-elements.php";
+$requires[] = WP_XML_API_PATH . "/class-wp-token-map.php";
+$requires[] = WP_XML_API_PATH . "/html5-named-character-references.php";
+$requires[] = WP_XML_API_PATH . "/class-wp-html-active-formatting-elements.php";
+$requires[] = WP_XML_API_PATH . "/class-wp-html-processor-state.php";
+$requires[] = WP_XML_API_PATH . "/class-wp-html-unsupported-exception.php";
+$requires[] = WP_XML_API_PATH . "/class-wp-html-processor.php";
+
+$requires[] = WP_XML_API_PATH . "/class-wp-xml-decoder.php";
+$requires[] = WP_XML_API_PATH . "/class-wp-xml-tag-processor.php";
+$requires[] = WP_XML_API_PATH . "/class-wp-xml-processor.php";
+$requires[] = WP_XML_API_PATH . "/class-wp-wxr-normalizer.php";
+$requires[] = WP_XML_API_PATH . "/functions.php";
+$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamWrapperInterface.php";
+$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamWrapper.php";
+$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamPeekerWrapper.php";
+$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Request.php";
+$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Response.php";
+$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/HttpError.php";
+$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Connection.php";
+$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Client.php";
+$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/StreamWrapper/ChunkedEncodingWrapper.php";
+$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/StreamWrapper/InflateStreamWrapper.php";
+
+$requires[] = SITE_TRANSFER_PROTOCOL_PATH . '/src/WP_Block_Markup_Processor.php';
+$requires[] = SITE_TRANSFER_PROTOCOL_PATH . '/src/WP_Block_Markup_Url_Processor.php';
+$requires[] = SITE_TRANSFER_PROTOCOL_PATH . '/src/WP_Migration_URL_In_Text_Processor.php';
+$requires[] = SITE_TRANSFER_PROTOCOL_PATH . '/src/WP_URL.php';
+$requires[] = SITE_TRANSFER_PROTOCOL_PATH . '/src/functions.php';
+$requires[] = SITE_TRANSFER_PROTOCOL_PATH . '/vendor/autoload.php';
+
+foreach ($requires as $require) {
+    require_once $require;
+}
diff --git a/class-wp-html-tag-processor.php b/class-wp-html-tag-processor.php
index 99f37dc..b3a9874 100644
--- a/class-wp-html-tag-processor.php
+++ b/class-wp-html-tag-processor.php
@@ -294,8 +294,8 @@
  *
  * The special elements are:
  *  - `SCRIPT` whose contents are treated as raw plaintext but supports a legacy
- *    style of including Javascript inside of HTML comments to avoid accidentally
- *    closing the SCRIPT from inside a Javascript string. E.g. `console.log( '</script>' )`.
+ *    style of including JavaScript inside of HTML comments to avoid accidentally
+ *    closing the SCRIPT from inside a JavaScript string. E.g. `console.log( '</script>' )`.
  *  - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any
  *    character references are decoded. E.g. `1 &lt; 2 < 3` becomes `1 < 2 < 3`.
  *  - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAME`, `STYLE` whose contents are treated as
@@ -1524,21 +1524,10 @@ private function parse_next_tag() {
 		$was_at     = $this->bytes_already_parsed;
 		$at         = $was_at;
 
-		while ( false !== $at && $at < $doc_length ) {
+		while ( $at < $doc_length ) {
 			$at = strpos( $html, '<', $at );
-
-			/*
-			 * This does not imply an incomplete parse; it indicates that there
-			 * can be nothing left in the document other than a #text node.
-			 */
 			if ( false === $at ) {
-				$this->parser_state         = self::STATE_TEXT_NODE;
-				$this->token_starts_at      = $was_at;
-				$this->token_length         = strlen( $html ) - $was_at;
-				$this->text_starts_at       = $was_at;
-				$this->text_length          = $this->token_length;
-				$this->bytes_already_parsed = strlen( $html );
-				return true;
+				break;
 			}
 
 			if ( $at > $was_at ) {
@@ -1554,19 +1543,9 @@ private function parse_next_tag() {
 				 *
 				 * @see https://html.spec.whatwg.org/#tag-open-state
 				 */
-				if ( strlen( $html ) > $at + 1 ) {
-					$next_character  = $html[ $at + 1 ];
-					$at_another_node = (
-						'!' === $next_character ||
-						'/' === $next_character ||
-						'?' === $next_character ||
-						( 'A' <= $next_character && $next_character <= 'Z' ) ||
-						( 'a' <= $next_character && $next_character <= 'z' )
-					);
-					if ( ! $at_another_node ) {
-						++$at;
-						continue;
-					}
+				if ( 1 !== strspn( $html, '!/?abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1, 1 ) ) {
+					++$at;
+					continue;
 				}
 
 				$this->parser_state         = self::STATE_TEXT_NODE;
@@ -1630,11 +1609,7 @@ private function parse_next_tag() {
 				 * `<!--` transitions to a comment state – apply further comment rules.
 				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
 				 */
-				if (
-					$doc_length > $at + 3 &&
-					'-' === $html[ $at + 2 ] &&
-					'-' === $html[ $at + 3 ]
-				) {
+				if ( 0 === substr_compare( $html, '--', $at + 2, 2 ) ) {
 					$closer_at = $at + 4;
 					// If it's not possible to close the comment then there is nothing more to scan.
 					if ( $doc_length <= $closer_at ) {
@@ -1911,7 +1886,17 @@ private function parse_next_tag() {
 			++$at;
 		}
 
-		return false;
+		/*
+		 * This does not imply an incomplete parse; it indicates that there
+		 * can be nothing left in the document other than a #text node.
+		 */
+		$this->parser_state         = self::STATE_TEXT_NODE;
+		$this->token_starts_at      = $was_at;
+		$this->token_length         = $doc_length - $was_at;
+		$this->text_starts_at       = $was_at;
+		$this->text_length          = $this->token_length;
+		$this->bytes_already_parsed = $doc_length;
+		return true;
 	}
 
 	/**
@@ -1922,9 +1907,11 @@ private function parse_next_tag() {
 	 * @return bool Whether an attribute was found before the end of the document.
 	 */
 	private function parse_next_attribute() {
+		$doc_length = strlen( $this->html );
+
 		// Skip whitespace and slashes.
 		$this->bytes_already_parsed += strspn( $this->html, " \t\f\r\n/", $this->bytes_already_parsed );
-		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+		if ( $this->bytes_already_parsed >= $doc_length ) {
 			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 			return false;
@@ -1941,21 +1928,21 @@ private function parse_next_attribute() {
 			: strcspn( $this->html, "=/> \t\f\r\n", $this->bytes_already_parsed );
 
 		// No attribute, just tag closer.
-		if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= strlen( $this->html ) ) {
+		if ( 0 === $name_length || $this->bytes_already_parsed + $name_length >= $doc_length ) {
 			return false;
 		}
 
 		$attribute_start             = $this->bytes_already_parsed;
 		$attribute_name              = substr( $this->html, $attribute_start, $name_length );
 		$this->bytes_already_parsed += $name_length;
-		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+		if ( $this->bytes_already_parsed >= $doc_length ) {
 			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 			return false;
 		}
 
 		$this->skip_whitespace();
-		if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+		if ( $this->bytes_already_parsed >= $doc_length ) {
 			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 			return false;
@@ -1965,7 +1952,7 @@ private function parse_next_attribute() {
 		if ( $has_value ) {
 			++$this->bytes_already_parsed;
 			$this->skip_whitespace();
-			if ( $this->bytes_already_parsed >= strlen( $this->html ) ) {
+			if ( $this->bytes_already_parsed >= $doc_length ) {
 				$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 				return false;
@@ -1976,8 +1963,10 @@ private function parse_next_attribute() {
 				case '"':
 					$quote                      = $this->html[ $this->bytes_already_parsed ];
 					$value_start                = $this->bytes_already_parsed + 1;
-					$value_length               = strcspn( $this->html, $quote, $value_start );
-					$attribute_end              = $value_start + $value_length + 1;
+					$end_quote_at               = strpos( $this->html, $quote, $value_start );
+					$end_quote_at               = false === $end_quote_at ? $doc_length : $end_quote_at;
+					$value_length               = $end_quote_at - $value_start;
+					$attribute_end              = $end_quote_at + 1;
 					$this->bytes_already_parsed = $attribute_end;
 					break;
 
@@ -1993,7 +1982,7 @@ private function parse_next_attribute() {
 			$attribute_end = $attribute_start + $name_length;
 		}
 
-		if ( $attribute_end >= strlen( $this->html ) ) {
+		if ( $attribute_end >= $doc_length ) {
 			$this->parser_state = self::STATE_INCOMPLETE_INPUT;
 
 			return false;
@@ -2014,7 +2003,7 @@ private function parse_next_attribute() {
 		$comparable_name = strtolower( $attribute_name );
 
 		// If an attribute is listed many times, only use the first declaration and ignore the rest.
-		if ( ! array_key_exists( $comparable_name, $this->attributes ) ) {
+		if ( ! isset( $this->attributes[ $comparable_name ] ) ) {
 			$this->attributes[ $comparable_name ] = new WP_HTML_Attribute_Token(
 				$attribute_name,
 				$value_start,
@@ -2038,7 +2027,7 @@ private function parse_next_attribute() {
 		$duplicate_span = new WP_HTML_Span( $attribute_start, $attribute_end - $attribute_start );
 		if ( null === $this->duplicate_attributes ) {
 			$this->duplicate_attributes = array( $comparable_name => array( $duplicate_span ) );
-		} elseif ( ! array_key_exists( $comparable_name, $this->duplicate_attributes ) ) {
+		} elseif ( ! isset( $this->duplicate_attributes[ $comparable_name ] ) ) {
 			$this->duplicate_attributes[ $comparable_name ] = array( $duplicate_span );
 		} else {
 			$this->duplicate_attributes[ $comparable_name ][] = $duplicate_span;
@@ -2265,7 +2254,7 @@ private function class_name_updates_to_attributes_updates() {
 	 * @param int $shift_this_point Accumulate and return shift for this position.
 	 * @return int How many bytes the given pointer moved in response to the updates.
 	 */
-	private function apply_attributes_updates( $shift_this_point = 0 ) {
+	private function apply_attributes_updates( $shift_this_point ) {
 		if ( ! count( $this->lexical_updates ) ) {
 			return 0;
 		}
@@ -2787,6 +2776,8 @@ public function get_token_name() {
 			case self::STATE_FUNKY_COMMENT:
 				return '#funky-comment';
 		}
+
+		return null;
 	}
 
 	/**
@@ -2890,6 +2881,157 @@ public function get_modifiable_text() {
 		return $decoded;
 	}
 
+	/**
+	 * Replaces the modifiable text of the currently-matched token.
+	 *
+	 * Modifiable text is the textual content of a token which can be read
+	 * and rewritten without altering the structure of the HTML around it.
+	 * This method updates the content of `#text` nodes, the inside of HTML
+	 * comments, and the raw contents of SCRIPT, STYLE, TEXTAREA, and TITLE
+	 * elements.
+	 *
+	 * The update is refused, and `false` is returned, in these situations:
+	 *
+	 *  - The matched token is none of the kinds listed above, or there is
+	 *    no matched token at all.
+	 *  - The new text of an HTML comment contains `-->` or `--!>`, either
+	 *    of which would terminate the comment prematurely.
+	 *  - The new text of a SCRIPT element contains `</script` in any letter
+	 *    case, because the rules for escaping it safely are complicated.
+	 *
+	 * Otherwise a replacement for the token's text span is appended to the
+	 * enqueued lexical updates, with the new text prepared as follows:
+	 *
+	 *  - `#text` content is encoded with `htmlspecialchars()`.
+	 *  - HTML comment and SCRIPT content is stored exactly as given.
+	 *  - In STYLE content each `</style`, in any letter case, becomes
+	 *    `\3c\2f` followed by the matched tag name.
+	 *  - In TEXTAREA and TITLE content the `<` which starts a closing tag
+	 *    of that same element becomes `&lt;`.
+	 *
+	 * This method does not rewrite the document by itself; the replacement
+	 * stays among the enqueued updates until those are applied.
+	 *
+	 * Example:
+	 *
+	 *     // Add a preface to all STYLE contents.
+	 *     while ( $processor->next_tag( 'STYLE' ) ) {
+	 *         $style = $processor->get_modifiable_text();
+	 *         $processor->set_modifiable_text( "// Made with love on the World Wide Web\n{$style}" );
+	 *     }
+	 *
+	 *     // Replace smiley text with Emoji smilies.
+	 *     while ( $processor->next_token() ) {
+	 *         if ( '#text' !== $processor->get_token_name() ) {
+	 *             continue;
+	 *         }
+	 *
+	 *         $chunk = $processor->get_modifiable_text();
+	 *         if ( ! str_contains( $chunk, ':)' ) ) {
+	 *             continue;
+	 *         }
+	 *
+	 *         $processor->set_modifiable_text( str_replace( ':)', '🙂', $chunk ) );
+	 *     }
+	 *
+	 * @since 6.7.0
+	 *
+	 * @see WP_HTML_Tag_Processor::get_modifiable_text()
+	 *
+	 * @param string $plaintext_content New text content to represent in the matched token.
+	 *
+	 * @return bool Whether the text was able to update.
+	 */
+	public function set_modifiable_text( string $plaintext_content ): bool {
+		$state = $this->parser_state;
+
+		if ( self::STATE_TEXT_NODE === $state ) {
+			/*
+			 * Text nodes are the only case in which the new content is
+			 * encoded before it is stored.
+			 */
+			$updated_text = htmlspecialchars( $plaintext_content, ENT_QUOTES | ENT_HTML5 );
+		} elseif (
+			self::STATE_COMMENT === $state &&
+			self::COMMENT_AS_HTML_COMMENT === $this->comment_type
+		) {
+			/*
+			 * Comment data is not encoded, which is why text that could
+			 * close the comment has to be refused.
+			 */
+			if ( 1 === preg_match( '/--!?>/', $plaintext_content ) ) {
+				return false;
+			}
+
+			$updated_text = $plaintext_content;
+		} elseif ( self::STATE_MATCHED_TAG === $state ) {
+			$tag_name = $this->get_tag();
+
+			if ( 'SCRIPT' === $tag_name ) {
+				/*
+				 * Over-protective on purpose: a thorough check would have to
+				 * prove that no SCRIPT closer exists and that none is "hidden"
+				 * inside the script double-escaped state.
+				 *
+				 * Rewriting `</script` as `<\/script` may look like a proper
+				 * escape, but it could mask regex patterns that previously
+				 * worked, so such input is rejected instead.
+				 */
+				if ( false !== stripos( $plaintext_content, '</script' ) ) {
+					return false;
+				}
+
+				$updated_text = $plaintext_content;
+			} elseif ( 'STYLE' === $tag_name ) {
+				/*
+				 * A closing STYLE tag inside of the new content would end the
+				 * element early, so its `</` is rewritten as `\3c\2f`; 3C and
+				 * 2F are the hexadecimal codes of `<` and `/`.
+				 */
+				$updated_text = preg_replace_callback(
+					'~</(?P<TAG_NAME>style)~i',
+					static function ( $closer ) {
+						return '\\3c\\2f' . $closer['TAG_NAME'];
+					},
+					$plaintext_content
+				);
+			} elseif ( 'TEXTAREA' === $tag_name || 'TITLE' === $tag_name ) {
+				/*
+				 * The closer of the matched element is escaped as `&lt;/…`.
+				 * Since this content is decoded when read, the escaped form
+				 * is safe, and it can prevent other code from naively
+				 * detecting tags within the contents.
+				 *
+				 * @todo It would be useful to prefix a multiline replacement text
+				 *       with a newline, but not necessary. This is for aesthetics.
+				 */
+				$closer_pattern = "~</(?P<TAG_NAME>{$tag_name})~i";
+				$updated_text   = preg_replace_callback(
+					$closer_pattern,
+					static function ( $closer ) {
+						return '&lt;/' . $closer['TAG_NAME'];
+					},
+					$plaintext_content
+				);
+			} else {
+				// No other element's text can be set by this method.
+				return false;
+			}
+		} else {
+			// Nothing is matched, or the matched token's text cannot be set.
+			return false;
+		}
+
+		// Every accepted update replaces the whole text span of the token.
+		$this->lexical_updates[] = new WP_HTML_Text_Replacement(
+			$this->text_starts_at,
+			$this->text_length,
+			$updated_text
+		);
+
+		return true;
+	}
+
 	/**
 	 * Updates or creates a new attribute on the currently matched tag with the passed value.
 	 *
@@ -2970,7 +3112,14 @@ public function set_attribute( $name, $value ) {
 		if ( true === $value ) {
 			$updated_attribute = $name;
 		} else {
-			$escaped_new_value = esc_attr( $value );
+			$comparable_name = strtolower( $name );
+
+			/*
+			 * Escape URL attributes.
+			 *
+			 * @see https://html.spec.whatwg.org/#attributes-3
+			 */
+			$escaped_new_value = in_array( $comparable_name, wp_kses_uri_attributes() ) ? esc_url( $value ) : esc_attr( $value );
 			$updated_attribute = "{$name}=\"{$escaped_new_value}\"";
 		}
 
@@ -3101,14 +3250,12 @@ public function remove_attribute( $name ) {
 		);
 
 		// Removes any duplicated attributes if they were also present.
-		if ( null !== $this->duplicate_attributes && array_key_exists( $name, $this->duplicate_attributes ) ) {
-			foreach ( $this->duplicate_attributes[ $name ] as $attribute_token ) {
-				$this->lexical_updates[] = new WP_HTML_Text_Replacement(
-					$attribute_token->start,
-					$attribute_token->length,
-					''
-				);
-			}
+		foreach ( $this->duplicate_attributes[ $name ] ?? array() as $attribute_token ) {
+			$this->lexical_updates[] = new WP_HTML_Text_Replacement(
+				$attribute_token->start,
+				$attribute_token->length,
+				''
+			);
 		}
 
 		return true;
@@ -3195,14 +3342,14 @@ public function get_updated_html() {
 		 * Keep track of the position right before the current tag. This will
 		 * be necessary for reparsing the current tag after updating the HTML.
 		 */
-		$before_current_tag = $this->token_starts_at;
+		$before_current_tag = $this->token_starts_at ?? 0;
 
 		/*
 		 * 1. Apply the enqueued edits and update all the pointers to reflect those changes.
 		 */
 		$this->class_name_updates_to_attributes_updates();
-		$before_current_tag += $this->apply_attributes_updates( $before_current_tag );
-
+		$before_current_tag += $this->apply_attributes_updates($before_current_tag);
+		
 		/*
 		 * 2. Rewind to before the current tag and reparse to get updated attributes.
 		 *
@@ -3223,7 +3370,9 @@ public function get_updated_html() {
 		 *                 ↑  │ back up by the length of the tag name plus the opening <
 		 *                 └←─┘ back up by strlen("em") + 1 ==> 3
 		 */
-		$this->bytes_already_parsed = $before_current_tag;
+		if ($this->get_token_type() === '#tag') {
+			$this->bytes_already_parsed = $before_current_tag;
+		}
 		$this->base_class_next_token();
 
 		return $this->html;
@@ -3308,35 +3457,8 @@ private function matches() {
 		}
 
 		// Does the tag name match the requested tag name in a case-insensitive manner?
-		if ( null !== $this->sought_tag_name ) {
-			/*
-			 * String (byte) length lookup is fast. If they aren't the
-			 * same length then they can't be the same string values.
-			 */
-			if ( strlen( $this->sought_tag_name ) !== $this->tag_name_length ) {
-				return false;
-			}
-
-			/*
-			 * Check each character to determine if they are the same.
-			 * Defer calls to `strtoupper()` to avoid them when possible.
-			 * Calling `strcasecmp()` here tested slowed than comparing each
-			 * character, so unless benchmarks show otherwise, it should
-			 * not be used.
-			 *
-			 * It's expected that most of the time that this runs, a
-			 * lower-case tag name will be supplied and the input will
-			 * contain lower-case tag names, thus normally bypassing
-			 * the case comparison code.
-			 */
-			for ( $i = 0; $i < $this->tag_name_length; $i++ ) {
-				$html_char = $this->html[ $this->tag_name_starts_at + $i ];
-				$tag_char  = $this->sought_tag_name[ $i ];
-
-				if ( $html_char !== $tag_char && strtoupper( $html_char ) !== $tag_char ) {
-					return false;
-				}
-			}
+		if ( isset( $this->sought_tag_name ) && ( strlen( $this->sought_tag_name ) !== $this->tag_name_length || 0 !== substr_compare( $this->html, $this->sought_tag_name, $this->tag_name_starts_at, $this->tag_name_length, true ) ) ) {
+			return false;
 		}
 
 		if ( null !== $this->sought_class_name && ! $this->has_class( $this->sought_class_name ) ) {
diff --git a/class-wp-wxr-normalizer.php b/class-wp-wxr-normalizer.php
new file mode 100644
index 0000000..2fbb3b3
--- /dev/null
+++ b/class-wp-wxr-normalizer.php
@@ -0,0 +1,359 @@
+<?php
+
+/**
+ * Walks the XML tokens produced by WP_XML_Processor::stream_tokens() and
+ * passes every URL found inside of `item` elements through a callback
+ * which decides what the URL is replaced with.
+ *
+ * Text and CDATA nodes are first treated as HTML: block comment attributes,
+ * tag attributes, and text nodes are scanned for URLs. Content which yields
+ * no HTML token at all is scanned as plain text instead.
+ */
+class WP_WXR_Normalizer
+{
+
+    /**
+     * Pattern (without delimiters) used to find URLs in plain text.
+     *
+     * @TODO: Investigate how bad this is – would it stand the test of time, or do we need
+     *        a proper URL-matching state machine?
+     */
+    const URL_REGEXP = '\b((?:(https?):\/\/|www\.)[-a-zA-Z0-9@:%._\+\~#=]+(?:\.[a-zA-Z0-9]{2,})+[-a-zA-Z0-9@:%_\+.\~#?&//=]*)\b';
+
+    /**
+     * Bytes skipped as whitespace. The double quotes matter: inside single
+     * quotes `\t` is a backslash followed by a `t`, not a tab.
+     */
+    const WHITESPACE = " \t\f\r\n";
+
+    /**
+     * Bytes accepted in a block name, including the `/` which separates the
+     * namespace from the name in e.g. `wp:my-plugin/my-block`.
+     */
+    const BLOCK_NAME_BYTES = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-/';
+
+    private $input_stream;
+    private $output_stream;
+    private $rewrite_url_callback;
+
+    /**
+     * URLs seen so far, stored as array keys to keep them unique.
+     */
+    private $found_urls = array();
+
+    /**
+     * @param mixed    $input_stream         Passed to WP_XML_Processor::stream_tokens() as the input.
+     * @param mixed    $output_stream        Passed to WP_XML_Processor::stream_tokens() as the output.
+     * @param callable $rewrite_url_callback Receives a matched URL and returns its replacement.
+     */
+    public function __construct(
+        $input_stream,
+        $output_stream,
+        $rewrite_url_callback
+    ) {
+        $this->input_stream = $input_stream;
+        $this->output_stream = $output_stream;
+        $this->rewrite_url_callback = $rewrite_url_callback;
+    }
+
+    /**
+     * @return array List of the distinct URLs matched so far.
+     */
+    public function get_found_urls()
+    {
+        return array_keys($this->found_urls);
+    }
+
+    /**
+     * Rewrites the URLs in every text and CDATA node located inside of an
+     * `item` element.
+     *
+     * @TODO: Consider only matching `item > content:encoded`,
+     *        `item > excerpt:encoded`, and `wp:comment_content`.
+     */
+    public function process()
+    {
+        $tokens = WP_XML_Processor::stream_tokens($this->input_stream, $this->output_stream, 1000000);
+        foreach ($tokens as $processor) {
+            if (!in_array('item', $processor->get_breadcrumbs(), true)) {
+                continue;
+            }
+
+            $token_type = $processor->get_token_type();
+            if ('#text' !== $token_type && '#cdata-section' !== $token_type) {
+                continue;
+            }
+
+            $text = $processor->get_modifiable_text();
+            $updated_text = $this->process_content_node($text);
+            if ($updated_text !== $text) {
+                $processor->set_modifiable_text($updated_text);
+            }
+        }
+    }
+
+    /**
+     * Rewrites the URLs found in the content of a single text or CDATA node.
+     *
+     * @param string $text Content of the node.
+     * @return string Content with its URLs passed through the rewrite callback.
+     */
+    public function process_content_node($text)
+    {
+        $result = $this->process_as_html($text);
+        if (false !== $result) {
+            return $result;
+        }
+
+        // The content produced no HTML token; scan it as plain text.
+        return $this->process_as_plaintext($text);
+    }
+
+    /**
+     * Rewrites the URLs found in block comments, tag attributes, and text
+     * nodes of an HTML fragment.
+     *
+     * @param string $text HTML fragment.
+     * @return string|false Updated HTML, or false when no token was found.
+     */
+    private function process_as_html($text)
+    {
+        $html = new WP_HTML_Tag_Processor($text);
+        if (false === $html->next_token()) {
+            return false;
+        }
+
+        do {
+            switch ($html->get_token_type()) {
+                case '#comment':
+                    $this->process_block_comment($html);
+                    break;
+
+                case '#tag':
+                    $this->process_tag_attributes($html);
+                    break;
+
+                case '#text':
+                    $node_text = $html->get_modifiable_text();
+                    $updated_text = $this->process_as_plaintext($node_text);
+                    if ($updated_text !== $node_text) {
+                        $this->set_modifiable_html_text($html, $updated_text);
+                    }
+                    break;
+            }
+        } while ($html->next_token());
+
+        return $html->get_updated_html();
+    }
+
+    /**
+     * Rewrites the URLs in the JSON attributes of a block delimiter comment
+     * such as `<!-- wp:image {"url":"https://example.com/a.png"} -->`.
+     *
+     * The block parser won't cut it here because, while it can parse blocks,
+     * it has no semantics for rewriting the block markup. Comments which
+     * don't look like a block opener with attributes are left untouched.
+     *
+     * @param WP_HTML_Tag_Processor $html Processor paused on a comment token.
+     */
+    private function process_block_comment(WP_HTML_Tag_Processor $html)
+    {
+        $comment = $html->get_modifiable_text();
+
+        $at = strspn($comment, self::WHITESPACE);
+        if ('wp:' !== substr($comment, $at, 3)) {
+            return;
+        }
+        $at += 3;
+        $at += strspn($comment, self::BLOCK_NAME_BYTES, $at);
+        $at += strspn($comment, self::WHITESPACE, $at);
+        if ($at >= strlen($comment)) {
+            // There are no attributes, or this isn't a block.
+            return;
+        }
+
+        /*
+         * Whatever follows the block name may be JSON attributes. Trailing
+         * whitespace and the `/` of a void block are set aside and restored
+         * afterwards, because a block delimiter is expected to keep
+         * whitespace between its attributes and the end of the comment.
+         */
+        $tail = substr($comment, $at);
+        $json_length = strlen(rtrim($tail, self::WHITESPACE . '/'));
+        $attributes = json_decode(substr($tail, 0, $json_length), true);
+        if (null === $attributes) {
+            // This wasn't a block after all.
+            return;
+        }
+
+        $new_attributes = $this->process_block_attributes($attributes);
+        if ($new_attributes === $attributes) {
+            // Nothing was rewritten; keep the original bytes of the comment.
+            return;
+        }
+
+        $encoded = json_encode($new_attributes, JSON_HEX_TAG | JSON_HEX_AMP);
+        if (false === $encoded) {
+            return;
+        }
+
+        $this->set_modifiable_html_text(
+            $html,
+            substr($comment, 0, $at) . $encoded . substr($tail, $json_length)
+        );
+    }
+
+    /**
+     * Rewrites the URLs found in the attribute values of the matched tag.
+     *
+     * @param WP_HTML_Tag_Processor $html Processor paused on a tag token.
+     */
+    private function process_tag_attributes(WP_HTML_Tag_Processor $html)
+    {
+        $attribute_names = $html->get_attribute_names_with_prefix('');
+        if (!$attribute_names) {
+            return;
+        }
+
+        foreach ($attribute_names as $attribute_name) {
+            $value = $html->get_attribute($attribute_name);
+            if (!is_string($value)) {
+                // Non-string values (boolean attributes are expected to yield
+                // `true`) must not be rewritten into e.g. `disabled="1"`.
+                continue;
+            }
+
+            $updated = $this->process_as_plaintext($value);
+            if ($updated !== $value) {
+                $html->set_attribute($attribute_name, $updated);
+            }
+        }
+    }
+
+    /**
+     * Recursively rewrites the URLs in every string of a decoded block
+     * attributes structure. Non-string scalars are returned unchanged.
+     *
+     * @param mixed $attributes Decoded JSON value.
+     * @return mixed The same structure with rewritten strings.
+     */
+    private function process_block_attributes($attributes)
+    {
+        if (is_string($attributes)) {
+            return $this->process_as_plaintext($attributes);
+        }
+
+        if (!is_array($attributes)) {
+            return $attributes;
+        }
+
+        $new_attributes = array();
+        foreach ($attributes as $key => $value) {
+            $new_attributes[$key] = $this->process_block_attributes($value);
+        }
+        return $new_attributes;
+    }
+
+    /**
+     * Passes every URL matched in a plain text through the rewrite callback
+     * and remembers the matched URLs.
+     *
+     * @param string $text Text to scan.
+     * @return string Updated text; the input is returned as is when the
+     *                pattern fails to run.
+     */
+    private function process_as_plaintext($text)
+    {
+        $result = preg_replace_callback(
+            '~'.self::URL_REGEXP.'~',
+            function ($matches) {
+                $this->found_urls[$matches[0]] = true;
+                $replacer = $this->rewrite_url_callback;
+                return $replacer($matches[0]);
+            },
+            $text
+        );
+
+        // preg_replace_callback() returns null on error, e.g. when a PCRE
+        // limit is hit. Never let that wipe out the content.
+        return null === $result ? $text : $result;
+    }
+
+    /**
+     * Replaces the text of the matched `#text`, `#comment`, or
+     * `#cdata-section` token of an HTML processor.
+     *
+     * The replacement is pushed directly onto the processor's non-public
+     * `lexical_updates` list via reflection.
+     *
+     * @TODO: WP_HTML_Tag_Processor::set_modifiable_text() may be able to
+     *        replace this workaround; note that it escapes text nodes with
+     *        different flags and does not handle CDATA-lookalike sections.
+     *
+     * @param WP_HTML_Tag_Processor $p         Processor paused on the token to update.
+     * @param string                $new_value New, unescaped text.
+     * @return bool Whether the update was enqueued.
+     */
+    private function set_modifiable_html_text(WP_HTML_Tag_Processor $p, $new_value)
+    {
+        switch ($p->get_token_type()) {
+            case '#text':
+                $replacement = htmlspecialchars($new_value, ENT_XML1, 'UTF-8');
+                break;
+
+            case '#comment':
+                if (
+                    strpos($new_value, '-->') !== false ||
+                    strpos($new_value, '--!>') !== false
+                ) {
+                    _doing_it_wrong(
+                        __METHOD__,
+                        __( 'Cannot set a comment closer as a text of an HTML comment.' ),
+                        'WP_VERSION'
+                    );
+                    return false;
+                }
+                $replacement = $new_value;
+                break;
+
+            case '#cdata-section':
+                if (strpos($new_value, '>') !== false) {
+                    _doing_it_wrong(
+                        __METHOD__,
+                        __( 'Cannot set a CDATA closer as text of an HTML CDATA-lookalike section.' ),
+                        'WP_VERSION'
+                    );
+                    return false;
+                }
+                $replacement = $new_value;
+                break;
+
+            default:
+                _doing_it_wrong(
+                    __METHOD__,
+                    __( 'Cannot set text content on a non-text node.' ),
+                    'WP_VERSION'
+                );
+                return false;
+        }
+
+        $text_starts_at = new ReflectionProperty('WP_HTML_Tag_Processor', 'text_starts_at');
+        $text_starts_at->setAccessible(true);
+
+        $text_length = new ReflectionProperty('WP_HTML_Tag_Processor', 'text_length');
+        $text_length->setAccessible(true);
+
+        $lexical_updates = new ReflectionProperty('WP_HTML_Tag_Processor', 'lexical_updates');
+        $lexical_updates->setAccessible(true);
+
+        $pending_updates = $lexical_updates->getValue($p);
+        $pending_updates[] = new WP_HTML_Text_Replacement(
+            $text_starts_at->getValue($p),
+            $text_length->getValue($p),
+            $replacement
+        );
+        $lexical_updates->setValue($p, $pending_updates);
+
+        return true;
+    }
+}
diff --git a/class-wp-xml-processor.php b/class-wp-xml-processor.php
index a18e35f..9156f8f 100644
--- a/class-wp-xml-processor.php
+++ b/class-wp-xml-processor.php
@@ -88,6 +88,28 @@ public static function stream_tokens( $input_stream, $output_stream, $buffer_siz
 		}
 	}
 
+	/**
+	 * Discards the XML which was already processed and continues with the
+	 * unprocessed remainder followed by the given chunk.
+	 *
+	 * @param string $next_chunk XML to append.
+	 */
+	public function stream_append_xml( $next_chunk ) {
+		// Presumably flushes pending edits before the remainder is read – TODO confirm.
+		$this->get_updated_xml();
+
+		$remaining_xml = $this->get_unprocessed_xml();
+		$open_elements = $this->get_breadcrumbs();
+		$saved_context = $this->get_parser_context();
+
+		$this->reset_state();
+
+		$this->xml                    = $remaining_xml . $next_chunk;
+		$this->stack_of_open_elements = $open_elements;
+		$this->parser_context         = $saved_context;
+		$this->had_previous_chunks    = true;
+	}
+
 	/**
 	 * Constructor.
 	 *
diff --git a/class-wp-xml-tag-processor.php b/class-wp-xml-tag-processor.php
index db06e54..45ea8f4 100644
--- a/class-wp-xml-tag-processor.php
+++ b/class-wp-xml-tag-processor.php
@@ -337,7 +337,7 @@ class WP_XML_Tag_Processor {
 	 * @since WP_VERSION
 	 * @var string
 	 */
-	protected $xml;
+	public $xml;
 
 	/**
 	 * The last query passed to next_tag().
@@ -428,7 +428,7 @@ class WP_XML_Tag_Processor {
 	 * @since WP_VERSION
 	 * @var int
 	 */
-	private $bytes_already_parsed = 0;
+	public $bytes_already_parsed = 0;
 
 	/**
 	 * Byte offset in input document where current token starts.
@@ -1751,6 +1751,31 @@ private function after_tag() {
 		$this->is_closing_tag          = null;
 		$this->attributes              = array();
 	}
+	
+	/**
+	 * Clears the document buffer together with all query, token, bookmark,
+	 * pending-update, error and seek bookkeeping held by this processor.
+	 */
+	protected function reset_state() {
+		$this->xml                     = '';
+		$this->bytes_already_parsed    = 0;
+		$this->parser_state            = self::STATE_READY;
+		$this->is_incomplete_text_node = false;
+		$this->had_previous_chunks     = false;
+		$this->last_error              = null;
+		$this->seek_count              = 0;
+		// Query bookkeeping.
+		$this->last_query          = null;
+		$this->sought_tag_name     = null;
+		$this->sought_match_offset = 0;
+		$this->stop_on_tag_closers = false;
+		// Current-token bookkeeping.
+		$this->token_starts_at    = $this->token_length    = null;
+		$this->tag_name_starts_at = $this->tag_name_length = null;
+		$this->text_starts_at     = $this->text_length     = null;
+		$this->is_closing_tag     = null;
+		$this->attributes = $this->bookmarks = $this->lexical_updates = array();
+	}
 
 	/**
 	 * Applies attribute updates to XML document.
diff --git a/functions.php b/functions.php
new file mode 100644
index 0000000..3373d43
--- /dev/null
+++ b/functions.php
@@ -0,0 +1,65 @@
+<?php
+
+use \WordPress\AsyncHttp\Client;
+use \WordPress\AsyncHttp\Request;
+
+/**
+ * Downloads every asset through the async HTTP client, appending each body chunk to its local file.
+ * @param array $options 'assets' maps asset URL => local file path; the optional
+ *                       'concurrency' is passed to the client (default 10).
+ * @return array Map of original request URL => [ 'success' => bool ], plus the
+ *               request's 'error' when the download failed.
+ */
+function wxr_download_files($options) {
+	$requests = [];
+	$local_paths = [];
+	foreach ($options['assets'] as $asset_url => $local_file) {
+		$request = new Request($asset_url);
+		$requests[] = $request;
+		$local_paths[$request->id] = $local_file;
+	}
+
+	// Honor the caller's 'concurrency' option and only fall back to 10 when it is absent.
+	$client = new Client( [ 'concurrency' => $options['concurrency'] ?? 10 ] );
+	$client->enqueue( $requests );
+
+	$results = [];
+	while ( $client->await_next_event() ) {
+		$request = $client->get_request();
+		switch ( $client->get_event() ) {
+			case Client::EVENT_BODY_CHUNK_AVAILABLE:
+				$local_path = $local_paths[$request->original_request()->id];
+				file_put_contents( $local_path, $client->get_response_body_chunk(), FILE_APPEND );
+				break;
+			case Client::EVENT_FAILED:
+				$results[$request->original_request()->url] = [
+					'success' => false,
+					'error' => $request->error,
+				];
+				break;
+			case Client::EVENT_FINISHED:
+				$results[$request->original_request()->url] = [ 'success' => true ];
+				break;
+		}
+	}
+	return $results;
+}
+
+/**
+ * WordPress compat: escapes text for use inside a quoted XML attribute value.
+ */
+if(!function_exists('esc_attr')) {
+    function esc_attr($text) {
+        return htmlspecialchars($text, ENT_XML1 | ENT_QUOTES, 'UTF-8'); // ENT_XML1 alone escapes neither " nor '.
+    }
+}
+
+/** Rebuilds a URL string from a parse_url()-style array, skipping absent components. */
+function serialize_url($parsedUrl) {
+    return (isset($parsedUrl['scheme']) ? $parsedUrl['scheme'] . '://' : '')
+            . (isset($parsedUrl['user']) ? $parsedUrl['user'] . (isset($parsedUrl['pass']) ? ':' . $parsedUrl['pass'] : '') .'@' : '')
+            . ($parsedUrl['host'] ?? '') // Guarded: a host-less array must not raise an undefined-key warning.
+            . (isset($parsedUrl['port']) ? ':' . $parsedUrl['port'] : '') . ($parsedUrl['path'] ?? '')
+            . (isset($parsedUrl['query']) ? '?' . $parsedUrl['query'] : '')
+            . (isset($parsedUrl['fragment']) ? '#' . $parsedUrl['fragment'] : '');
+}
diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php
new file mode 100644
index 0000000..b57e6ca
--- /dev/null
+++ b/rewrite-remote-wxr.php
@@ -0,0 +1,175 @@
+<?php
+/**
+ * Rewrites site URLs in a remote WXR file.
+ * 
+ * It pipes and stream-processes data as follows:
+ * 
+ * AsyncHttp\Client -> WP_XML_Processor -> WP_Block_Markup_Url_Processor -> WP_Migration_URL_In_Text_Processor -> WP_URL
+ * 
+ * The layers of data we're handling here are:
+ * 
+ * * AsyncHttp\Client: HTTPS encrypted data -> Chunked encoding -> Gzip compression
+ * * WP_XML_Processor: XML (entities, attributes, text, comments, CDATA nodes)
+ * * WP_Block_Markup_Url_Processor: HTML (entities, attributes, text, comments, block comments), JSON (in block comments)
+ * * WP_Migration_URL_In_Text_Processor: URLs in text nodes
+ * * WP_URL: URL parsing and serialization
+ * 
+ * It wouldn't be difficult to pipe through additional layers such as:
+ * 
+ * * Reading from a remote ZIP file
+ * * Writing to a local ZIP-ped XML file
+ * * Writing to a database
+ * 
+ * ...etc.
+ */
+ 
+require __DIR__ . '/bootstrap.php';
+
+use \WordPress\AsyncHttp\Client;
+use \WordPress\AsyncHttp\Request;
+
+$wxr_url = "https://raw.githubusercontent.com/WordPress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/woo-products.wxr";
+
+// The rewrite settings never change, so parse them once rather than once per text node.
+$string_new_site_url     = 'https://mynew.site/';
+$parsed_new_site_url     = WP_URL::parse( $string_new_site_url );
+$current_site_url        = 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/';
+$parsed_current_site_url = WP_URL::parse( $current_site_url );
+$base_url                = 'https://playground.internal';
+
+$xml_processor = new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT);
+foreach( stream_remote_file( $wxr_url ) as $chunk ) {
+    $xml_processor->stream_append_xml($chunk);
+    foreach ( xml_next_content_node_for_rewriting( $xml_processor ) as $text ) {
+        $url_processor = new WP_Block_Markup_Url_Processor( $text, $base_url );
+        foreach ( html_next_url( $url_processor, $current_site_url ) as $parsed_matched_url ) {
+            $updated_raw_url = rewrite_url(
+                $url_processor->get_raw_url(),
+                $parsed_matched_url,
+                $parsed_current_site_url,
+                $parsed_new_site_url
+            );
+            $url_processor->set_raw_url( $updated_raw_url );
+        }
+
+        $updated_text = $url_processor->get_updated_html();
+        if ($updated_text !== $text) { // Leave the XML token untouched when no URL changed.
+            $xml_processor->set_modifiable_text($updated_text);
+        }
+    }
+    echo $xml_processor->get_processed_xml(); // Emit output after every chunk instead of buffering the whole file.
+}
+echo $xml_processor->get_unprocessed_xml(); // Emit whatever is still unprocessed after the last chunk.
+
+// The rest of this file contains the functions used by the code above
+
+/**
+ * Streams a remote file, yielding each response body chunk as it arrives;
+ * every other client event is ignored.
+ * @param string $url URL of the file to fetch.
+ */
+function stream_remote_file($url)
+{
+    $request = new Request($url);
+    $client = new Client();
+    $client->enqueue([$request]);
+    while ($client->await_next_event()) {
+        if ($client->get_event() == Client::EVENT_BODY_CHUNK_AVAILABLE) {
+            yield $client->get_response_body_chunk();
+        }
+    }
+}
+
+/**
+ * Yields the text of every #text / #cdata-section token whose breadcrumbs
+ * contain 'item' plus at least one of the rewritable element names below.
+ *
+ * The generator is suspended at the yielded token, so the consumer can act on
+ * the processor (e.g. call set_modifiable_text()) before iteration resumes.
+ *
+ * @param WP_XML_Processor $processor Processor to advance with next_token().
+ * @return Generator Yields the modifiable text of each matching token.
+ */
+function xml_next_content_node_for_rewriting(WP_XML_Processor $processor) {
+    // Meta values (wp:postmeta) are not supported yet. We'll need to support
+    // WordPress core options that may be saved as JSON, serialized PHP, and XML,
+    // and then provide extension points for plugin authors to support
+    // their own options.
+    $rewritable = [
+        'excerpt:encoded', 'content:encoded', 'wp:attachment_url',
+        'guid', 'link', 'wp:comment_content',
+    ];
+    while($processor->next_token()) {
+        // Fetch the breadcrumbs once per token rather than once per check.
+        $breadcrumbs = $processor->get_breadcrumbs();
+        if (!in_array('item', $breadcrumbs, true) || !array_intersect($rewritable, $breadcrumbs)) {
+            continue;
+        }
+        if (in_array($processor->get_token_type(), ['#text', '#cdata-section'], true)) {
+            yield $processor->get_modifiable_text();
+        }
+    }
+}
+
+/**
+ * Yields each URL found by the processor whose hostname equals the current
+ * site's hostname and whose URL-decoded path starts with the site's path.
+ *
+ * @param WP_Block_Markup_Url_Processor $p                Processor to advance with next_url().
+ * @param string                        $current_site_url Site URL to match against.
+ * @return Generator Yields the parsed URL of every match.
+ */
+function html_next_url(WP_Block_Markup_Url_Processor $p, $current_site_url) {
+	$site_url  = WP_URL::parse( $current_site_url );
+	$site_path = urldecode( $site_url->pathname );
+
+	while ( $p->next_url() ) {
+		$candidate = $p->get_parsed_url();
+		if ( $candidate->hostname !== $site_url->hostname ) {
+			continue;
+		}
+		// Both paths are URL-decoded before the prefix comparison.
+		if ( str_starts_with( urldecode( $candidate->pathname ), $site_path ) ) {
+			yield $candidate;
+		}
+	}
+}
+
+/**
+ * Moves a matched URL from the current site to the new site.
+ * The hostname is always replaced. Unless the current site's pathname is "/",
+ * the current site's path prefix is also swapped for the new site's pathname.
+ * NOTE: $parsed_matched_url is modified in place.
+ *
+ * @param string $raw_matched_url         The matched URL in its raw form; only its last character is inspected.
+ * @param object $parsed_matched_url      Parsed form of the matched URL.
+ * @param object $parsed_current_site_url Parsed URL of the site being migrated from.
+ * @param object $parsed_new_site_url     Parsed URL of the site being migrated to.
+ * @return string The rewritten URL.
+ */
+function rewrite_url(
+    string $raw_matched_url,
+    $parsed_matched_url,
+    $parsed_current_site_url,
+    $parsed_new_site_url,
+) {
+    $parsed_matched_url->hostname = $parsed_new_site_url->hostname;
+
+    // When the current site's pathname is "/", the matched path is left untouched.
+    if ('/' !== $parsed_current_site_url->pathname) {
+        $current_prefix_length = strlen(urldecode($parsed_current_site_url->pathname));
+        $parsed_matched_url->pathname = $parsed_new_site_url->pathname
+            . substr(urldecode($parsed_matched_url->pathname), $current_prefix_length);
+    }
+
+    /*
+     * Stylistic choice – if the matched URL has no trailing slash, do not add
+     * one. The WHATWG URL parser adds it automatically when the path is empty,
+     * so it has to be removed explicitly. str_ends_with() also copes with an
+     * empty raw URL, where indexing the last byte would raise a warning.
+     */
+    $new_raw_url = $parsed_matched_url->toString();
+    $is_bare_origin = '/' === $parsed_matched_url->pathname
+        && '' === $parsed_matched_url->search && '' === $parsed_matched_url->hash;
+    return ( $is_bare_origin && ! str_ends_with($raw_matched_url, '/') ) ? rtrim($new_raw_url, '/') : $new_raw_url;
+}
diff --git a/rewrite-wxr.php b/rewrite-wxr.php
index 3bfa0d1..f1fe965 100644
--- a/rewrite-wxr.php
+++ b/rewrite-wxr.php
@@ -25,51 +25,8 @@
  * [2] ZipStreamWriter: https://github.com/WordPress/blueprints-library/blob/f9fcb5816ab6def0920b25787341342bc88803e3/src/WordPress/Zip/ZipStreamWriter.php
  * [3] AsyncHttpClient: https://github.com/WordPress/blueprints-library/blob/trunk/src/WordPress/AsyncHttp/Client.php
  */
-
-use \WordPress\AsyncHttp\Client;
-use \WordPress\AsyncHttp\Request;
  
-// Where to find the streaming WP_XML_Processor 
-// Use a version from this PR: https://github.com/adamziel/wordpress-develop/pull/43
-define('WP_XML_API_PATH', __DIR__ );
-define('BLUEPRINTS_LIB_PATH', __DIR__ . '/blueprints-library/src/WordPress' );
-if(!file_exists(WP_XML_API_PATH . '/class-wp-token-map.php')) {
-    copy(WP_XML_API_PATH.'/../class-wp-token-map.php', WP_XML_API_PATH . '/class-wp-token-map.php');
-}
-
-$requires[] = WP_XML_API_PATH . "/class-wp-html-token.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-span.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-text-replacement.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-decoder.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-attribute-token.php";
-
-$requires[] = WP_XML_API_PATH . "/class-wp-html-decoder.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-tag-processor.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-open-elements.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-token-map.php";
-$requires[] = WP_XML_API_PATH . "/html5-named-character-references.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-active-formatting-elements.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-processor-state.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-unsupported-exception.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-processor.php";
-
-$requires[] = WP_XML_API_PATH . "/class-wp-xml-decoder.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-xml-tag-processor.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-xml-processor.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamWrapperInterface.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamWrapper.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamPeekerWrapper.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Request.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Response.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/HttpError.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Connection.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Client.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/StreamWrapper/ChunkedEncodingWrapper.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/StreamWrapper/InflateStreamWrapper.php";
-
-foreach ($requires as $require) {
-    require_once $require;
-}
+require __DIR__ . '/bootstrap.php';
 
 if (!Phar::running() && in_array('--bundle', $argv)) {
     bundlePhar('preprocess-wxr.phar', array_merge(
@@ -198,7 +155,7 @@ function ($url) { return $url; }
         $url_to_path[$url] = $details['download_path'];
     }
 }
-wp_download_files([
+wxr_download_files([
     'concurrency' => 10,
     'assets' => $url_to_path
 ]);
@@ -227,297 +184,3 @@ function ($url) use($assets_details) {
 $normalizer->process();
 fclose($input_stream);
 fclose($output_stream);
-
-function wp_download_files($options) {
-	$requests = [];
-	$local_paths = [];
-	foreach ($options['assets'] as $asset_url => $local_file) {
-		$request = new Request($asset_url);
-		$requests[] = $request;
-		$local_paths[$request->id] = $local_file;
-	}
-
-	$client = new Client( [
-		'concurrency' => 10,
-	] );
-	$client->enqueue( $requests );
-
-	$results = [];
-	while ( $client->await_next_event() ) {
-		$request = $client->get_request();
-		
-		switch ( $client->get_event() ) {
-			case Client::EVENT_BODY_CHUNK_AVAILABLE:
-				file_put_contents(
-					$local_paths[$request->original_request()->id],
-					$client->get_response_body_chunk(),
-					FILE_APPEND
-				);
-				break;
-			case Client::EVENT_FAILED:
-				$results[$request->original_request()->url] = [
-					'success' => false,
-					'error' => $request->error,
-				];
-				break;
-			case Client::EVENT_FINISHED:
-				$results[$request->original_request()->url] = [
-					'success' => true
-				];
-				break;
-		}
-	}
-	return $results;
-}
-
-/**
- * WordPress compat
- */
-function esc_attr($text) {
-    return htmlspecialchars($text, ENT_XML1, 'UTF-8');
-}
-
-function serialize_url($parsedUrl) {
-    return (isset($parsedUrl['scheme']) ? $parsedUrl['scheme'] . '://' : '')
-            . (isset($parsedUrl['user']) ? $parsedUrl['user'] . (isset($parsedUrl['pass']) ? ':' . $parsedUrl['pass'] : '') .'@' : '')
-            . $parsedUrl['host']
-            . (isset($parsedUrl['port']) ? ':' . $parsedUrl['port'] : '')
-            . (isset($parsedUrl['path']) ? $parsedUrl['path'] : '')
-            . (isset($parsedUrl['query']) ? '?' . $parsedUrl['query'] : '')
-            . (isset($parsedUrl['fragment']) ? '#' . $parsedUrl['fragment'] : '');
-}
-
-class WP_WXR_Normalizer
-{
-
-    private $input_stream;
-    private $output_stream;
-    private $rewrite_url_callback;
-
-    private $found_urls = array();
-
-    public function __construct(
-        $input_stream,
-        $output_stream,
-        $rewrite_url_callback
-    ) {
-        $this->input_stream = $input_stream;
-        $this->output_stream = $output_stream;
-        $this->rewrite_url_callback = $rewrite_url_callback;
-    }
-
-    public function get_found_urls()
-    {
-        return array_keys($this->found_urls);        
-    }
-
-    public function process()
-    {
-        $tokens = WP_XML_Processor::stream_tokens($this->input_stream, $this->output_stream, 1000000);
-        foreach ($tokens as $processor) {
-            if (
-                in_array('item', $processor->get_breadcrumbs())
-                // $processor->matches_breadcrumbs(array('item', 'content:encoded')) ||
-                // $processor->matches_breadcrumbs(array('item', 'excerpt:encoded')) ||
-                // $processor->matches_breadcrumbs(array('wp:comment_content'))
-            ) {
-                switch ($processor->get_token_type()) {
-                    case '#text':
-                    case '#cdata-section':
-                        $text = $processor->get_modifiable_text();
-                        $updated_text = $this->process_content_node($text);
-                        if ($updated_text !== $text) {
-                            $processor->set_modifiable_text($updated_text);
-                        }
-                        break;
-                }
-            }
-        }
-    }
-
-    private function process_content_node($text)
-    {
-        $result = $this->process_as_html($text);
-        if(false !== $result) {
-            return $result;
-        }
-
-        $result = $this->process_as_plaintext($text);
-        if(false !== $result) {
-            return $result;
-        }
-
-        return false;
-    }
-
-    private function process_as_html($text) {
-        $html = new WP_HTML_Tag_Processor($text);
-        if(false === $html->next_token()) {
-            return false;
-        }
-
-        do {
-            switch($html->get_token_type()) {
-                case '#comment':
-                    $text = $html->get_modifiable_text();
-                    // Try to parse as a block. The block parser won't cut it because
-                    // while it can parse blocks, it has no semantics for rewriting the
-                    // block markup. Let's do our best here:
-                    $at = strspn($text, ' \t\f\r\n'); // Whitespace
-                    if(!(
-                        $at + 3 < strlen($text) &&
-                        $text[$at] === 'w' &&
-                        $text[$at+1] === 'p' &&
-                        $text[$at+2] === ':'
-                    )) {
-                        break;
-                    }
-                    $at += 3;
-                    $at += strspn($text, 'abcdefghijklmnopqrstuwxvyzABCDEFGHIJKLMNOPRQSTUWXVYZ0123456789_-', $at); // Block name
-                    $at += strspn($text, ' \t\f\r\n', $at); // Whitespace again
-                    if($at >= strlen($text)) {
-                        // Oh, there were no attributes or this wasn't a block
-                        // Either way, we have nothing more to do here.
-                        break;
-                    }
-
-                    // It seems we may have block attributes here. Let's try to
-                    // parse them as JSON.
-                    $json_maybe = substr($text, $at);
-                    $attributes = json_decode($json_maybe, true);
-                    if(null === $attributes) {
-                        // This wasn't a block after all, let's move on
-                        break;
-                    }
-
-                    // This is a block! Let's process all block attributes and rewrite them
-                    $new_attributes = $this->process_block_attributes($attributes);
-                    $this->set_modifiable_html_text(
-                        $html,
-                        substr($text, 0, $at) . json_encode($new_attributes, JSON_HEX_TAG | JSON_HEX_AMP)
-                    );
-                    break;
-
-                case '#tag':
-                    $attributes = $html->get_attribute_names_with_prefix('');
-                    if(!$attributes) {
-                        break;
-                    }
-                    foreach($attributes as $attribute_name) {
-                        $value = $html->get_attribute($attribute_name);
-                        $updated = $this->process_as_plaintext($value);
-                        if($updated !== $value) {
-                            $html->set_attribute($attribute_name, $updated);
-                        }
-                    }
-                    break;
-                case '#text':
-                    $text = $html->get_modifiable_text();
-                    $updated_text = $this->process_as_plaintext($text);
-                    if($updated_text !== $text) {
-                        $this->set_modifiable_html_text($html, $updated_text);
-                    }
-                    break;
-            }
-        } while($html->next_token());
-
-        return $html->get_updated_html();
-    }
-
-    private function process_block_attributes($attributes) {
-        if(is_string($attributes)) {
-            return $this->process_as_plaintext($attributes);
-        } else if(is_array($attributes)) {
-            $new_attributes = array();
-            foreach($attributes as $key => $value) {
-                $new_attributes[$key] = $this->process_block_attributes($value);
-            }
-            return $new_attributes;
-        } else {
-            return $attributes;
-        }
-    }
-
-    /**
-     * @TODO: Investigate how bad this is – would it stand the test of time, or do we need
-     *        a proper URL-matching state machine?
-     */
-    const URL_REGEXP = '\b((?:(https?):\/\/|www\.)[-a-zA-Z0-9@:%._\+\~#=]+(?:\.[a-zA-Z0-9]{2,})+[-a-zA-Z0-9@:%_\+.\~#?&//=]*)\b';
-    private function process_as_plaintext($text) {
-        return preg_replace_callback(
-            '~'.self::URL_REGEXP.'~',
-            function ($matches) {
-                $this->found_urls[$matches[0]] = true;
-                $replacer = $this->rewrite_url_callback;
-                return $replacer($matches[0]);
-            },
-            $text
-        );
-    }
-
-    private function set_modifiable_html_text(WP_HTML_Tag_Processor $p, $new_value) {
-        $reflection = new ReflectionClass('WP_HTML_Tag_Processor');
-        $accessible_text_starts_at = $reflection->getProperty('text_starts_at');
-        $accessible_text_starts_at->setAccessible(true);
-    
-        $accessible_text_length = $reflection->getProperty('text_length');
-        $accessible_text_length->setAccessible(true);
-    
-        $lexical_updates = $reflection->getProperty('lexical_updates');
-        $lexical_updates->setAccessible(true);
-    
-        switch ( $p->get_token_type() ) {
-            case '#text':
-                $lexical_updates_now = $lexical_updates->getValue($p);
-                $lexical_updates_now[] = new WP_HTML_Text_Replacement(
-                    $accessible_text_starts_at->getValue($p),
-                    $accessible_text_length->getValue($p),
-                    htmlspecialchars( $new_value, ENT_XML1, 'UTF-8' )
-                );
-                $lexical_updates->setValue($p, $lexical_updates_now);
-                return true;
-    
-            case '#comment':
-            case '#cdata-section':
-                if(
-                    $p->get_token_type() === '#comment' && (
-                        strpos($new_value, '-->') !== false ||
-                        strpos($new_value, '--!>') !== false
-                    )
-                ) {
-                    _doing_it_wrong(
-                        __METHOD__,
-                        __( 'Cannot set a comment closer as a text of an HTML comment.' ),
-                        'WP_VERSION'
-                    );
-                    return false;
-                }
-                if(
-                    $p->get_token_type() === '#cdata-section' && 
-                    strpos($new_value, '>') !== false 
-                ) {
-                    _doing_it_wrong(
-                        __METHOD__,
-                        __( 'Cannot set a CDATA closer as text of an HTML CDATA-lookalike section.' ),
-                        'WP_VERSION'
-                    );
-                    return false;
-                }
-                $lexical_updates_now = $lexical_updates->getValue($p);
-                $lexical_updates_now[] = new WP_HTML_Text_Replacement(
-                    $accessible_text_starts_at->getValue($p),
-                    $accessible_text_length->getValue($p),
-                    $new_value
-                );
-                $lexical_updates->setValue($p, $lexical_updates_now);
-                return true;
-            default:
-                _doing_it_wrong(
-                    __METHOD__,
-                    __( 'Cannot set text content on a non-text node.' ),
-                    'WP_VERSION'
-                );
-                return false;
-        }
-    }
-}
diff --git a/site-transfer-protocol b/site-transfer-protocol
new file mode 160000
index 0000000..3486d67
--- /dev/null
+++ b/site-transfer-protocol
@@ -0,0 +1 @@
+Subproject commit 3486d676a5fa2a76b719e3e6159a80e3013bed8a
diff --git a/test-data/woo-products.wxr b/test-data/woo-products.wxr
index 33937b9..034049d 100644
--- a/test-data/woo-products.wxr
+++ b/test-data/woo-products.wxr
@@ -2051,7 +2051,6 @@
   <wp:comment_status>open</wp:comment_status>
   <wp:ping_status>closed</wp:ping_status>
   <wp:post_name>album</wp:post_name>
-  <wp:status>publish</wp:status>
   <wp:post_parent>0</wp:post_parent>
   <wp:menu_order>0</wp:menu_order>
   <wp:post_type>product</wp:post_type>