diff --git a/bootstrap.php b/bootstrap.php
new file mode 100644
index 0000000..1764edc
--- /dev/null
+++ b/bootstrap.php
@@ -0,0 +1,53 @@
+' )`.
+ * style of including JavaScript inside of HTML comments to avoid accidentally
+ * closing the SCRIPT from inside a JavaScript string. E.g. `console.log( '</script>' )`.
* - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any
* character references are decoded. E.g. `1 &lt; 2 < 3` becomes `1 < 2 < 3`.
* - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAME`, `STYLE` whose contents are treated as
@@ -1524,21 +1524,10 @@ private function parse_next_tag() {
$was_at = $this->bytes_already_parsed;
$at = $was_at;
- while ( false !== $at && $at < $doc_length ) {
+ while ( $at < $doc_length ) {
$at = strpos( $html, '<', $at );
-
- /*
- * This does not imply an incomplete parse; it indicates that there
- * can be nothing left in the document other than a #text node.
- */
if ( false === $at ) {
- $this->parser_state = self::STATE_TEXT_NODE;
- $this->token_starts_at = $was_at;
- $this->token_length = strlen( $html ) - $was_at;
- $this->text_starts_at = $was_at;
- $this->text_length = $this->token_length;
- $this->bytes_already_parsed = strlen( $html );
- return true;
+ break;
}
if ( $at > $was_at ) {
@@ -1554,19 +1543,9 @@ private function parse_next_tag() {
*
* @see https://html.spec.whatwg.org/#tag-open-state
*/
- if ( strlen( $html ) > $at + 1 ) {
- $next_character = $html[ $at + 1 ];
- $at_another_node = (
- '!' === $next_character ||
- '/' === $next_character ||
- '?' === $next_character ||
- ( 'A' <= $next_character && $next_character <= 'Z' ) ||
- ( 'a' <= $next_character && $next_character <= 'z' )
- );
- if ( ! $at_another_node ) {
- ++$at;
- continue;
- }
+ if ( 1 !== strspn( $html, '!/?abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1, 1 ) ) {
+ ++$at; // The byte after `<` is not `!`, `/`, `?` or an ASCII letter (the list must cover all of A-Z, including `D`): skip this `<` and keep scanning.
+ continue;
}
$this->parser_state = self::STATE_TEXT_NODE;
@@ -1630,11 +1609,7 @@ private function parse_next_tag() {
* `') !== false ||
+ strpos($new_value, '--!>') !== false
+ )
+ ) {
+ _doing_it_wrong(
+ __METHOD__,
+ __( 'Cannot set a comment closer as a text of an HTML comment.' ),
+ 'WP_VERSION'
+ );
+ return false;
+ }
+ if(
+ $p->get_token_type() === '#cdata-section' &&
+ strpos($new_value, '>') !== false
+ ) {
+ _doing_it_wrong(
+ __METHOD__,
+ __( 'Cannot set a CDATA closer as text of an HTML CDATA-lookalike section.' ),
+ 'WP_VERSION'
+ );
+ return false;
+ }
+ $lexical_updates_now = $lexical_updates->getValue($p);
+ $lexical_updates_now[] = new WP_HTML_Text_Replacement(
+ $accessible_text_starts_at->getValue($p),
+ $accessible_text_length->getValue($p),
+ $new_value
+ );
+ $lexical_updates->setValue($p, $lexical_updates_now);
+ return true;
+ default:
+ _doing_it_wrong(
+ __METHOD__,
+ __( 'Cannot set text content on a non-text node.' ),
+ 'WP_VERSION'
+ );
+ return false;
+ }
+ }
+}
diff --git a/class-wp-xml-processor.php b/class-wp-xml-processor.php
index a18e35f..9156f8f 100644
--- a/class-wp-xml-processor.php
+++ b/class-wp-xml-processor.php
@@ -88,6 +88,28 @@ public static function stream_tokens( $input_stream, $output_stream, $buffer_siz
}
}
+ /**
+ * Wipes out the processed XML and appends the next chunk of XML to
+ * any remaining unprocessed XML.
+ *
+ * The breadcrumbs and the parser context are read before reset_state()
+ * is called and are written back afterwards, together with the new
+ * document made of the unprocessed remainder plus `$next_chunk`.
+ * NOTE(review): presumably this is what lets parsing resume mid-document
+ * across chunk boundaries — confirm against reset_state().
+ *
+ * @param string $next_chunk XML to append.
+ */
+ public function stream_append_xml( $next_chunk )
+ {
+ $this->get_updated_xml(); // Return value is discarded. NOTE(review): presumably called so pending updates are applied before the buffer is replaced — confirm.
+
+ $new_xml = $this->get_unprocessed_xml() . $next_chunk;
+ $breadcrumbs = $this->get_breadcrumbs(); // Captured before reset_state() runs.
+ $parser_context = $this->get_parser_context(); // Captured before reset_state() runs.
+
+ $this->reset_state();
+
+ $this->xml = $new_xml;
+ $this->stack_of_open_elements = $breadcrumbs;
+ $this->parser_context = $parser_context;
+ $this->had_previous_chunks = true; // Assigned after reset_state(), so this value is the one that remains.
+ }
+
/**
* Constructor.
*
diff --git a/class-wp-xml-tag-processor.php b/class-wp-xml-tag-processor.php
index db06e54..45ea8f4 100644
--- a/class-wp-xml-tag-processor.php
+++ b/class-wp-xml-tag-processor.php
@@ -337,7 +337,7 @@ class WP_XML_Tag_Processor {
* @since WP_VERSION
* @var string
*/
- protected $xml;
+ public $xml;
/**
* The last query passed to next_tag().
@@ -428,7 +428,7 @@ class WP_XML_Tag_Processor {
* @since WP_VERSION
* @var int
*/
- private $bytes_already_parsed = 0;
+ public $bytes_already_parsed = 0;
/**
* Byte offset in input document where current token starts.
@@ -1751,6 +1751,31 @@ private function after_tag() {
$this->is_closing_tag = null;
$this->attributes = array();
}
+
+ /**
+  * Resets the parsing state of this processor.
+  *
+  * Empties the XML buffer, rewinds the byte cursor to 0, puts the parser
+  * into STATE_READY, and clears the current token, the last query, the
+  * bookmarks, the queued lexical updates, the seek counter and the last error.
+  */
+ protected function reset_state()
+ {
+     // Document buffer and parser position.
+     $this->xml                     = '';
+     $this->bytes_already_parsed    = 0;
+     $this->parser_state            = self::STATE_READY;
+     $this->is_incomplete_text_node = false;
+     $this->had_previous_chunks     = false;
+     $this->last_error              = null;
+
+     // Last query passed to the tag finder.
+     $this->last_query          = null;
+     $this->sought_tag_name     = null;
+     $this->sought_match_offset = 0;
+     $this->stop_on_tag_closers = false;
+
+     // Currently matched token.
+     $this->token_starts_at    = null;
+     $this->token_length       = null;
+     $this->tag_name_starts_at = null;
+     $this->tag_name_length    = null;
+     $this->text_starts_at     = null;
+     $this->text_length        = null;
+     $this->is_closing_tag     = null;
+     $this->attributes         = array();
+
+     // Bookmarks and pending modifications.
+     $this->bookmarks       = array();
+     $this->lexical_updates = array();
+     $this->seek_count      = 0;
+ }
/**
* Applies attribute updates to XML document.
diff --git a/functions.php b/functions.php
new file mode 100644
index 0000000..3373d43
--- /dev/null
+++ b/functions.php
@@ -0,0 +1,65 @@
+ $local_file) {
+ $request = new Request($asset_url);
+ $requests[] = $request;
+ $local_paths[$request->id] = $local_file;
+ }
+
+ $client = new Client( [
+ 'concurrency' => 10,
+ ] );
+ $client->enqueue( $requests );
+
+ $results = [];
+ while ( $client->await_next_event() ) {
+ $request = $client->get_request();
+
+ switch ( $client->get_event() ) {
+ case Client::EVENT_BODY_CHUNK_AVAILABLE:
+ file_put_contents(
+ $local_paths[$request->original_request()->id],
+ $client->get_response_body_chunk(),
+ FILE_APPEND
+ );
+ break;
+ case Client::EVENT_FAILED:
+ $results[$request->original_request()->url] = [
+ 'success' => false,
+ 'error' => $request->error,
+ ];
+ break;
+ case Client::EVENT_FINISHED:
+ $results[$request->original_request()->url] = [
+ 'success' => true
+ ];
+ break;
+ }
+ }
+ return $results;
+}
+
+/**
+ * WordPress compat
+ *
+ * Fallback for esc_attr(), declared only when no function of that name exists yet.
+ */
+if (!function_exists('esc_attr')) {
+    /**
+     * Escapes text so it can be placed inside a quoted attribute value.
+     *
+     * ENT_QUOTES is required here: when ENT_XML1 is passed on its own, no quote
+     * flag is set and htmlspecialchars() leaves both `"` and `'` unescaped,
+     * which lets a value terminate the attribute it is written into.
+     *
+     * @param string $text Raw attribute value.
+     * @return string Value with `&`, `<`, `>`, `"` and `'` replaced by entities.
+     */
+    function esc_attr($text) {
+        return htmlspecialchars($text, ENT_QUOTES | ENT_XML1, 'UTF-8');
+    }
+}
+
+/**
+ * Builds a URL string from an array of URL components.
+ *
+ * Reads the keys `scheme`, `user`, `pass`, `host`, `port`, `path`, `query`
+ * and `fragment` (the key names used by parse_url()). Every key is optional;
+ * a missing `host` contributes nothing instead of raising an
+ * "Undefined array key" warning.
+ *
+ * @param array $parsedUrl URL components keyed as described above.
+ * @return string The reassembled URL.
+ */
+function serialize_url($parsedUrl) {
+    $scheme = isset($parsedUrl['scheme']) ? $parsedUrl['scheme'] . '://' : '';
+
+    // The password is only emitted together with a user name.
+    $credentials = '';
+    if (isset($parsedUrl['user'])) {
+        $credentials = $parsedUrl['user']
+            . (isset($parsedUrl['pass']) ? ':' . $parsedUrl['pass'] : '')
+            . '@';
+    }
+
+    $host     = $parsedUrl['host'] ?? '';
+    $port     = isset($parsedUrl['port']) ? ':' . $parsedUrl['port'] : '';
+    $path     = $parsedUrl['path'] ?? '';
+    $query    = isset($parsedUrl['query']) ? '?' . $parsedUrl['query'] : '';
+    $fragment = isset($parsedUrl['fragment']) ? '#' . $parsedUrl['fragment'] : '';
+
+    return $scheme . $credentials . $host . $port . $path . $query . $fragment;
+}
diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php
new file mode 100644
index 0000000..b57e6ca
--- /dev/null
+++ b/rewrite-remote-wxr.php
@@ -0,0 +1,175 @@
+ WP_XML_Processor -> WP_Block_Markup_Url_Processor -> WP_Migration_URL_In_Text_Processor -> WP_URL
+ *
+ * The layers of data we're handling here are:
+ *
+ * * AsyncHttp\Client: HTTPS encrypted data -> Chunked encoding -> Gzip compression
+ * * WP_XML_Processor: XML (entities, attributes, text, comments, CDATA nodes)
+ * * WP_Block_Markup_Url_Processor: HTML (entities, attributes, text, comments, block comments), JSON (in block comments)
+ * * WP_Migration_URL_In_Text_Processor: URLs in text nodes
+ * * WP_URL: URL parsing and serialization
+ *
+ * It wouldn't be difficult to pipe through additional layers such as:
+ *
+ * * Reading from a remote ZIP file
+ * * Writing to a local ZIP-ped XML file
+ * * Writing to a database
+ *
+ * ...etc.
+ */
+
+require __DIR__ . '/bootstrap.php';
+
+use \WordPress\AsyncHttp\Client;
+use \WordPress\AsyncHttp\Request;
+
+$wxr_url = "https://raw.githubusercontent.com/WordPress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/woo-products.wxr";
+
+// The URLs below are the same for every chunk and every text node, so they
+// are built and parsed once here rather than inside the loops.
+$string_new_site_url = 'https://mynew.site/';
+$parsed_new_site_url = WP_URL::parse( $string_new_site_url );
+
+$current_site_url = 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/';
+$parsed_current_site_url = WP_URL::parse( $current_site_url );
+
+$base_url = 'https://playground.internal';
+
+$xml_processor = new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT);
+foreach ( stream_remote_file( $wxr_url ) as $chunk ) {
+    $xml_processor->stream_append_xml($chunk);
+
+    // Each yielded $text is the modifiable text of the token the XML processor
+    // is currently on; it is only written back when a URL was actually rewritten.
+    foreach ( xml_next_content_node_for_rewriting( $xml_processor ) as $text ) {
+        $url_processor = new WP_Block_Markup_Url_Processor( $text, $base_url );
+
+        foreach ( html_next_url( $url_processor, $current_site_url ) as $parsed_matched_url ) {
+            $updated_raw_url = rewrite_url(
+                $url_processor->get_raw_url(),
+                $parsed_matched_url,
+                $parsed_current_site_url,
+                $parsed_new_site_url
+            );
+            $url_processor->set_raw_url( $updated_raw_url );
+        }
+
+        $updated_text = $url_processor->get_updated_html();
+        if ($updated_text !== $text) {
+            $xml_processor->set_modifiable_text($updated_text);
+        }
+    }
+
+    // Emit what has been processed so far for this chunk.
+    echo $xml_processor->get_processed_xml();
+}
+
+// Emit whatever was left unprocessed after the last chunk.
+echo $xml_processor->get_unprocessed_xml();
+
+// The rest of this file contains the functions used in the code above.
+
+/**
+ * Lazily yields the response body of a remote file, chunk by chunk.
+ *
+ * A single Request for `$url` is enqueued on a new Client. Every
+ * EVENT_BODY_CHUNK_AVAILABLE event yields the current body chunk;
+ * all other events are ignored.
+ *
+ * @param mixed $url Value passed to the Request constructor (a URL string at the call site in this file).
+ * @return Generator Yields the values returned by Client::get_response_body_chunk().
+ */
+function stream_remote_file($url)
+{
+    $http_client = new Client();
+    $http_client->enqueue([
+        new Request($url)
+    ]);
+
+    while ($http_client->await_next_event()) {
+        // Loose comparison on purpose: it is what a `switch` statement uses.
+        if ($http_client->get_event() == Client::EVENT_BODY_CHUNK_AVAILABLE) {
+            yield $http_client->get_response_body_chunk();
+        }
+    }
+}
+
+/**
+ * Advances through the XML tokens and yields the text of each content node
+ * that is a candidate for URL rewriting.
+ *
+ * A token is yielded when all of the following hold:
+ *  - its breadcrumbs contain `item`,
+ *  - its breadcrumbs contain at least one of the supported content elements,
+ *  - its token type is `#text` or `#cdata-section`.
+ *
+ * Because this is a generator, the processor is still on the yielded token
+ * while the consumer handles the value.
+ *
+ * @param WP_XML_Processor $processor Processor to advance.
+ * @return Generator Yields the result of get_modifiable_text() for each matching token.
+ */
+function xml_next_content_node_for_rewriting(WP_XML_Processor $processor) {
+    /*
+     * Meta values (`wp:postmeta`) are not supported yet. We'll need to support
+     * WordPress core options that may be saved as JSON, PHP serialization, and XML,
+     * and then provide extension points for plugin authors to support
+     * their own options.
+     */
+    $rewritable_elements = [
+        'excerpt:encoded',
+        'content:encoded',
+        'wp:attachment_url',
+        'guid',
+        'link',
+        'wp:comment_content',
+    ];
+    $text_token_types = ['#text', '#cdata-section'];
+
+    while ($processor->next_token()) {
+        // Fetch the breadcrumbs once per token instead of once per check.
+        $breadcrumbs = $processor->get_breadcrumbs();
+
+        if (!in_array('item', $breadcrumbs, true)) {
+            continue;
+        }
+        if ([] === array_intersect($rewritable_elements, $breadcrumbs)) {
+            continue;
+        }
+
+        if (in_array($processor->get_token_type(), $text_token_types, true)) {
+            yield $processor->get_modifiable_text();
+        }
+    }
+}
+
+/**
+ * Yields every URL found by the processor that belongs to the current site.
+ *
+ * A URL belongs to the current site when its hostname is identical to the
+ * current site's hostname and its URL-decoded pathname starts with the
+ * current site's URL-decoded pathname.
+ *
+ * @param WP_Block_Markup_Url_Processor $p                Processor to advance with next_url().
+ * @param mixed                         $current_site_url Value passed to WP_URL::parse() (a URL string at the call site in this file).
+ * @return Generator Yields the value of $p->get_parsed_url() for each matching URL.
+ */
+function html_next_url(WP_Block_Markup_Url_Processor $p, $current_site_url) {
+    $site_url = WP_URL::parse( $current_site_url );
+    $site_path_prefix = urldecode( $site_url->pathname );
+
+    while ( $p->next_url() ) {
+        $candidate = $p->get_parsed_url();
+
+        // Different host: not a URL of the current site.
+        if ( $candidate->hostname !== $site_url->hostname ) {
+            continue;
+        }
+
+        // Same host, but outside of the current site's path.
+        if ( ! str_starts_with( urldecode( $candidate->pathname ), $site_path_prefix ) ) {
+            continue;
+        }
+
+        yield $candidate;
+    }
+}
+
+/**
+ * Rewrites a matched URL so that it points at the new site.
+ *
+ * `$parsed_matched_url` is modified in place: its hostname is replaced with
+ * the new site's hostname and, unless the current site's pathname is `/`,
+ * the leading part of its URL-decoded pathname (as long as the current
+ * site's URL-decoded pathname) is replaced with the new site's pathname.
+ *
+ * The three URL arguments must expose `hostname`, `pathname`, `search` and
+ * `hash` properties; the matched URL must also provide `toString()`
+ * (presumably these are the objects returned by WP_URL::parse() — confirm).
+ *
+ * @param string $raw_matched_url         The matched URL as it was written in the source text.
+ * @param object $parsed_matched_url      Parsed form of the matched URL; mutated by this function.
+ * @param object $parsed_current_site_url Parsed URL of the site being migrated from.
+ * @param object $parsed_new_site_url     Parsed URL of the site being migrated to.
+ * @return string The rewritten URL.
+ */
+function rewrite_url(
+    string $raw_matched_url,
+    $parsed_matched_url,
+    $parsed_current_site_url,
+    $parsed_new_site_url,
+) {
+    // Let's rewrite the URL
+    $parsed_matched_url->hostname = $parsed_new_site_url->hostname;
+
+    // Short-circuit for empty pathnames
+    if ('/' !== $parsed_current_site_url->pathname) {
+        $decoded_matched_pathname = urldecode($parsed_matched_url->pathname);
+        $current_prefix_length = strlen(urldecode($parsed_current_site_url->pathname));
+
+        $parsed_matched_url->pathname =
+            $parsed_new_site_url->pathname .
+            substr($decoded_matched_pathname, $current_prefix_length);
+    }
+
+    /*
+     * Stylistic choice – if the matched URL has no trailing slash,
+     * do not add it to the new URL. The WHATWG URL parser will
+     * add one automatically if the path is empty, so we have to
+     * explicitly remove it.
+     */
+    $new_raw_url = $parsed_matched_url->toString();
+    $has_only_root_path =
+        $parsed_matched_url->pathname === '/' &&
+        $parsed_matched_url->search === '' &&
+        $parsed_matched_url->hash === '';
+
+    // str_ends_with() is safe for an empty string, unlike indexing its last byte.
+    if ($has_only_root_path && !str_ends_with($raw_matched_url, '/')) {
+        $new_raw_url = rtrim($new_raw_url, '/');
+    }
+
+    return $new_raw_url;
+}
diff --git a/rewrite-wxr.php b/rewrite-wxr.php
index 3bfa0d1..f1fe965 100644
--- a/rewrite-wxr.php
+++ b/rewrite-wxr.php
@@ -25,51 +25,8 @@
* [2] ZipStreamWriter: https://github.com/WordPress/blueprints-library/blob/f9fcb5816ab6def0920b25787341342bc88803e3/src/WordPress/Zip/ZipStreamWriter.php
* [3] AsyncHttpClient: https://github.com/WordPress/blueprints-library/blob/trunk/src/WordPress/AsyncHttp/Client.php
*/
-
-use \WordPress\AsyncHttp\Client;
-use \WordPress\AsyncHttp\Request;
-// Where to find the streaming WP_XML_Processor
-// Use a version from this PR: https://github.com/adamziel/wordpress-develop/pull/43
-define('WP_XML_API_PATH', __DIR__ );
-define('BLUEPRINTS_LIB_PATH', __DIR__ . '/blueprints-library/src/WordPress' );
-if(!file_exists(WP_XML_API_PATH . '/class-wp-token-map.php')) {
- copy(WP_XML_API_PATH.'/../class-wp-token-map.php', WP_XML_API_PATH . '/class-wp-token-map.php');
-}
-
-$requires[] = WP_XML_API_PATH . "/class-wp-html-token.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-span.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-text-replacement.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-decoder.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-attribute-token.php";
-
-$requires[] = WP_XML_API_PATH . "/class-wp-html-decoder.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-tag-processor.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-open-elements.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-token-map.php";
-$requires[] = WP_XML_API_PATH . "/html5-named-character-references.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-active-formatting-elements.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-processor-state.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-unsupported-exception.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-html-processor.php";
-
-$requires[] = WP_XML_API_PATH . "/class-wp-xml-decoder.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-xml-tag-processor.php";
-$requires[] = WP_XML_API_PATH . "/class-wp-xml-processor.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamWrapperInterface.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamWrapper.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamPeekerWrapper.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Request.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Response.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/HttpError.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Connection.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Client.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/StreamWrapper/ChunkedEncodingWrapper.php";
-$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/StreamWrapper/InflateStreamWrapper.php";
-
-foreach ($requires as $require) {
- require_once $require;
-}
+require __DIR__ . '/bootstrap.php';
if (!Phar::running() && in_array('--bundle', $argv)) {
bundlePhar('preprocess-wxr.phar', array_merge(
@@ -198,7 +155,7 @@ function ($url) { return $url; }
$url_to_path[$url] = $details['download_path'];
}
}
-wp_download_files([
+wxr_download_files([
'concurrency' => 10,
'assets' => $url_to_path
]);
@@ -227,297 +184,3 @@ function ($url) use($assets_details) {
$normalizer->process();
fclose($input_stream);
fclose($output_stream);
-
-function wp_download_files($options) {
- $requests = [];
- $local_paths = [];
- foreach ($options['assets'] as $asset_url => $local_file) {
- $request = new Request($asset_url);
- $requests[] = $request;
- $local_paths[$request->id] = $local_file;
- }
-
- $client = new Client( [
- 'concurrency' => 10,
- ] );
- $client->enqueue( $requests );
-
- $results = [];
- while ( $client->await_next_event() ) {
- $request = $client->get_request();
-
- switch ( $client->get_event() ) {
- case Client::EVENT_BODY_CHUNK_AVAILABLE:
- file_put_contents(
- $local_paths[$request->original_request()->id],
- $client->get_response_body_chunk(),
- FILE_APPEND
- );
- break;
- case Client::EVENT_FAILED:
- $results[$request->original_request()->url] = [
- 'success' => false,
- 'error' => $request->error,
- ];
- break;
- case Client::EVENT_FINISHED:
- $results[$request->original_request()->url] = [
- 'success' => true
- ];
- break;
- }
- }
- return $results;
-}
-
-/**
- * WordPress compat
- */
-function esc_attr($text) {
- return htmlspecialchars($text, ENT_XML1, 'UTF-8');
-}
-
-function serialize_url($parsedUrl) {
- return (isset($parsedUrl['scheme']) ? $parsedUrl['scheme'] . '://' : '')
- . (isset($parsedUrl['user']) ? $parsedUrl['user'] . (isset($parsedUrl['pass']) ? ':' . $parsedUrl['pass'] : '') .'@' : '')
- . $parsedUrl['host']
- . (isset($parsedUrl['port']) ? ':' . $parsedUrl['port'] : '')
- . (isset($parsedUrl['path']) ? $parsedUrl['path'] : '')
- . (isset($parsedUrl['query']) ? '?' . $parsedUrl['query'] : '')
- . (isset($parsedUrl['fragment']) ? '#' . $parsedUrl['fragment'] : '');
-}
-
-class WP_WXR_Normalizer
-{
-
- private $input_stream;
- private $output_stream;
- private $rewrite_url_callback;
-
- private $found_urls = array();
-
- public function __construct(
- $input_stream,
- $output_stream,
- $rewrite_url_callback
- ) {
- $this->input_stream = $input_stream;
- $this->output_stream = $output_stream;
- $this->rewrite_url_callback = $rewrite_url_callback;
- }
-
- public function get_found_urls()
- {
- return array_keys($this->found_urls);
- }
-
- public function process()
- {
- $tokens = WP_XML_Processor::stream_tokens($this->input_stream, $this->output_stream, 1000000);
- foreach ($tokens as $processor) {
- if (
- in_array('item', $processor->get_breadcrumbs())
- // $processor->matches_breadcrumbs(array('item', 'content:encoded')) ||
- // $processor->matches_breadcrumbs(array('item', 'excerpt:encoded')) ||
- // $processor->matches_breadcrumbs(array('wp:comment_content'))
- ) {
- switch ($processor->get_token_type()) {
- case '#text':
- case '#cdata-section':
- $text = $processor->get_modifiable_text();
- $updated_text = $this->process_content_node($text);
- if ($updated_text !== $text) {
- $processor->set_modifiable_text($updated_text);
- }
- break;
- }
- }
- }
- }
-
- private function process_content_node($text)
- {
- $result = $this->process_as_html($text);
- if(false !== $result) {
- return $result;
- }
-
- $result = $this->process_as_plaintext($text);
- if(false !== $result) {
- return $result;
- }
-
- return false;
- }
-
- private function process_as_html($text) {
- $html = new WP_HTML_Tag_Processor($text);
- if(false === $html->next_token()) {
- return false;
- }
-
- do {
- switch($html->get_token_type()) {
- case '#comment':
- $text = $html->get_modifiable_text();
- // Try to parse as a block. The block parser won't cut it because
- // while it can parse blocks, it has no semantics for rewriting the
- // block markup. Let's do our best here:
- $at = strspn($text, ' \t\f\r\n'); // Whitespace
- if(!(
- $at + 3 < strlen($text) &&
- $text[$at] === 'w' &&
- $text[$at+1] === 'p' &&
- $text[$at+2] === ':'
- )) {
- break;
- }
- $at += 3;
- $at += strspn($text, 'abcdefghijklmnopqrstuwxvyzABCDEFGHIJKLMNOPRQSTUWXVYZ0123456789_-', $at); // Block name
- $at += strspn($text, ' \t\f\r\n', $at); // Whitespace again
- if($at >= strlen($text)) {
- // Oh, there were no attributes or this wasn't a block
- // Either way, we have nothing more to do here.
- break;
- }
-
- // It seems we may have block attributes here. Let's try to
- // parse them as JSON.
- $json_maybe = substr($text, $at);
- $attributes = json_decode($json_maybe, true);
- if(null === $attributes) {
- // This wasn't a block after all, let's move on
- break;
- }
-
- // This is a block! Let's process all block attributes and rewrite them
- $new_attributes = $this->process_block_attributes($attributes);
- $this->set_modifiable_html_text(
- $html,
- substr($text, 0, $at) . json_encode($new_attributes, JSON_HEX_TAG | JSON_HEX_AMP)
- );
- break;
-
- case '#tag':
- $attributes = $html->get_attribute_names_with_prefix('');
- if(!$attributes) {
- break;
- }
- foreach($attributes as $attribute_name) {
- $value = $html->get_attribute($attribute_name);
- $updated = $this->process_as_plaintext($value);
- if($updated !== $value) {
- $html->set_attribute($attribute_name, $updated);
- }
- }
- break;
- case '#text':
- $text = $html->get_modifiable_text();
- $updated_text = $this->process_as_plaintext($text);
- if($updated_text !== $text) {
- $this->set_modifiable_html_text($html, $updated_text);
- }
- break;
- }
- } while($html->next_token());
-
- return $html->get_updated_html();
- }
-
- private function process_block_attributes($attributes) {
- if(is_string($attributes)) {
- return $this->process_as_plaintext($attributes);
- } else if(is_array($attributes)) {
- $new_attributes = array();
- foreach($attributes as $key => $value) {
- $new_attributes[$key] = $this->process_block_attributes($value);
- }
- return $new_attributes;
- } else {
- return $attributes;
- }
- }
-
- /**
- * @TODO: Investigate how bad this is – would it stand the test of time, or do we need
- * a proper URL-matching state machine?
- */
- const URL_REGEXP = '\b((?:(https?):\/\/|www\.)[-a-zA-Z0-9@:%._\+\~#=]+(?:\.[a-zA-Z0-9]{2,})+[-a-zA-Z0-9@:%_\+.\~#?&//=]*)\b';
- private function process_as_plaintext($text) {
- return preg_replace_callback(
- '~'.self::URL_REGEXP.'~',
- function ($matches) {
- $this->found_urls[$matches[0]] = true;
- $replacer = $this->rewrite_url_callback;
- return $replacer($matches[0]);
- },
- $text
- );
- }
-
- private function set_modifiable_html_text(WP_HTML_Tag_Processor $p, $new_value) {
- $reflection = new ReflectionClass('WP_HTML_Tag_Processor');
- $accessible_text_starts_at = $reflection->getProperty('text_starts_at');
- $accessible_text_starts_at->setAccessible(true);
-
- $accessible_text_length = $reflection->getProperty('text_length');
- $accessible_text_length->setAccessible(true);
-
- $lexical_updates = $reflection->getProperty('lexical_updates');
- $lexical_updates->setAccessible(true);
-
- switch ( $p->get_token_type() ) {
- case '#text':
- $lexical_updates_now = $lexical_updates->getValue($p);
- $lexical_updates_now[] = new WP_HTML_Text_Replacement(
- $accessible_text_starts_at->getValue($p),
- $accessible_text_length->getValue($p),
- htmlspecialchars( $new_value, ENT_XML1, 'UTF-8' )
- );
- $lexical_updates->setValue($p, $lexical_updates_now);
- return true;
-
- case '#comment':
- case '#cdata-section':
- if(
- $p->get_token_type() === '#comment' && (
- strpos($new_value, '-->') !== false ||
- strpos($new_value, '--!>') !== false
- )
- ) {
- _doing_it_wrong(
- __METHOD__,
- __( 'Cannot set a comment closer as a text of an HTML comment.' ),
- 'WP_VERSION'
- );
- return false;
- }
- if(
- $p->get_token_type() === '#cdata-section' &&
- strpos($new_value, '>') !== false
- ) {
- _doing_it_wrong(
- __METHOD__,
- __( 'Cannot set a CDATA closer as text of an HTML CDATA-lookalike section.' ),
- 'WP_VERSION'
- );
- return false;
- }
- $lexical_updates_now = $lexical_updates->getValue($p);
- $lexical_updates_now[] = new WP_HTML_Text_Replacement(
- $accessible_text_starts_at->getValue($p),
- $accessible_text_length->getValue($p),
- $new_value
- );
- $lexical_updates->setValue($p, $lexical_updates_now);
- return true;
- default:
- _doing_it_wrong(
- __METHOD__,
- __( 'Cannot set text content on a non-text node.' ),
- 'WP_VERSION'
- );
- return false;
- }
- }
-}
diff --git a/site-transfer-protocol b/site-transfer-protocol
new file mode 160000
index 0000000..3486d67
--- /dev/null
+++ b/site-transfer-protocol
@@ -0,0 +1 @@
+Subproject commit 3486d676a5fa2a76b719e3e6159a80e3013bed8a
diff --git a/test-data/woo-products.wxr b/test-data/woo-products.wxr
index 33937b9..034049d 100644
--- a/test-data/woo-products.wxr
+++ b/test-data/woo-products.wxr
@@ -2051,7 +2051,6 @@
open
closed
album
- publish
0
0
product