From 3650b18b3e800d86937e3209e7eead8872978d01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Mon, 15 Jul 2024 20:24:44 +0200 Subject: [PATCH] Stream rewrite URLs in a remote WXR file Brings together a few explorations to stream-rewrite site URLs in a WXR file coming from a remote server. All of that with no curl, DOMDocument, or other PHP dependencies. It's just a few small libraries built with WordPress core in mind: * [AsyncHttp\Client](https://github.com/WordPress/blueprints/pull/52) * [WP_XML_Processor](https://github.com/WordPress/wordpress-develop/pull/6713) * [WP_Block_Markup_Url_Processor](https://github.com/adamziel/site-transfer-protocol) * [WP_HTML_Tag_Processor](https://developer.wordpress.org/reference/classes/wp_html_tag_processor/) Here's what the rewriter looks like: ```php $wxr_url = "https://raw.githubusercontent.com/WordPress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/woo-products.wxr"; $xml_processor = new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT); foreach( stream_remote_file( $wxr_url ) as $chunk ) { $xml_processor->stream_append_xml($chunk); foreach ( xml_next_content_node_for_rewriting( $xml_processor ) as $text ) { $string_new_site_url = 'https://mynew.site/'; $parsed_new_site_url = WP_URL::parse( $string_new_site_url ); $current_site_url = 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/'; $parsed_current_site_url = WP_URL::parse( $current_site_url ); $base_url = 'https://playground.internal'; $url_processor = new WP_Block_Markup_Url_Processor( $text, $base_url ); foreach ( html_next_url( $url_processor, $current_site_url ) as $parsed_matched_url ) { $updated_raw_url = rewrite_url( $url_processor->get_raw_url(), $parsed_matched_url, $parsed_current_site_url, $parsed_new_site_url ); $url_processor->set_raw_url( $updated_raw_url ); } $updated_text = $url_processor->get_updated_html(); if ($updated_text !== $text) { $xml_processor->set_modifiable_text($updated_text); } } echo $xml_processor->get_processed_xml(); } echo $xml_processor->get_unprocessed_xml(); ``` --- bootstrap.php | 53 +++++ class-wp-html-tag-processor.php | 296 +++++++++++++++++++-------- class-wp-wxr-normalizer.php | 236 ++++++++++++++++++++++ class-wp-xml-processor.php | 22 +++ class-wp-xml-tag-processor.php | 29 ++- functions.php | 65 ++++++ rewrite-remote-wxr.php | 175 ++++++++++++++++ rewrite-wxr.php | 341 +------------------------------- site-transfer-protocol | 1 + test-data/woo-products.wxr | 1 - 10 files changed, 790 insertions(+), 429 deletions(-) create mode 100644 bootstrap.php create mode 100644 class-wp-wxr-normalizer.php create mode 100644 functions.php create mode 100644 rewrite-remote-wxr.php create mode 160000 site-transfer-protocol diff --git a/bootstrap.php b/bootstrap.php new file mode 100644 index 0000000..1764edc --- /dev/null +++ b/bootstrap.php @@ -0,0 +1,53 @@ +' )`. + * style of including JavaScript inside of HTML comments to avoid accidentally + * closing the SCRIPT from inside a JavaScript string. E.g. `console.log( '' )`. * - `TITLE` and `TEXTAREA` whose contents are treated as plaintext and then any * character references are decoded. E.g. `1 < 2 < 3` becomes `1 < 2 < 3`. * - `IFRAME`, `NOSCRIPT`, `NOEMBED`, `NOFRAME`, `STYLE` whose contents are treated as @@ -1524,21 +1524,10 @@ private function parse_next_tag() { $was_at = $this->bytes_already_parsed; $at = $was_at; - while ( false !== $at && $at < $doc_length ) { + while ( $at < $doc_length ) { $at = strpos( $html, '<', $at ); - - /* - * This does not imply an incomplete parse; it indicates that there - * can be nothing left in the document other than a #text node. - */ if ( false === $at ) { - $this->parser_state = self::STATE_TEXT_NODE; - $this->token_starts_at = $was_at; - $this->token_length = strlen( $html ) - $was_at; - $this->text_starts_at = $was_at; - $this->text_length = $this->token_length; - $this->bytes_already_parsed = strlen( $html ); - return true; + break; } if ( $at > $was_at ) { @@ -1554,19 +1543,9 @@ private function parse_next_tag() { * * @see https://html.spec.whatwg.org/#tag-open-state */ - if ( strlen( $html ) > $at + 1 ) { - $next_character = $html[ $at + 1 ]; - $at_another_node = ( - '!' === $next_character || - '/' === $next_character || - '?' === $next_character || - ( 'A' <= $next_character && $next_character <= 'Z' ) || - ( 'a' <= $next_character && $next_character <= 'z' ) - ); - if ( ! $at_another_node ) { - ++$at; - continue; - } + if ( 1 !== strspn( $html, '!/?abcdefghijklmnopqrstuvwxyzABCEFGHIJKLMNOPQRSTUVWXYZ', $at + 1, 1 ) ) { + ++$at; + continue; } $this->parser_state = self::STATE_TEXT_NODE; @@ -1630,11 +1609,7 @@ private function parse_next_tag() { * `') !== false || + strpos($new_value, '--!>') !== false + ) + ) { + _doing_it_wrong( + __METHOD__, + __( 'Cannot set a comment closer as a text of an HTML comment.' ), + 'WP_VERSION' + ); + return false; + } + if( + $p->get_token_type() === '#cdata-section' && + strpos($new_value, '>') !== false + ) { + _doing_it_wrong( + __METHOD__, + __( 'Cannot set a CDATA closer as text of an HTML CDATA-lookalike section.' ), + 'WP_VERSION' + ); + return false; + } + $lexical_updates_now = $lexical_updates->getValue($p); + $lexical_updates_now[] = new WP_HTML_Text_Replacement( + $accessible_text_starts_at->getValue($p), + $accessible_text_length->getValue($p), + $new_value + ); + $lexical_updates->setValue($p, $lexical_updates_now); + return true; + default: + _doing_it_wrong( + __METHOD__, + __( 'Cannot set text content on a non-text node.' ), + 'WP_VERSION' + ); + return false; + } + } +} diff --git a/class-wp-xml-processor.php b/class-wp-xml-processor.php index a18e35f..9156f8f 100644 --- a/class-wp-xml-processor.php +++ b/class-wp-xml-processor.php @@ -88,6 +88,28 @@ public static function stream_tokens( $input_stream, $output_stream, $buffer_siz } } + /** + * Wipes out the processed XML and appends the next chunk of XML to + * any remaining unprocessed XML. + * + * @param string $next_chunk XML to append. + */ + public function stream_append_xml( $next_chunk ) + { + $this->get_updated_xml(); + + $new_xml = $this->get_unprocessed_xml() . $next_chunk; + $breadcrumbs = $this->get_breadcrumbs(); + $parser_context = $this->get_parser_context(); + + $this->reset_state(); + + $this->xml = $new_xml; + $this->stack_of_open_elements = $breadcrumbs; + $this->parser_context = $parser_context; + $this->had_previous_chunks = true; + } + /** * Constructor. * diff --git a/class-wp-xml-tag-processor.php b/class-wp-xml-tag-processor.php index db06e54..45ea8f4 100644 --- a/class-wp-xml-tag-processor.php +++ b/class-wp-xml-tag-processor.php @@ -337,7 +337,7 @@ class WP_XML_Tag_Processor { * @since WP_VERSION * @var string */ - protected $xml; + public $xml; /** * The last query passed to next_tag(). @@ -428,7 +428,7 @@ class WP_XML_Tag_Processor { * @since WP_VERSION * @var int */ - private $bytes_already_parsed = 0; + public $bytes_already_parsed = 0; /** * Byte offset in input document where current token starts. @@ -1751,6 +1751,31 @@ private function after_tag() { $this->is_closing_tag = null; $this->attributes = array(); } + + protected function reset_state() + { + $this->xml = ''; + $this->last_query = null; + $this->sought_tag_name = null; + $this->sought_match_offset = 0; + $this->stop_on_tag_closers = false; + $this->parser_state = self::STATE_READY; + $this->is_incomplete_text_node = false; + $this->bytes_already_parsed = 0; + $this->token_starts_at = null; + $this->token_length = null; + $this->tag_name_starts_at = null; + $this->tag_name_length = null; + $this->text_starts_at = null; + $this->text_length = null; + $this->is_closing_tag = null; + $this->last_error = null; + $this->attributes = array(); + $this->bookmarks = array(); + $this->lexical_updates = array(); + $this->seek_count = 0; + $this->had_previous_chunks = false; + } /** * Applies attribute updates to XML document. diff --git a/functions.php b/functions.php new file mode 100644 index 0000000..3373d43 --- /dev/null +++ b/functions.php @@ -0,0 +1,65 @@ + $local_file) { + $request = new Request($asset_url); + $requests[] = $request; + $local_paths[$request->id] = $local_file; + } + + $client = new Client( [ + 'concurrency' => 10, + ] ); + $client->enqueue( $requests ); + + $results = []; + while ( $client->await_next_event() ) { + $request = $client->get_request(); + + switch ( $client->get_event() ) { + case Client::EVENT_BODY_CHUNK_AVAILABLE: + file_put_contents( + $local_paths[$request->original_request()->id], + $client->get_response_body_chunk(), + FILE_APPEND + ); + break; + case Client::EVENT_FAILED: + $results[$request->original_request()->url] = [ + 'success' => false, + 'error' => $request->error, + ]; + break; + case Client::EVENT_FINISHED: + $results[$request->original_request()->url] = [ + 'success' => true + ]; + break; + } + } + return $results; +} + +/** + * WordPress compat + */ +if(!function_exists('esc_attr')) { + function esc_attr($text) { + return htmlspecialchars($text, ENT_XML1, 'UTF-8'); + } +} + +function serialize_url($parsedUrl) { + return (isset($parsedUrl['scheme']) ? $parsedUrl['scheme'] . '://' : '') + . (isset($parsedUrl['user']) ? $parsedUrl['user'] . (isset($parsedUrl['pass']) ? ':' . $parsedUrl['pass'] : '') .'@' : '') + . $parsedUrl['host'] + . (isset($parsedUrl['port']) ? ':' . $parsedUrl['port'] : '') + . (isset($parsedUrl['path']) ? $parsedUrl['path'] : '') + . (isset($parsedUrl['query']) ? '?' . $parsedUrl['query'] : '') + . (isset($parsedUrl['fragment']) ? '#' . $parsedUrl['fragment'] : ''); +} diff --git a/rewrite-remote-wxr.php b/rewrite-remote-wxr.php new file mode 100644 index 0000000..b57e6ca --- /dev/null +++ b/rewrite-remote-wxr.php @@ -0,0 +1,175 @@ + WP_XML_Processor -> WP_Block_Markup_Url_Processor -> WP_Migration_URL_In_Text_Processor -> WP_URL + * + * The layers of data we're handling here are: + * + * * AsyncHttp\Client: HTTPS encrypted data -> Chunked encoding -> Gzip compression + * * WP_XML_Processor: XML (entities, attributes, text, comments, CDATA nodes) + * * WP_Block_Markup_Url_Processor: HTML (entities, attributes, text, comments, block comments), JSON (in block comments) + * * WP_Migration_URL_In_Text_Processor: URLs in text nodes + * * WP_URL: URL parsing and serialization + * + * It wouldn't be difficult to pipe through additioanl layers such as: + * + * * Reading from a remote ZIP file + * * Writing to a local ZIP-ped XML file + * * Writing to a database + * + * ...etc. + */ + +require __DIR__ . '/bootstrap.php'; + +use \WordPress\AsyncHttp\Client; +use \WordPress\AsyncHttp\Request; + +$wxr_url = "https://raw.githubusercontent.com/WordPress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/woo-products.wxr"; +$xml_processor = new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT); +foreach( stream_remote_file( $wxr_url ) as $chunk ) { + $xml_processor->stream_append_xml($chunk); + foreach ( xml_next_content_node_for_rewriting( $xml_processor ) as $text ) { + $string_new_site_url = 'https://mynew.site/'; + $parsed_new_site_url = WP_URL::parse( $string_new_site_url ); + + $current_site_url = 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/'; + $parsed_current_site_url = WP_URL::parse( $current_site_url ); + + $base_url = 'https://playground.internal'; + $url_processor = new WP_Block_Markup_Url_Processor( $text, $base_url ); + + foreach ( html_next_url( $url_processor, $current_site_url ) as $parsed_matched_url ) { + $updated_raw_url = rewrite_url( + $url_processor->get_raw_url(), + $parsed_matched_url, + $parsed_current_site_url, + $parsed_new_site_url + ); + $url_processor->set_raw_url( $updated_raw_url ); + } + + $updated_text = $url_processor->get_updated_html(); + if ($updated_text !== $text) { + $xml_processor->set_modifiable_text($updated_text); + } + } + echo $xml_processor->get_processed_xml(); +} +echo $xml_processor->get_unprocessed_xml(); + +// The rest of this file are functions used in the above code + +function stream_remote_file($url) +{ + $requests = [ + new Request($url) + ]; + $client = new Client(); + $client->enqueue($requests); + + while ($client->await_next_event()) { + switch ($client->get_event()) { + case Client::EVENT_BODY_CHUNK_AVAILABLE: + yield $client->get_response_body_chunk(); + break; + } + } +} + +function xml_next_content_node_for_rewriting(WP_XML_Processor $processor) { + while($processor->next_token()) { + if (!in_array('item', $processor->get_breadcrumbs())) { + continue; + } + if ( + !in_array('excerpt:encoded', $processor->get_breadcrumbs()) + && !in_array('content:encoded', $processor->get_breadcrumbs()) + && !in_array('wp:attachment_url', $processor->get_breadcrumbs()) + && !in_array('guid', $processor->get_breadcrumbs()) + && !in_array('link', $processor->get_breadcrumbs()) + && !in_array('wp:comment_content', $processor->get_breadcrumbs()) + // Meta values are not suppoerted yet. We'll need to support + // WordPress core options that may be saved as JSON, PHP Deserialization, and XML, + // and then provide extension points for plugins authors support + // their own options. + // !in_array('wp:postmeta', $processor->get_breadcrumbs()) + ) { + continue; + } + + switch ($processor->get_token_type()) { + case '#text': + case '#cdata-section': + $text = $processor->get_modifiable_text(); + yield $text; + break; + } + } +} + +/** + * + * @param mixed $options + * @return Generator + */ +function html_next_url(WP_Block_Markup_Url_Processor $p, $current_site_url) { + $parsed_current_site_url = WP_URL::parse( $current_site_url ); + $decoded_current_site_pathname = urldecode( $parsed_current_site_url->pathname ); + + while ( $p->next_url() ) { + $parsed_matched_url = $p->get_parsed_url(); + if ( $parsed_matched_url->hostname === $parsed_current_site_url->hostname ) { + $decoded_matched_pathname = urldecode( $parsed_matched_url->pathname ); + $pathname_matches = str_starts_with( $decoded_matched_pathname, $decoded_current_site_pathname ); + if ( ! $pathname_matches ) { + continue; + } + + // It's a match! + yield $parsed_matched_url; + } + } +} + +function rewrite_url( + string $raw_matched_url, + $parsed_matched_url, + $parsed_current_site_url, + $parsed_new_site_url, +) { + // Let's rewrite the URL + $parsed_matched_url->hostname = $parsed_new_site_url->hostname; + $decoded_matched_pathname = urldecode( $parsed_matched_url->pathname ); + + // Short-circuit for empty pathnames + if ('/' !== $parsed_current_site_url->pathname) { + $parsed_matched_url->pathname = + $parsed_new_site_url->pathname . + substr( + $decoded_matched_pathname, + strlen(urldecode($parsed_current_site_url->pathname)) + ); + } + + /* + * Stylistic choice – if the matched URL has no trailing slash, + * do not add it to the new URL. The WHATWG URL parser will + * add one automatically if the path is empty, so we have to + * explicitly remove it. + */ + $new_raw_url = $parsed_matched_url->toString(); + if ( + $raw_matched_url[strlen($raw_matched_url) - 1] !== '/' && + $parsed_matched_url->pathname === '/' && + $parsed_matched_url->search === '' && + $parsed_matched_url->hash === '' + ) { + $new_raw_url = rtrim($new_raw_url, '/'); + } + + return $new_raw_url; +} diff --git a/rewrite-wxr.php b/rewrite-wxr.php index 3bfa0d1..f1fe965 100644 --- a/rewrite-wxr.php +++ b/rewrite-wxr.php @@ -25,51 +25,8 @@ * [2] ZipStreamWriter: https://github.com/WordPress/blueprints-library/blob/f9fcb5816ab6def0920b25787341342bc88803e3/src/WordPress/Zip/ZipStreamWriter.php * [3] AsyncHttpClient: https://github.com/WordPress/blueprints-library/blob/trunk/src/WordPress/AsyncHttp/Client.php */ - -use \WordPress\AsyncHttp\Client; -use \WordPress\AsyncHttp\Request; -// Where to find the streaming WP_XML_Processor -// Use a version from this PR: https://github.com/adamziel/wordpress-develop/pull/43 -define('WP_XML_API_PATH', __DIR__ ); -define('BLUEPRINTS_LIB_PATH', __DIR__ . '/blueprints-library/src/WordPress' ); -if(!file_exists(WP_XML_API_PATH . '/class-wp-token-map.php')) { - copy(WP_XML_API_PATH.'/../class-wp-token-map.php', WP_XML_API_PATH . '/class-wp-token-map.php'); -} - -$requires[] = WP_XML_API_PATH . "/class-wp-html-token.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-span.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-text-replacement.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-decoder.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-attribute-token.php"; - -$requires[] = WP_XML_API_PATH . "/class-wp-html-decoder.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-tag-processor.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-open-elements.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-token-map.php"; -$requires[] = WP_XML_API_PATH . "/html5-named-character-references.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-active-formatting-elements.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-processor-state.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-unsupported-exception.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-html-processor.php"; - -$requires[] = WP_XML_API_PATH . "/class-wp-xml-decoder.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-xml-tag-processor.php"; -$requires[] = WP_XML_API_PATH . "/class-wp-xml-processor.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamWrapperInterface.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamWrapper.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/Streams/StreamPeekerWrapper.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Request.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Response.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/HttpError.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Connection.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/Client.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/StreamWrapper/ChunkedEncodingWrapper.php"; -$requires[] = BLUEPRINTS_LIB_PATH . "/AsyncHttp/StreamWrapper/InflateStreamWrapper.php"; - -foreach ($requires as $require) { - require_once $require; -} +require __DIR__ . '/bootstrap.php'; if (!Phar::running() && in_array('--bundle', $argv)) { bundlePhar('preprocess-wxr.phar', array_merge( @@ -198,7 +155,7 @@ function ($url) { return $url; } $url_to_path[$url] = $details['download_path']; } } -wp_download_files([ +wxr_download_files([ 'concurrency' => 10, 'assets' => $url_to_path ]); @@ -227,297 +184,3 @@ function ($url) use($assets_details) { $normalizer->process(); fclose($input_stream); fclose($output_stream); - -function wp_download_files($options) { - $requests = []; - $local_paths = []; - foreach ($options['assets'] as $asset_url => $local_file) { - $request = new Request($asset_url); - $requests[] = $request; - $local_paths[$request->id] = $local_file; - } - - $client = new Client( [ - 'concurrency' => 10, - ] ); - $client->enqueue( $requests ); - - $results = []; - while ( $client->await_next_event() ) { - $request = $client->get_request(); - - switch ( $client->get_event() ) { - case Client::EVENT_BODY_CHUNK_AVAILABLE: - file_put_contents( - $local_paths[$request->original_request()->id], - $client->get_response_body_chunk(), - FILE_APPEND - ); - break; - case Client::EVENT_FAILED: - $results[$request->original_request()->url] = [ - 'success' => false, - 'error' => $request->error, - ]; - break; - case Client::EVENT_FINISHED: - $results[$request->original_request()->url] = [ - 'success' => true - ]; - break; - } - } - return $results; -} - -/** - * WordPress compat - */ -function esc_attr($text) { - return htmlspecialchars($text, ENT_XML1, 'UTF-8'); -} - -function serialize_url($parsedUrl) { - return (isset($parsedUrl['scheme']) ? $parsedUrl['scheme'] . '://' : '') - . (isset($parsedUrl['user']) ? $parsedUrl['user'] . (isset($parsedUrl['pass']) ? ':' . $parsedUrl['pass'] : '') .'@' : '') - . $parsedUrl['host'] - . (isset($parsedUrl['port']) ? ':' . $parsedUrl['port'] : '') - . (isset($parsedUrl['path']) ? $parsedUrl['path'] : '') - . (isset($parsedUrl['query']) ? '?' . $parsedUrl['query'] : '') - . (isset($parsedUrl['fragment']) ? '#' . $parsedUrl['fragment'] : ''); -} - -class WP_WXR_Normalizer -{ - - private $input_stream; - private $output_stream; - private $rewrite_url_callback; - - private $found_urls = array(); - - public function __construct( - $input_stream, - $output_stream, - $rewrite_url_callback - ) { - $this->input_stream = $input_stream; - $this->output_stream = $output_stream; - $this->rewrite_url_callback = $rewrite_url_callback; - } - - public function get_found_urls() - { - return array_keys($this->found_urls); - } - - public function process() - { - $tokens = WP_XML_Processor::stream_tokens($this->input_stream, $this->output_stream, 1000000); - foreach ($tokens as $processor) { - if ( - in_array('item', $processor->get_breadcrumbs()) - // $processor->matches_breadcrumbs(array('item', 'content:encoded')) || - // $processor->matches_breadcrumbs(array('item', 'excerpt:encoded')) || - // $processor->matches_breadcrumbs(array('wp:comment_content')) - ) { - switch ($processor->get_token_type()) { - case '#text': - case '#cdata-section': - $text = $processor->get_modifiable_text(); - $updated_text = $this->process_content_node($text); - if ($updated_text !== $text) { - $processor->set_modifiable_text($updated_text); - } - break; - } - } - } - } - - private function process_content_node($text) - { - $result = $this->process_as_html($text); - if(false !== $result) { - return $result; - } - - $result = $this->process_as_plaintext($text); - if(false !== $result) { - return $result; - } - - return false; - } - - private function process_as_html($text) { - $html = new WP_HTML_Tag_Processor($text); - if(false === $html->next_token()) { - return false; - } - - do { - switch($html->get_token_type()) { - case '#comment': - $text = $html->get_modifiable_text(); - // Try to parse as a block. The block parser won't cut it because - // while it can parse blocks, it has no semantics for rewriting the - // block markup. Let's do our best here: - $at = strspn($text, ' \t\f\r\n'); // Whitespace - if(!( - $at + 3 < strlen($text) && - $text[$at] === 'w' && - $text[$at+1] === 'p' && - $text[$at+2] === ':' - )) { - break; - } - $at += 3; - $at += strspn($text, 'abcdefghijklmnopqrstuwxvyzABCDEFGHIJKLMNOPRQSTUWXVYZ0123456789_-', $at); // Block name - $at += strspn($text, ' \t\f\r\n', $at); // Whitespace again - if($at >= strlen($text)) { - // Oh, there were no attributes or this wasn't a block - // Either way, we have nothing more to do here. - break; - } - - // It seems we may have block attributes here. Let's try to - // parse them as JSON. - $json_maybe = substr($text, $at); - $attributes = json_decode($json_maybe, true); - if(null === $attributes) { - // This wasn't a block after all, let's move on - break; - } - - // This is a block! Let's process all block attributes and rewrite them - $new_attributes = $this->process_block_attributes($attributes); - $this->set_modifiable_html_text( - $html, - substr($text, 0, $at) . json_encode($new_attributes, JSON_HEX_TAG | JSON_HEX_AMP) - ); - break; - - case '#tag': - $attributes = $html->get_attribute_names_with_prefix(''); - if(!$attributes) { - break; - } - foreach($attributes as $attribute_name) { - $value = $html->get_attribute($attribute_name); - $updated = $this->process_as_plaintext($value); - if($updated !== $value) { - $html->set_attribute($attribute_name, $updated); - } - } - break; - case '#text': - $text = $html->get_modifiable_text(); - $updated_text = $this->process_as_plaintext($text); - if($updated_text !== $text) { - $this->set_modifiable_html_text($html, $updated_text); - } - break; - } - } while($html->next_token()); - - return $html->get_updated_html(); - } - - private function process_block_attributes($attributes) { - if(is_string($attributes)) { - return $this->process_as_plaintext($attributes); - } else if(is_array($attributes)) { - $new_attributes = array(); - foreach($attributes as $key => $value) { - $new_attributes[$key] = $this->process_block_attributes($value); - } - return $new_attributes; - } else { - return $attributes; - } - } - - /** - * @TODO: Investigate how bad this is – would it stand the test of time, or do we need - * a proper URL-matching state machine? - */ - const URL_REGEXP = '\b((?:(https?):\/\/|www\.)[-a-zA-Z0-9@:%._\+\~#=]+(?:\.[a-zA-Z0-9]{2,})+[-a-zA-Z0-9@:%_\+.\~#?&//=]*)\b'; - private function process_as_plaintext($text) { - return preg_replace_callback( - '~'.self::URL_REGEXP.'~', - function ($matches) { - $this->found_urls[$matches[0]] = true; - $replacer = $this->rewrite_url_callback; - return $replacer($matches[0]); - }, - $text - ); - } - - private function set_modifiable_html_text(WP_HTML_Tag_Processor $p, $new_value) { - $reflection = new ReflectionClass('WP_HTML_Tag_Processor'); - $accessible_text_starts_at = $reflection->getProperty('text_starts_at'); - $accessible_text_starts_at->setAccessible(true); - - $accessible_text_length = $reflection->getProperty('text_length'); - $accessible_text_length->setAccessible(true); - - $lexical_updates = $reflection->getProperty('lexical_updates'); - $lexical_updates->setAccessible(true); - - switch ( $p->get_token_type() ) { - case '#text': - $lexical_updates_now = $lexical_updates->getValue($p); - $lexical_updates_now[] = new WP_HTML_Text_Replacement( - $accessible_text_starts_at->getValue($p), - $accessible_text_length->getValue($p), - htmlspecialchars( $new_value, ENT_XML1, 'UTF-8' ) - ); - $lexical_updates->setValue($p, $lexical_updates_now); - return true; - - case '#comment': - case '#cdata-section': - if( - $p->get_token_type() === '#comment' && ( - strpos($new_value, '-->') !== false || - strpos($new_value, '--!>') !== false - ) - ) { - _doing_it_wrong( - __METHOD__, - __( 'Cannot set a comment closer as a text of an HTML comment.' ), - 'WP_VERSION' - ); - return false; - } - if( - $p->get_token_type() === '#cdata-section' && - strpos($new_value, '>') !== false - ) { - _doing_it_wrong( - __METHOD__, - __( 'Cannot set a CDATA closer as text of an HTML CDATA-lookalike section.' ), - 'WP_VERSION' - ); - return false; - } - $lexical_updates_now = $lexical_updates->getValue($p); - $lexical_updates_now[] = new WP_HTML_Text_Replacement( - $accessible_text_starts_at->getValue($p), - $accessible_text_length->getValue($p), - $new_value - ); - $lexical_updates->setValue($p, $lexical_updates_now); - return true; - default: - _doing_it_wrong( - __METHOD__, - __( 'Cannot set text content on a non-text node.' ), - 'WP_VERSION' - ); - return false; - } - } -} diff --git a/site-transfer-protocol b/site-transfer-protocol new file mode 160000 index 0000000..3486d67 --- /dev/null +++ b/site-transfer-protocol @@ -0,0 +1 @@ +Subproject commit 3486d676a5fa2a76b719e3e6159a80e3013bed8a diff --git a/test-data/woo-products.wxr b/test-data/woo-products.wxr index 33937b9..034049d 100644 --- a/test-data/woo-products.wxr +++ b/test-data/woo-products.wxr @@ -2051,7 +2051,6 @@ open closed album - publish 0 0 product