Skip to content

Commit

Permalink
Stream rewrite URLs in a remote WXR file
Browse files Browse the repository at this point in the history
Brings together a few explorations to stream-rewrite site URLs in a WXR file coming
from a remote server – all without curl, DOMDocument, or other
PHP dependencies. It relies on just a few small libraries built with WordPress
core in mind:

* [AsyncHttp\Client](WordPress/blueprints#52)
* [WP_XML_Processor](WordPress/wordpress-develop#6713)
* [WP_Block_Markup_Url_Processor](https://github.com/adamziel/site-transfer-protocol)
* [WP_HTML_Tag_Processor](https://developer.wordpress.org/reference/classes/wp_html_tag_processor/)

Here's what the rewriter looks like:

```php
// Streams a remote WXR file chunk by chunk, rewrites every URL that
// html_next_url() matches against the current site URL so that it points at
// the new site, and echoes the resulting XML as it becomes available.
$wxr_url = "https://raw.githubusercontent.com/WordPress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/woo-products.wxr";

// The rewrite configuration does not depend on the streamed data, so it is
// prepared once up front instead of being rebuilt for every content node.
$string_new_site_url     = 'https://mynew.site/';
$parsed_new_site_url     = WP_URL::parse( $string_new_site_url );

$current_site_url        = 'https://raw.githubusercontent.com/wordpress/blueprints/normalize-wxr-assets/blueprints/stylish-press-clone/wxr-assets/';
$parsed_current_site_url = WP_URL::parse( $current_site_url );

// Passed as the second constructor argument of WP_Block_Markup_Url_Processor.
$base_url = 'https://playground.internal';

$xml_processor = new WP_XML_Processor('', [], WP_XML_Processor::IN_PROLOG_CONTEXT);
foreach( stream_remote_file( $wxr_url ) as $chunk ) {
    // Feed the newly downloaded bytes to the XML processor.
    $xml_processor->stream_append_xml($chunk);

    foreach ( xml_next_content_node_for_rewriting( $xml_processor ) as $text ) {
        $url_processor = new WP_Block_Markup_Url_Processor( $text, $base_url );

        foreach ( html_next_url( $url_processor, $current_site_url ) as $parsed_matched_url ) {
            $updated_raw_url = rewrite_url(
                $url_processor->get_raw_url(),
                $parsed_matched_url,
                $parsed_current_site_url,
                $parsed_new_site_url
            );
            $url_processor->set_raw_url( $updated_raw_url );
        }

        // Only write back to the XML document when a URL was actually changed.
        $updated_text = $url_processor->get_updated_html();
        if ($updated_text !== $text) {
            $xml_processor->set_modifiable_text($updated_text);
        }
    }

    // Output whatever get_processed_xml() returns after handling this chunk.
    echo $xml_processor->get_processed_xml();
}
// Once the stream ends, output whatever get_unprocessed_xml() still holds.
echo $xml_processor->get_unprocessed_xml();
```
  • Loading branch information
adamziel committed Jul 15, 2024
1 parent 7d628e8 commit 3650b18
Show file tree
Hide file tree
Showing 10 changed files with 790 additions and 429 deletions.
53 changes: 53 additions & 0 deletions bootstrap.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<?php

// Where to find the streaming WP_XML_Processor
// Use a version from this PR: https://github.com/adamziel/wordpress-develop/pull/43
//
// Each path constant is only defined when it is still missing, so loading this
// file more than once (or pre-defining a path before loading it) does not
// trigger "constant already defined" warnings.
defined( 'WP_XML_API_PATH' ) || define( 'WP_XML_API_PATH', __DIR__ );
defined( 'SITE_TRANSFER_PROTOCOL_PATH' ) || define( 'SITE_TRANSFER_PROTOCOL_PATH', __DIR__ . '/site-transfer-protocol' );
defined( 'BLUEPRINTS_LIB_PATH' ) || define( 'BLUEPRINTS_LIB_PATH', __DIR__ . '/blueprints-library/src/WordPress' );

// class-wp-token-map.php is copied from the parent directory when it is not
// present next to the other XML API files.
$token_map_path = WP_XML_API_PATH . '/class-wp-token-map.php';
if ( ! file_exists( $token_map_path ) ) {
    $token_map_source = WP_XML_API_PATH . '/../class-wp-token-map.php';
    if ( ! copy( $token_map_source, $token_map_path ) ) {
        // Fail here with a clear message rather than with an opaque
        // require_once error further down.
        throw new RuntimeException(
            sprintf( 'Could not copy "%s" to "%s".', $token_map_source, $token_map_path )
        );
    }
}

// Files to load, in order. The list is built from scratch so that a
// pre-existing $requires variable in the including scope cannot leak into it.
$requires = [
    // HTML API.
    WP_XML_API_PATH . '/class-wp-html-token.php',
    WP_XML_API_PATH . '/class-wp-html-span.php',
    WP_XML_API_PATH . '/class-wp-html-text-replacement.php',
    WP_XML_API_PATH . '/class-wp-html-decoder.php',
    WP_XML_API_PATH . '/class-wp-html-attribute-token.php',

    WP_XML_API_PATH . '/class-wp-html-tag-processor.php',
    WP_XML_API_PATH . '/class-wp-html-open-elements.php',
    WP_XML_API_PATH . '/class-wp-token-map.php',
    WP_XML_API_PATH . '/html5-named-character-references.php',
    WP_XML_API_PATH . '/class-wp-html-active-formatting-elements.php',
    WP_XML_API_PATH . '/class-wp-html-processor-state.php',
    WP_XML_API_PATH . '/class-wp-html-unsupported-exception.php',
    WP_XML_API_PATH . '/class-wp-html-processor.php',

    // XML API.
    WP_XML_API_PATH . '/class-wp-xml-decoder.php',
    WP_XML_API_PATH . '/class-wp-xml-tag-processor.php',
    WP_XML_API_PATH . '/class-wp-xml-processor.php',
    WP_XML_API_PATH . '/class-wp-wxr-normalizer.php',
    WP_XML_API_PATH . '/functions.php',

    // Blueprints library: streams and the async HTTP client.
    BLUEPRINTS_LIB_PATH . '/Streams/StreamWrapperInterface.php',
    BLUEPRINTS_LIB_PATH . '/Streams/StreamWrapper.php',
    BLUEPRINTS_LIB_PATH . '/Streams/StreamPeekerWrapper.php',
    BLUEPRINTS_LIB_PATH . '/AsyncHttp/Request.php',
    BLUEPRINTS_LIB_PATH . '/AsyncHttp/Response.php',
    BLUEPRINTS_LIB_PATH . '/AsyncHttp/HttpError.php',
    BLUEPRINTS_LIB_PATH . '/AsyncHttp/Connection.php',
    BLUEPRINTS_LIB_PATH . '/AsyncHttp/Client.php',
    BLUEPRINTS_LIB_PATH . '/AsyncHttp/StreamWrapper/ChunkedEncodingWrapper.php',
    BLUEPRINTS_LIB_PATH . '/AsyncHttp/StreamWrapper/InflateStreamWrapper.php',

    // Site transfer protocol.
    SITE_TRANSFER_PROTOCOL_PATH . '/src/WP_Block_Markup_Processor.php',
    SITE_TRANSFER_PROTOCOL_PATH . '/src/WP_Block_Markup_Url_Processor.php',
    SITE_TRANSFER_PROTOCOL_PATH . '/src/WP_Migration_URL_In_Text_Processor.php',
    SITE_TRANSFER_PROTOCOL_PATH . '/src/WP_URL.php',
    SITE_TRANSFER_PROTOCOL_PATH . '/src/functions.php',
    SITE_TRANSFER_PROTOCOL_PATH . '/vendor/autoload.php',
];

foreach ( $requires as $require ) {
    require_once $require;
}
Loading

0 comments on commit 3650b18

Please sign in to comment.