Skip to content

Commit

Permalink
Additional rigor when parsing URLs, actually use the public suffix list
Browse files Browse the repository at this point in the history
  • Loading branch information
adamziel committed Jun 24, 2024
1 parent 719eee4 commit 19008b7
Show file tree
Hide file tree
Showing 19 changed files with 2,145 additions and 1,812 deletions.
2 changes: 1 addition & 1 deletion transfer-protocol/.phpunit.result.cache

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion transfer-protocol/bin/regenerate_public_suffix_list.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@
fwrite($fp, "\n * Public suffix list for detecting URLs with known domains within text.");
fwrite($fp, "\n * This file is automatically generated by regenerate_public_suffix_list.php.");
fwrite($fp, "\n * Do not edit it directly.");
fwrite($fp, "\n * @TODO: Process wildcards and exceptions, not just raw TLDs.");
fwrite($fp, "\n */\n\n");
fwrite($fp, "return array(\n");
foreach($tlds as $tld) {
fwrite($fp, "\t'".$tld."',\n");
fwrite($fp, "\t'".$tld."' => 1,\n");
}

fwrite($fp, ");\n");
Expand Down
70 changes: 42 additions & 28 deletions transfer-protocol/bin/rewrite-urls.php
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
<?php

use Rowbot\URL\URL;

require_once __DIR__ . "/../bootstrap.php";

if ( $argc < 2 ) {
echo "Usage: php script.php <command> --file <input-file> --from-site-url <current site url> --to-url <target url>\n";
echo "Usage: php script.php <command> --file <input-file> --current-site-url <current site url> --new-site-url <target url>\n";
echo "Commands:\n";
echo " list_urls: List all the URLs found in the input file.\n";
echo " migrate_urls: Migrate all the URLs found in the input file from the current site to the target site.\n";
Expand All @@ -27,26 +25,24 @@
exit( 1 );
}

$inputFile = $options['file'];
$targetDomain = @$options['target-domain'];

$inputFile = $options['file'];
if ( ! file_exists( $inputFile ) ) {
echo "The file $inputFile does not exist.\n";
exit( 1 );
}

$block_markup = file_get_contents( $inputFile );

// @TODO: Should a base URL be always required?
$previous_url = $options['from-site-url'] ?? 'https://w.org';
$p = new WP_Block_Markup_Url_Processor( $block_markup, $previous_url );
// @TODO: Decide – should the current site URL always be required to
// populate $base_url?
$base_url = $options['current-site-url'] ?? 'https://playground.internal';
$p = new WP_Block_Markup_Url_Processor( $block_markup, $base_url );

switch ( $command ) {
case 'list_urls':
echo "URLs found in the markup:\n\n";
while ( $p->next_url() ) {
// Skip empty relative URLs.
if ( ! trim( $p->get_url() ) ) {
if ( ! trim( $p->get_raw_url() ) ) {
continue;
}
echo '* ';
Expand All @@ -61,35 +57,53 @@
echo 'In #text: ';
break;
}
echo $p->get_url() . "\n";
echo $p->get_raw_url() . "\n";
}
echo "\n";
break;
case 'migrate_urls':
if ( ! isset( $options['from-site-url'] ) ) {
echo "The --from-site-url option is required for the migrate_urls command.\n";
if ( ! isset( $options['current-site-url'] ) ) {
echo "The --current-site-url option is required for the migrate_urls command.\n";
exit( 1 );
}
if ( ! isset( $options['to-url'] ) ) {
echo "The --to-url option is required for the migrate_urls command.\n";
if ( ! isset( $options['new-site-url'] ) ) {
echo "The --new-site-url option is required for the migrate_urls command.\n";
exit( 1 );
}
$parsed_prev_url = URL::parse( $options['from-site-url'] );
$next_url = $options['to-url'];
$parsed_new_url = URL::parse( $next_url );
echo "Replacing $previous_url with $next_url in the input.\n";
$parsed_current_site_url = WP_URL::parse( $options['current-site-url'] );
$string_new_site_url = $options['new-site-url'];
$parsed_new_site_url = WP_URL::parse( $string_new_site_url );

echo "Replacing $base_url with $string_new_site_url in the input.\n";
// Caveat printed before the rewritten markup. The message is split across two
// echo calls, so the first string must end with a space; otherwise the output
// reads "the newsite might be hosted".
echo "Note this is not yet enough to migrate the site as both the previous and the new ";
echo "site might be hosted on specific paths.\n\n";
while ( $p->next_url() ) {
$updated = false;
$url = $p->get_url();
$parsed_url = URL::parse( $url, $parsed_prev_url );
if ( $parsed_url->hostname === $parsed_prev_url->hostname ) {
$parsed_url->hostname = $parsed_new_url->hostname;
if ( str_starts_with( $parsed_url->pathname, $parsed_prev_url->pathname ) ) {
$parsed_url->pathname = $parsed_new_url->pathname . substr( $parsed_url->pathname, strlen( $parsed_prev_url->pathname ) );
$updated = false;
$matched_url = $p->get_raw_url();
$parsed_matched_url = $p->get_parsed_url();
if ( $parsed_matched_url->hostname === $parsed_current_site_url->hostname ) {
$parsed_matched_url->hostname = $parsed_new_site_url->hostname;
if ( str_starts_with( $parsed_matched_url->pathname, $parsed_current_site_url->pathname ) ) {
$parsed_matched_url->pathname = $parsed_new_site_url->pathname . substr( $parsed_matched_url->pathname,
strlen( $parsed_current_site_url->pathname ) );
}

/*
* Stylistic choice – if the matched URL has no trailing slash,
* do not add it to the new URL. The WHATWG URL parser will
* add one automatically if the path is empty, so we have to
* explicitly remove it.
*/
$new_raw_url = $parsed_matched_url->toString();
if (
$matched_url[ strlen( $matched_url ) - 1 ] !== '/' &&
$parsed_matched_url->pathname === '/' &&
$parsed_matched_url->search === '' &&
$parsed_matched_url->hash === ''
) {
$new_raw_url = rtrim( $new_raw_url, '/' );
}
$p->set_url( $parsed_url->toString() );
$p->set_raw_url( $new_raw_url );
}
}
echo $p->get_updated_html();
Expand Down
1 change: 1 addition & 0 deletions transfer-protocol/bootstrap.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
require_once __DIR__ . '/src/WP_Block_Markup_Processor.php';
require_once __DIR__ . '/src/WP_Block_Markup_Url_Processor.php';
require_once __DIR__ . '/src/WP_Migration_URL_In_Text_Processor.php';
require_once __DIR__ . '/src/WP_URL.php';
require_once __DIR__ . '/vendor/autoload.php';

function _doing_it_wrong() {
Expand Down
2 changes: 1 addition & 1 deletion transfer-protocol/married-short.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<!-- /wp:image -->

<!-- wp:paragraph -->
<p>During the <a href="writeofpassage.school">Write of Passage</a>, I stubbornly tried to beat my writer’s block by writing until 3am multiple times. The burnout returned. I dropped everything and went to Greece for a week.</p>
<p>During the <a href="//writeofpassage.school/">Write of Passage</a>, I stubbornly tried to beat my writer’s block by writing until 3am multiple times. The burnout returned. I dropped everything and went to Greece for a week.</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
Expand Down
19 changes: 9 additions & 10 deletions transfer-protocol/phpunit.xml
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
<phpunit
bootstrap="bootstrap.php"
colors="true"
verbose="true"
>
<testsuites>
<testsuite name="Project Test Suite">
<directory>./tests</directory>
</testsuite>
</testsuites>
<?xml version="1.0"?>
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" bootstrap="bootstrap.php" colors="true" xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/10.0/phpunit.xsd" cacheDirectory=".phpunit.cache">
<testsuites>
<testsuite name="Application Test Suite">
<file>tests/WPBlockMarkupProcessorTests.php</file>
<file>tests/WPBlockMarkupUrlProcessorTests.php</file>
<file>tests/WPMigrationURLInTextProcessorTests.php</file>
</testsuite>
</testsuites>
</phpunit>
13 changes: 3 additions & 10 deletions transfer-protocol/run-tests.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,4 @@
#!/bin/bash
#COMMAND="phpunit tests/WP_Migration_*"
#COMMAND="phpunit tests/*.php"
#COMMAND="phpunit tests/WP_Block_Markup_Url_Processor_Tests.php"
#COMMAND="phpunit -c phpunit.xml"
#$COMMAND
#fswatch -o ./**/*.php | xargs -n1 -I{} $COMMAND

for i in $(ls tests/*.php | grep -v URL_Parser); do
phpunit $i
done
COMMAND="phpunit -c ./phpunit.xml"
$COMMAND
fswatch -o ./**/*.php | xargs -n1 -I{} $COMMAND
25 changes: 0 additions & 25 deletions transfer-protocol/src/WP_Block_Markup_Processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -144,31 +144,6 @@ public function get_block_attributes() {
return $this->block_attributes;
}

/**
 * Replaces all attributes of the currently matched block.
 *
 * The call is rejected, and reported through _doing_it_wrong(), when no block
 * is matched ($this->block_name is null) or when an attribute iteration
 * started with next_block_attribute() is still in progress
 * ($this->block_attributes_iterator is not null).
 *
 * @param array $new_attributes Attributes to store in place of the current ones.
 * @return bool True when the attributes were replaced, false when the call was rejected.
 */
public function set_block_attributes( array $new_attributes ) {
    if ( null === $this->block_name ) {
        _doing_it_wrong(
            __METHOD__,
            __( 'Cannot set block attributes when not in `block_attributes` state' ),
            'WP_VERSION'
        );

        return false;
    }

    if ( null !== $this->block_attributes_iterator ) {
        _doing_it_wrong(
            __METHOD__,
            __( 'Cannot override all the block attributes when iterating over the existing attributes with next_block_attribute()' ),
            'WP_VERSION'
        );

        return false;
    }

    // Flag the attributes as changed so the update is not lost.
    $this->block_attributes_updated = true;
    $this->block_attributes         = $new_attributes;

    // Report success explicitly. Previously the method fell off the end and
    // returned null, which is as falsy as the `false` used for failures.
    return true;
}

/**
 * Tells whether the current token is a block closer.
 *
 * @return bool True only when a block name is set and the block_closer
 *              flag is strictly true.
 */
public function is_block_closer() {
    // Without a block name there is no block, so there is no closer either.
    if ( null === $this->block_name ) {
        return false;
    }

    return true === $this->block_closer;
}
Expand Down
64 changes: 39 additions & 25 deletions transfer-protocol/src/WP_Block_Markup_Url_Processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@
*/
class WP_Block_Markup_Url_Processor extends WP_Block_Markup_Processor {

private $url;
private $raw_url;
/**
* @var URL
*/
private $parsed_url;
private $base_url;
private $url_in_text_processor;
private $url_in_text_node_updated;
Expand All @@ -27,14 +31,19 @@ public function get_updated_html() {
return parent::get_updated_html();
}

public function get_url() {
return $this->url;
public function get_raw_url() {
return $this->raw_url;
}

/**
 * Returns the parsed form of the most recently matched URL.
 *
 * The value is stored alongside the raw URL string whenever a URL is matched
 * and is cleared to null by next_token().
 *
 * @return URL|null The parsed URL object, or null when no URL is currently matched.
 */
public function get_parsed_url() {
return $this->parsed_url;
}

public function next_token() {
$this->get_updated_html();

$this->url = null;
$this->raw_url = null;
$this->parsed_url = null;
$this->inspected_url_attribute_idx = - 1;
$this->url_in_text_processor = null;
// Do not reset url_in_text_node_updated – it's reset in get_updated_html() which
Expand All @@ -54,7 +63,7 @@ public function next_url() {
}

public function next_url_in_current_token() {
$this->url = null;
$this->raw_url = null;
switch ( parent::get_token_type() ) {
case '#tag':
return $this->next_url_attribute();
Expand All @@ -68,17 +77,11 @@ public function next_url_in_current_token() {
}

private function next_url_in_text_node() {

if ( $this->get_token_type() !== '#text' ) {
return false;
}

if ( null === $this->url_in_text_processor ) {
$this->url_in_text_processor = new WP_Migration_URL_In_Text_Processor( $this->get_modifiable_text() );
}

while ( $this->url_in_text_processor->next_url() ) {
$url = $this->url_in_text_processor->get_url();
/*
* Use the base URL for URLs matched in text nodes. This is the only
* way to recognize a substring "WordPress.org" as a URL. We might
Expand All @@ -90,11 +93,14 @@ private function next_url_in_text_node() {
* to filter out such false positives e.g. by checking the domain against
* a list of accepted domains, or the TLD against a list of public suffixes.
*/
if ( URL::canParse( $url, $this->base_url ) ) {
$this->url = $url;
$this->url_in_text_processor = new WP_Migration_URL_In_Text_Processor( $this->get_modifiable_text(), $this->base_url );
}

return true;
}
while ( $this->url_in_text_processor->next_url() ) {
$this->raw_url = $this->url_in_text_processor->get_raw_url();
$this->parsed_url = $this->url_in_text_processor->get_parsed_url();

return true;
}

return false;
Expand All @@ -109,7 +115,7 @@ private function next_url_attribute() {
return false;
}

while ( ++$this->inspected_url_attribute_idx < count( self::URL_ATTRIBUTES[ $tag ] ) ) {
while ( ++ $this->inspected_url_attribute_idx < count( self::URL_ATTRIBUTES[ $tag ] ) ) {
$attr = self::URL_ATTRIBUTES[ $tag ][ $this->inspected_url_attribute_idx ];
if ( false === $attr ) {
return false;
Expand All @@ -123,10 +129,14 @@ private function next_url_attribute() {
* be correctly recognized as a URL.
* Without a base URL, this Processor would incorrectly skip it.
*/
if ( is_string( $url_maybe ) && URL::canParse( $url_maybe, $this->base_url ) ) {
$this->url = $url_maybe;
if ( is_string( $url_maybe ) ) {
$parsed_url = WP_URL::parse( $url_maybe, $this->base_url );
if ( false !== $parsed_url ) {
$this->raw_url = $url_maybe;
$this->parsed_url = $parsed_url;

return true;
return true;
}
}
}

Expand All @@ -143,18 +153,22 @@ private function next_url_block_attribute() {
* When a base URL is missing, the string must start with a protocol to
* be considered a URL.
*/
if ( is_string( $url_maybe ) && URL::canParse( $url_maybe ) ) {
$this->url = $url_maybe;
if ( is_string( $url_maybe ) ) {
$parsed_url = WP_URL::parse( $url_maybe );
if ( false !== $parsed_url ) {
$this->raw_url = $url_maybe;
$this->parsed_url = $parsed_url;

return true;
return true;
}
}
}

return false;
}

public function set_url( $new_url ) {
if ( null === $this->url ) {
public function set_raw_url( $new_url ) {
if ( null === $this->raw_url ) {
return false;
}
switch ( parent::get_token_type() ) {
Expand All @@ -176,7 +190,7 @@ public function set_url( $new_url ) {
}
$this->url_in_text_node_updated = true;

return $this->url_in_text_processor->set_url( $new_url );
return $this->url_in_text_processor->set_raw_url( $new_url );
}
}

Expand Down
Loading

0 comments on commit 19008b7

Please sign in to comment.