From c827d8df740aa2fd198097b6a1f8f81a21bf2df8 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sun, 3 May 2020 22:56:51 +0200 Subject: [PATCH] spouts\rss: Provide unencoded link SimplePie returns links escaped with htmlspecialchars but we might want to use the URL to fetch additional data so we need to unescape it first. ContentLoader will escape it again before saving it to the database. I went through all the spouts looking for returned values that are used as URLs; Reddit escapes its URLs too. Ideally, everything would return raw data and we would escape it when displayed but that would be a pain since we would likely have to convert the already stored data. --- NEWS.md | 1 + src/helpers/FeedReader.php | 8 ++++++-- src/spouts/reddit/reddit2.php | 3 ++- src/spouts/rss/feed.php | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 69572b6e26..91828de8a4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,7 @@ - Set 60 second timeout to spout HTTP requests to prevent a single feed blocking other updates ([#1104](https://github.com/SSilence/selfoss/issues/1104)) - Significantly improved accessibility ([#1133](https://github.com/SSilence/selfoss/pull/1133), [#1134](https://github.com/SSilence/selfoss/pull/1134) and [#1141](https://github.com/SSilence/selfoss/pull/1141)) - Fixed marking more than 1000 items as read at the same time ([#1182](https://github.com/SSilence/selfoss/issues/1182)) +- Fixed loading full text on pages containing ampersands in URLs ([#1188](https://github.com/SSilence/selfoss/pull/1188)) ### API changes - `tags` attribute is now consistently array of strings, numbers are numbers and booleans are booleans. 
**This might break third-party clients that have not updated yet.** ([#948](https://github.com/SSilence/selfoss/pull/948)) diff --git a/src/helpers/FeedReader.php b/src/helpers/FeedReader.php index db320f8ba0..b9888c5b3b 100644 --- a/src/helpers/FeedReader.php +++ b/src/helpers/FeedReader.php @@ -61,7 +61,7 @@ public function load($url) { return [ // save fetched items 'items' => $this->simplepie->get_items(), - 'htmlUrl' => @$this->simplepie->get_link(), + 'htmlUrl' => htmlspecialchars_decode($this->simplepie->get_link(), ENT_COMPAT), // SimplePie sanitizes URLs 'spoutTitle' => $this->simplepie->get_title(), ]; } @@ -72,7 +72,7 @@ public function load($url) { * @return ?string */ public function getImageUrl() { - return $this->simplepie->get_image_url(); + return htmlspecialchars_decode($this->simplepie->get_image_url(), ENT_COMPAT); // SimplePie sanitizes URLs } /** @@ -81,6 +81,10 @@ public function getImageUrl() { * @return ?string */ public function getFeedUrl() { + // SimplePie sanitizes URLs but it unescapes ampersands here. + // Since double quotes and angle brackets are excluded from URIs, + // we need not worry about them and consider this unescaped. + // https://tools.ietf.org/html/rfc2396#section-2.4.3 return $this->simplepie->subscribe_url(); } diff --git a/src/spouts/reddit/reddit2.php b/src/spouts/reddit/reddit2.php index 874c5d0212..8b6ac4ce83 100644 --- a/src/spouts/reddit/reddit2.php +++ b/src/spouts/reddit/reddit2.php @@ -118,7 +118,8 @@ public function getTitle() { public function getHtmlUrl() { if ($this->items !== null && $this->valid()) { - return @current($this->items)['data']['url']; + // Reddit escapes HTML, we can get away with just ampersands, since quotes and angle brackets are excluded from URLs. 
+ return htmlspecialchars_decode(current($this->items)['data']['url'], ENT_NOQUOTES); } return null; diff --git a/src/spouts/rss/feed.php b/src/spouts/rss/feed.php index fae3fa60b7..b3c4a08ff8 100644 --- a/src/spouts/rss/feed.php +++ b/src/spouts/rss/feed.php @@ -141,7 +141,7 @@ public function getLink() { if ($this->items !== null && $this->valid()) { $link = @current($this->items)->get_link(); - return $link; + return htmlspecialchars_decode($link, ENT_COMPAT); // SimplePie sanitizes URLs } return null;