This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Support rendering previews with data: URLs in them #11767

Merged on Jan 24, 2022 (16 commits)
1 change: 1 addition & 0 deletions changelog.d/11767.bugfix
@@ -0,0 +1 @@
Fix a long-standing bug when previewing Reddit URLs which do not contain an image.
31 changes: 25 additions & 6 deletions synapse/rest/media/v1/preview_html.py
@@ -321,14 +321,33 @@ def _iterate_over_text(


def rebase_url(url: str, base: str) -> str:
base_parts = list(urlparse.urlparse(base))
"""
Resolves a potentially relative `url` against an absolute `base` URL.
Contributor:
Do you know how this differs from urljoin in the standard lib?
Not necessarily opposed to rolling our own, but it'd be nice to know if we had specific motivation for doing so.

Member (author):
I didn't know of it, but I'll look into it!

Member (author):
I think this will work OK, but I'd rather punt this to a separate PR since it is pretty unrelated to this work. Would that be OK?

Contributor:
yeah OK


For example:

>>> rebase_url("subpage", "https://example.com/foo/")
'https://example.com/foo/subpage'
>>> rebase_url("sibling", "https://example.com/foo")
'https://example.com/sibling'
>>> rebase_url("/bar", "https://example.com/foo/")
'https://example.com/bar'
>>> rebase_url("https://alice.com/a/", "https://example.com/foo/")
'https://alice.com/a/'
"""
base_parts = urlparse.urlparse(base)
# Convert the parsed URL to a list for (potential) modification.
url_parts = list(urlparse.urlparse(url))
if not url_parts[0]: # fix up schema
url_parts[0] = base_parts[0] or "http"
if not url_parts[1]: # fix up hostname
url_parts[1] = base_parts[1]
# Add a scheme, if one does not exist.
if not url_parts[0]:
url_parts[0] = base_parts.scheme or "http"
# Fix up the hostname, if this is not a data URL.
if url_parts[0] != "data" and not url_parts[1]:
url_parts[1] = base_parts.netloc
# If the path does not start with a /, nest it under the base path's last
# directory.
if not url_parts[2].startswith("/"):
url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts[2]) + url_parts[2]
url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts.path) + url_parts[2]
return urlparse.urlunparse(url_parts)
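
For context on the urljoin question raised in the thread above, a quick comparison one could run against the docstring examples is sketched below. It is illustrative only and not part of the change; note that urllib.parse.urljoin takes the base URL as its first argument, the reverse of rebase_url.

from urllib.parse import urljoin

# urljoin appears to agree with rebase_url on the documented examples,
# including leaving data: URLs untouched.
assert urljoin("https://example.com/foo/", "subpage") == "https://example.com/foo/subpage"
assert urljoin("https://example.com/foo", "sibling") == "https://example.com/sibling"
assert urljoin("https://example.com/foo/", "/bar") == "https://example.com/bar"
assert urljoin("https://example.com/foo/", "https://alice.com/a/") == "https://alice.com/a/"
assert urljoin("https://example.com/foo/", "data:,Hello%2C%20World%21") == "data:,Hello%2C%20World%21"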


208 changes: 160 additions & 48 deletions synapse/rest/media/v1/preview_url_resource.py
@@ -21,8 +21,9 @@
import shutil
import sys
import traceback
from typing import TYPE_CHECKING, Iterable, Optional, Tuple
from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
from urllib import parse as urlparse
from urllib.request import urlopen

import attr

@@ -256,7 +257,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
if oembed_url:
url_to_download = oembed_url

media_info = await self._download_url(url_to_download, user)
media_info = await self._handle_url(url_to_download, user)

logger.debug("got media_info of '%s'", media_info)

@@ -297,7 +298,9 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
oembed_url = self._oembed.autodiscover_from_html(tree)
og_from_oembed: JsonDict = {}
if oembed_url:
oembed_info = await self._download_url(oembed_url, user)
oembed_info = await self._handle_url(
oembed_url, user, allow_data_urls=True
)
(
og_from_oembed,
author_name,
@@ -367,7 +370,134 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:

return jsonog.encode("utf8")

async def _download_url(self, url: str, user: UserID) -> MediaInfo:
async def _download_url(
self, url: str, output_stream: BinaryIO
) -> Tuple[int, str, int, str, Optional[str], int, Optional[str]]:
"""
Fetches a remote URL and parses the headers.

Args:
url: The URL to fetch.
output_stream: The stream to write the content to.

Returns:
A tuple of:
Media length, URL downloaded, the HTTP response code,
the media type, the downloaded file name, the number of
milliseconds the result is valid for, the etag header.
"""

try:
logger.debug("Trying to get preview for url '%s'", url)
length, headers, uri, code = await self.client.get_file(
url,
output_stream=output_stream,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url, e)

raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)

if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
else:
media_type = "application/octet-stream"

download_name = get_filename_from_headers(headers)

# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
expires = ONE_HOUR
etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None

return length, uri, code, media_type, download_name, expires, etag
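
On the FIXME above about honouring Cache-Control: a minimal sketch of what that might look like is below. It assumes the headers mapping shape used in this function (bytes keys to lists of byte values) and a ONE_HOUR constant in milliseconds; the helper name expiry_from_headers is hypothetical and not part of this PR.

import re
from typing import Dict, List

ONE_HOUR = 60 * 60 * 1000  # assumed to be milliseconds, as elsewhere in this module

def expiry_from_headers(headers: Dict[bytes, List[bytes]]) -> int:
    """Best-effort expiry in milliseconds from Cache-Control, defaulting to one hour."""
    cache_control = headers.get(b"Cache-Control", [b""])[0].decode("ascii")
    match = re.search(r"max-age=(\d+)", cache_control)
    if match:
        return int(match.group(1)) * 1000
    return ONE_HOUR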

async def _parse_data_url(
self, url: str, output_stream: BinaryIO
) -> Tuple[int, str, int, str, Optional[str], int, Optional[str]]:
"""
Parses a data: URL.

Args:
url: The URL to parse.
output_stream: The stream to write the content to.

Returns:
A tuple of:
Media length, URL downloaded, the HTTP response code,
the media type, the downloaded file name, the number of
milliseconds the result is valid for, the etag header.
"""

try:
logger.debug("Trying to parse data url '%s'", url)
with urlopen(url) as url_info:
# TODO Can this be more efficient?
output_stream.write(url_info.read())
except Exception as e:
logger.warning("Error parsing data: URL %s: %r", url, e)

raise SynapseError(
500,
"Failed to parse data URL: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)

# Read back the length that has been written.
length = output_stream.tell()

# urlopen shoves the media-type from the data URL into the content type
# header object.
media_type = url_info.headers.get_content_type()
Contributor:
Should we cross-reference this with the mime-type in the data URL?
(Do we want to have some kind of whitelist of trusted mime types which are okay to preview?)

Member (author):
This is the mime-type from the data URL; it gets parsed out for us and shoved into a headers property for some reason.

The docs for urlopen have some info on how data URLs are handled:

    For FTP, file, and data URLs [...], this function returns a urllib.response.addinfourl object.

The addinfourl object has a headers property:

    Returns the headers of the response in the form of an EmailMessage instance.

The EmailMessage object has a get_content_type() method:

    Return the message's content type, coerced to lower case of the form maintype/subtype. If there is no Content-Type header in the message return the value returned by get_default_type(). If the Content-Type header is invalid, return text/plain.

This all doesn't really make sense for data URLs, but from trial and error it seems to do the correct thing.

Member (author):
I added a clarifying comment in 116015f -- hopefully that helps?
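
To make the behaviour described above concrete, here is a small standalone sketch (not part of the PR) of how urlopen treats a data: URL; the example URL and variable names are illustrative only.

from urllib.request import urlopen

# urlopen handles data: URLs directly; the mime type embedded in the URL is
# exposed via the response's headers, which is an EmailMessage instance.
with urlopen("data:text/plain;base64,SGVsbG8h") as resp:
    body = resp.read()                            # b"Hello!"
    media_type = resp.headers.get_content_type()  # "text/plain"

# The response is file-like, so a streaming copy such as
# shutil.copyfileobj(resp, output_stream) could also address the
# efficiency TODO above.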


# Some features are not supported by data: URLs.
download_name = None
expires = ONE_HOUR
etag = None

return length, url, 200, media_type, download_name, expires, etag

async def _handle_url(
self, url: str, user: UserID, allow_data_urls: bool = False
) -> MediaInfo:
"""
Fetches content from a URL and parses the result to generate a MediaInfo.

It uses the media storage provider to persist the fetched content and
stores the mapping into the database.

Args:
url: The URL to fetch.
user: The user who has requested this URL.
allow_data_urls: True if data URLs should be allowed.

Returns:
A MediaInfo object describing the fetched content.
"""

# TODO: we should probably honour robots.txt... except in practice
# we're most likely being explicitly triggered by a human rather than a
# bot, so are we really a robot?
@@ -377,51 +507,33 @@ async def _download_url(self, url: str, user: UserID) -> MediaInfo:
file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)

with self.media_storage.store_into_file(file_info) as (f, fname, finish):
try:
logger.debug("Trying to get preview for url '%s'", url)
length, headers, uri, code = await self.client.get_file(
url,
output_stream=f,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url, e)

raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)
await finish()
if url.startswith("data:"):
if not allow_data_urls:
raise SynapseError(
500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN
)

if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
(
length,
uri,
code,
media_type,
download_name,
expires,
etag,
) = await self._parse_data_url(url, f)
else:
Member (author):
We might want to assert HTTP(S) schemes here as additional error checking.

Contributor:
Yeah, I feel like I'd be happier having a few select allowed protocols rather than singling out data:; not sure it really makes sense to ask us to preview an ftp: URI either, for example.

Member (author):
Yep, note that those will fail right now (we won't try them), but it would be better to be explicit, I think!

Member (author):
I think this might actually be tougher to get correct than I initially thought, since we seem to support previewing scheme-less URLs. I'm not sure it makes sense to refactor that as part of this.

Contributor:
Yeah, fair enough.
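
A rough sketch of the allow-list idea discussed in this thread (illustrative only, not part of this PR; the helper name and exception type are placeholders, and scheme-less URLs are deliberately let through to match current behaviour):

from urllib.parse import urlparse

ALLOWED_SCHEMES = {"http", "https"}

def check_scheme(url: str, allow_data_urls: bool = False) -> None:
    """Reject URLs whose scheme is not on a small allow-list."""
    scheme = urlparse(url).scheme
    allowed = ALLOWED_SCHEMES | ({"data"} if allow_data_urls else set())
    # Scheme-less URLs are currently accepted for previews, so an empty
    # scheme is not rejected here.
    if scheme and scheme not in allowed:
        raise ValueError(f"Previewing of {scheme}: URLs is forbidden")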

media_type = "application/octet-stream"

download_name = get_filename_from_headers(headers)
(
length,
uri,
code,
media_type,
download_name,
expires,
etag,
) = await self._download_url(url, f)

# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
expires = ONE_HOUR
etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
await finish()

try:
time_now_ms = self.clock.time_msec()
@@ -474,8 +586,8 @@ async def _precache_image_url(
# FIXME: it might be cleaner to use the same flow as the main /preview_url
# request itself and benefit from the same caching etc. But for now we
# just rely on the caching on the master request to speed things up.
image_info = await self._download_url(
rebase_url(og["og:image"], media_info.uri), user
image_info = await self._handle_url(
rebase_url(og["og:image"], media_info.uri), user, allow_data_urls=True
)

if _is_media(image_info.media_type):
@@ -16,10 +16,11 @@
_get_html_media_encodings,
decode_body,
parse_html_to_open_graph,
rebase_url,
summarize_paragraphs,
)

from . import unittest
from tests import unittest

try:
import lxml
@@ -447,3 +448,34 @@ def test_unknown_invalid(self):
'text/html; charset="invalid"',
)
self.assertEqual(list(encodings), ["utf-8", "cp1252"])


class RebaseUrlTestCase(unittest.TestCase):
def test_relative(self):
"""Relative URLs should be resolved based on the context of the base URL."""
self.assertEqual(
rebase_url("subpage", "https://example.com/foo/"),
"https://example.com/foo/subpage",
)
self.assertEqual(
rebase_url("sibling", "https://example.com/foo"),
"https://example.com/sibling",
)
self.assertEqual(
rebase_url("/bar", "https://example.com/foo/"),
"https://example.com/bar",
)

def test_absolute(self):
"""Absolute URLs should not be modified."""
self.assertEqual(
rebase_url("https://alice.com/a/", "https://example.com/foo/"),
"https://alice.com/a/",
)

def test_data(self):
"""Data URLs should not be modified."""
self.assertEqual(
rebase_url("data:,Hello%2C%20World%21", "https://example.com/foo/"),
"data:,Hello%2C%20World%21",
)