This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Support rendering previews with data: URLs in them #11767

Merged on Jan 24, 2022 (16 commits)
1 change: 1 addition & 0 deletions changelog.d/11767.bugfix
@@ -0,0 +1 @@
Fix a long-standing bug when previewing Reddit URLs which do not contain an image.
31 changes: 25 additions & 6 deletions synapse/rest/media/v1/preview_html.py
@@ -321,14 +321,33 @@ def _iterate_over_text(


def rebase_url(url: str, base: str) -> str:
base_parts = list(urlparse.urlparse(base))
"""
Resolves a potentially relative `url` against an absolute `base` URL.
Contributor:
Do you know how this differs from urljoin in the standard lib?
Not necessarily opposed to rolling our own, but it'd be nice to know if we had specific motivation for doing so.

Member (author):
I didn't know of it, but I'll look into it!

Member (author):
I think this will work OK, but I'd rather punt this to a separate PR since it is pretty unrelated to this work. Would that be OK?

Contributor:
yeah OK


For example:

>>> rebase_url("subpage", "https://example.com/foo/")
'https://example.com/foo/subpage'
>>> rebase_url("sibling", "https://example.com/foo")
'https://example.com/sibling'
>>> rebase_url("/bar", "https://example.com/foo/")
'https://example.com/bar'
>>> rebase_url("https://alice.com/a/", "https://example.com/foo/")
'https://alice.com/a/'
"""
base_parts = urlparse.urlparse(base)
# Convert the parsed URL to a list for (potential) modification.
url_parts = list(urlparse.urlparse(url))
if not url_parts[0]: # fix up schema
url_parts[0] = base_parts[0] or "http"
if not url_parts[1]: # fix up hostname
url_parts[1] = base_parts[1]
# Add a scheme, if one does not exist.
if not url_parts[0]:
url_parts[0] = base_parts.scheme or "http"
# Fix up the hostname, if this is not a data URL.
if url_parts[0] != "data" and not url_parts[1]:
url_parts[1] = base_parts.netloc
# If the path does not start with a /, nest it under the base path's last
# directory.
if not url_parts[2].startswith("/"):
url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts[2]) + url_parts[2]
url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts.path) + url_parts[2]
return urlparse.urlunparse(url_parts)
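
For context on the urljoin question raised in the thread above, a quick comparison one could run against the docstring examples is sketched below. It is illustrative only and not part of the change; note that urllib.parse.urljoin takes the base URL as its first argument, the reverse of rebase_url.

from urllib.parse import urljoin

# urljoin appears to agree with rebase_url on the documented examples,
# including leaving data: URLs untouched.
assert urljoin("https://example.com/foo/", "subpage") == "https://example.com/foo/subpage"
assert urljoin("https://example.com/foo", "sibling") == "https://example.com/sibling"
assert urljoin("https://example.com/foo/", "/bar") == "https://example.com/bar"
assert urljoin("https://example.com/foo/", "https://alice.com/a/") == "https://alice.com/a/"
assert urljoin("https://example.com/foo/", "data:,Hello%2C%20World%21") == "data:,Hello%2C%20World%21"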


208 changes: 160 additions & 48 deletions synapse/rest/media/v1/preview_url_resource.py
@@ -21,8 +21,9 @@
import shutil
import sys
import traceback
from typing import TYPE_CHECKING, Iterable, Optional, Tuple
from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
from urllib import parse as urlparse
from urllib.request import urlopen

import attr

@@ -256,7 +257,7 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
if oembed_url:
url_to_download = oembed_url

media_info = await self._download_url(url_to_download, user)
media_info = await self._handle_url(url_to_download, user)

logger.debug("got media_info of '%s'", media_info)

@@ -297,7 +298,9 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
oembed_url = self._oembed.autodiscover_from_html(tree)
og_from_oembed: JsonDict = {}
if oembed_url:
oembed_info = await self._download_url(oembed_url, user)
oembed_info = await self._handle_url(
oembed_url, user, allow_data_urls=True
)
(
og_from_oembed,
author_name,
@@ -367,7 +370,134 @@ async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:

return jsonog.encode("utf8")

async def _download_url(self, url: str, user: UserID) -> MediaInfo:
async def _download_url(
self, url: str, output_stream: BinaryIO
) -> Tuple[int, str, int, str, Optional[str], int, Optional[str]]:
"""
Fetches a remote URL and parses the headers.

Args:
url: The URL to fetch.
output_stream: The stream to write the content to.

Returns:
A tuple of:
Media length, URL downloaded, the HTTP response code,
the media type, the downloaded file name, the number of
milliseconds the result is valid for, the etag header.
"""

try:
logger.debug("Trying to get preview for url '%s'", url)
length, headers, uri, code = await self.client.get_file(
url,
output_stream=output_stream,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url, e)

raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)

if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
else:
media_type = "application/octet-stream"

download_name = get_filename_from_headers(headers)

# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
expires = ONE_HOUR
etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None

return length, uri, code, media_type, download_name, expires, etag
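
On the FIXME above about honouring Cache-Control: a minimal sketch of what that might look like is below. It assumes the headers mapping shape used in this function (bytes keys to lists of byte values) and a ONE_HOUR constant in milliseconds; the helper name expiry_from_headers is hypothetical and not part of this PR.

import re
from typing import Dict, List

ONE_HOUR = 60 * 60 * 1000  # assumed to be milliseconds, as elsewhere in this module

def expiry_from_headers(headers: Dict[bytes, List[bytes]]) -> int:
    """Best-effort expiry in milliseconds from Cache-Control, defaulting to one hour."""
    cache_control = headers.get(b"Cache-Control", [b""])[0].decode("ascii")
    match = re.search(r"max-age=(\d+)", cache_control)
    if match:
        return int(match.group(1)) * 1000
    return ONE_HOUR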

async def _parse_data_url(
self, url: str, output_stream: BinaryIO
) -> Tuple[int, str, int, str, Optional[str], int, Optional[str]]:
"""
Parses a data: URL.

Args:
url: The URL to parse.
output_stream: The stream to write the content to.

Returns:
A tuple of:
Media length, URL downloaded, the HTTP response code,
the media type, the downloaded file name, the number of
milliseconds the result is valid for, the etag header.
"""

try:
logger.debug("Trying to parse data url '%s'", url)
with urlopen(url) as url_info:
# TODO Can this be more efficient?
output_stream.write(url_info.read())
except Exception as e:
logger.warning("Error parsing data: URL %s: %r", url, e)

raise SynapseError(
500,
"Failed to parse data URL: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)

# Read back the length that has been written.
length = output_stream.tell()

# urlopen shoves the media-type from the data URL into the content type
# header object.
media_type = url_info.headers.get_content_type()
Contributor:
Should we cross-reference this with the mime-type in the data URL?
(Do we want to have some kind of whitelist of trusted mime types which are okay to preview?)

Member (author):
This is the mime-type from the data URL; it gets parsed out for us and shoved into a headers property for some reason.

The docs for urlopen have some info on how data URLs are handled:

    For FTP, file, and data URLs [...], this function returns a urllib.response.addinfourl object.

The addinfourl object has a headers property:

    Returns the headers of the response in the form of an EmailMessage instance.

The EmailMessage object has a get_content_type() method:

    Return the message's content type, coerced to lower case of the form maintype/subtype. If there is no Content-Type header in the message return the value returned by get_default_type(). If the Content-Type header is invalid, return text/plain.

This all doesn't really make sense for data URLs, but from trial and error it seems to do the correct thing.

Member (author):
I added a clarifying comment in 116015f -- hopefully that helps?
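
To make the behaviour described above concrete, here is a small standalone sketch (not part of the PR) of how urlopen treats a data: URL; the example URL and variable names are illustrative only.

from urllib.request import urlopen

# urlopen handles data: URLs directly; the mime type embedded in the URL is
# exposed via the response's headers, which is an EmailMessage instance.
with urlopen("data:text/plain;base64,SGVsbG8h") as resp:
    body = resp.read()                            # b"Hello!"
    media_type = resp.headers.get_content_type()  # "text/plain"

# The response is file-like, so a streaming copy such as
# shutil.copyfileobj(resp, output_stream) could also address the
# efficiency TODO above.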


# Some features are not supported by data: URLs.
download_name = None
expires = ONE_HOUR
etag = None

return length, url, 200, media_type, download_name, expires, etag

async def _handle_url(
self, url: str, user: UserID, allow_data_urls: bool = False
) -> MediaInfo:
"""
Fetches content from a URL and parses the result to generate a MediaInfo.

It uses the media storage provider to persist the fetched content and
stores the mapping into the database.

Args:
url: The URL to fetch.
user: The user who has requested this URL.
allow_data_urls: True if data URLs should be allowed.

Returns:
A MediaInfo object describing the fetched content.
"""

# TODO: we should probably honour robots.txt... except in practice
# we're most likely being explicitly triggered by a human rather than a
# bot, so are we really a robot?
@@ -377,51 +507,33 @@ async def _download_url(self, url: str, user: UserID) -> MediaInfo:
file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)

with self.media_storage.store_into_file(file_info) as (f, fname, finish):
try:
logger.debug("Trying to get preview for url '%s'", url)
length, headers, uri, code = await self.client.get_file(
url,
output_stream=f,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url, e)

raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)
await finish()
if url.startswith("data:"):
if not allow_data_urls:
raise SynapseError(
500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN
)

if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
(
length,
uri,
code,
media_type,
download_name,
expires,
etag,
) = await self._parse_data_url(url, f)
else:
Member (author):
We might want to assert HTTP(S) schemes here as additional error checking.

Contributor:
Yeah, I feel like I'd be happier having a few select allowed protocols rather than singling out data:; not sure it really makes sense to ask us to preview an ftp: URI either, for example.

Member (author):
Yep, note that those will fail right now (we won't try them), but it would be better to be explicit, I think!

Member (author):
I think this might actually be tougher to get correct than I initially thought, since we seem to support previewing scheme-less URLs. I'm not sure it makes sense to refactor that as part of this.

Contributor:
Yeah, fair enough.
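
A rough sketch of the allow-list idea discussed in this thread (illustrative only, not part of this PR; the helper name and exception type are placeholders, and scheme-less URLs are deliberately let through to match current behaviour):

from urllib.parse import urlparse

ALLOWED_SCHEMES = {"http", "https"}

def check_scheme(url: str, allow_data_urls: bool = False) -> None:
    """Reject URLs whose scheme is not on a small allow-list."""
    scheme = urlparse(url).scheme
    allowed = ALLOWED_SCHEMES | ({"data"} if allow_data_urls else set())
    # Scheme-less URLs are currently accepted for previews, so an empty
    # scheme is not rejected here.
    if scheme and scheme not in allowed:
        raise ValueError(f"Previewing of {scheme}: URLs is forbidden")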

media_type = "application/octet-stream"

download_name = get_filename_from_headers(headers)
(
length,
uri,
code,
media_type,
download_name,
expires,
etag,
) = await self._download_url(url, f)

# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
expires = ONE_HOUR
etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
await finish()

try:
time_now_ms = self.clock.time_msec()
@@ -474,8 +586,8 @@ async def _precache_image_url(
# FIXME: it might be cleaner to use the same flow as the main /preview_url
# request itself and benefit from the same caching etc. But for now we
# just rely on the caching on the master request to speed things up.
image_info = await self._download_url(
rebase_url(og["og:image"], media_info.uri), user
image_info = await self._handle_url(
rebase_url(og["og:image"], media_info.uri), user, allow_data_urls=True
)

if _is_media(image_info.media_type):
@@ -16,10 +16,11 @@
_get_html_media_encodings,
decode_body,
parse_html_to_open_graph,
rebase_url,
summarize_paragraphs,
)

from . import unittest
from tests import unittest

try:
import lxml
@@ -447,3 +448,34 @@ def test_unknown_invalid(self):
'text/html; charset="invalid"',
)
self.assertEqual(list(encodings), ["utf-8", "cp1252"])


class RebaseUrlTestCase(unittest.TestCase):
def test_relative(self):
"""Relative URLs should be resolved based on the context of the base URL."""
self.assertEqual(
rebase_url("subpage", "https://example.com/foo/"),
"https://example.com/foo/subpage",
)
self.assertEqual(
rebase_url("sibling", "https://example.com/foo"),
"https://example.com/sibling",
)
self.assertEqual(
rebase_url("/bar", "https://example.com/foo/"),
"https://example.com/bar",
)

def test_absolute(self):
"""Absolute URLs should not be modified."""
self.assertEqual(
rebase_url("https://alice.com/a/", "https://example.com/foo/"),
"https://alice.com/a/",
)

def test_data(self):
"""Data URLs should not be modified."""
self.assertEqual(
rebase_url("data:,Hello%2C%20World%21", "https://example.com/foo/"),
"data:,Hello%2C%20World%21",
)