Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Merge pull request #6331 from matrix-org/rav/url_preview_limit_title
Browse files Browse the repository at this point in the history
* commit '02f99906f':
  Apply suggestions from code review
  Strip overlong OpenGraph data from url preview
  • Loading branch information
anoadragon453 committed Mar 16, 2020
2 parents 4d02402 + 02f9990 commit 6e290f3
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 1 deletion.
1 change: 1 addition & 0 deletions changelog.d/6331.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Limit the length of data returned by url previews, to prevent DoS attacks.
20 changes: 19 additions & 1 deletion synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@
_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)

# Upper bounds on the OpenGraph data included in a URL preview. When the
# preview is calculated, any tag whose name is longer than OG_TAG_NAME_MAXLEN
# or whose value is longer than OG_TAG_VALUE_MAXLEN (both measured with len())
# is pruned from the result; the name limit is also used to truncate the tag
# name in the accompanying log message.
OG_TAG_NAME_MAXLEN = 50
OG_TAG_VALUE_MAXLEN = 1000


class PreviewUrlResource(DirectServeResource):
isLeaf = True
Expand Down Expand Up @@ -171,7 +174,7 @@ def _do_preview(self, url, user, ts):
ts (int):
Returns:
Deferred[str]: json-encoded og data
Deferred[bytes]: json-encoded og data
"""
# check the URL cache in the DB (which will also provide us with
# historical previews, if we have any)
Expand Down Expand Up @@ -272,6 +275,17 @@ def _do_preview(self, url, user, ts):
logger.warning("Failed to find any OG data in %s", url)
og = {}

# Drop any OpenGraph entries whose tag name or value is unreasonably long, so
# that a hostile page cannot bloat the preview data we build and return.
#
# The value is passed through str() before being measured: len() raises
# TypeError for values that have no length (e.g. an int or None), which would
# abort the whole preview instead of just skipping the length check.
# NOTE(review): whether `og` can really contain non-string values depends on
# the code that builds it, which is not visible here -- confirm.
overlong_keys = [
    k
    for k, v in og.items()
    if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN
]
for k in overlong_keys:
    # Truncate the name in the log line: it may itself be the overlong part.
    logger.warning(
        "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
    )
    del og[k]

logger.debug("Calculated OG for %s as %s", url, og)

jsonog = json.dumps(og)
Expand Down Expand Up @@ -506,6 +520,10 @@ def _calc_og(tree, media_uri):
og = {}
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
if "content" in tag.attrib:
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}
og[tag.attrib["property"]] = tag.attrib["content"]

# TODO: grab article: meta tags too, e.g.:
Expand Down
35 changes: 35 additions & 0 deletions tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,41 @@ def test_non_ascii_preview_content_type(self):
self.assertEqual(channel.code, 200)
self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")

def test_overlong_title(self):
    """
    An overlong page title is stripped from the preview; other tags survive.
    """
    self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]

    # A page with a 2000-character <title> next to a short og:description.
    overlong_title = b"x" * 2000
    page = (
        b"<html><head>"
        b"<title>" + overlong_title + b"</title>"
        b'<meta property="og:description" content="hi" />'
        b"</head></html>"
    )
    response_head = (
        b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
        b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
    ) % (len(page),)

    request, channel = self.make_request(
        "GET", "url_preview?url=http://matrix.org", shorthand=False
    )
    request.render(self.preview_url)
    self.pump()

    # Connect the pending outbound HTTP client to a fake server, then hand
    # the client the canned response in a single chunk.
    http_client = self.reactor.tcpClients[0][2].buildProtocol(None)
    fake_server = AccumulatingProtocol()
    fake_server.makeConnection(FakeTransport(http_client, self.reactor))
    http_client.makeConnection(FakeTransport(fake_server, self.reactor))
    http_client.dataReceived(response_head + page)

    self.pump()
    self.assertEqual(channel.code, 200)

    # Only `og:description` should be present: the title is too long and
    # should have been stripped out.
    self.assertCountEqual(["og:description"], channel.json_body.keys())

def test_ipaddr(self):
"""
IP addresses can be previewed directly.
Expand Down

0 comments on commit 6e290f3

Please sign in to comment.