Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Strip overlong OpenGraph data from url preview #6331

Merged
merged 3 commits into from
Nov 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/6331.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Limit the length of data returned by url previews, to prevent DoS attacks.
20 changes: 19 additions & 1 deletion synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@
_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)

OG_TAG_NAME_MAXLEN = 50
OG_TAG_VALUE_MAXLEN = 1000


class PreviewUrlResource(DirectServeResource):
isLeaf = True
Expand Down Expand Up @@ -171,7 +174,7 @@ def _do_preview(self, url, user, ts):
ts (int):

Returns:
Deferred[str]: json-encoded og data
Deferred[bytes]: json-encoded og data
"""
# check the URL cache in the DB (which will also provide us with
# historical previews, if we have any)
Expand Down Expand Up @@ -272,6 +275,17 @@ def _do_preview(self, url, user, ts):
logger.warning("Failed to find any OG data in %s", url)
og = {}

# filter out any stupidly long values
keys_to_remove = []
for k, v in og.items():
if len(k) > OG_TAG_NAME_MAXLEN or len(v) > OG_TAG_VALUE_MAXLEN:
logger.warning(
"Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
)
keys_to_remove.append(k)
richvdh marked this conversation as resolved.
Show resolved Hide resolved
for k in keys_to_remove:
del og[k]

logger.debug("Calculated OG for %s as %s", url, og)

jsonog = json.dumps(og)
Expand Down Expand Up @@ -506,6 +520,10 @@ def _calc_og(tree, media_uri):
og = {}
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
if "content" in tag.attrib:
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}
og[tag.attrib["property"]] = tag.attrib["content"]

# TODO: grab article: meta tags too, e.g.:
Expand Down
35 changes: 35 additions & 0 deletions tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,41 @@ def test_non_ascii_preview_content_type(self):
self.assertEqual(channel.code, 200)
self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")

def test_overlong_title(self):
self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]

end_content = (
b"<html><head>"
b"<title>" + b"x" * 2000 + b"</title>"
b'<meta property="og:description" content="hi" />'
b"</head></html>"
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(You can use multiline b"""....""" byte quotes fwiw)


request, channel = self.make_request(
"GET", "url_preview?url=http://matrix.org", shorthand=False
)
request.render(self.preview_url)
self.pump()

client = self.reactor.tcpClients[0][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
)
% (len(end_content),)
+ end_content
)

self.pump()
self.assertEqual(channel.code, 200)
res = channel.json_body
# We should only see the `og:description` field, as `title` is too long and should be stripped out
self.assertCountEqual(["og:description"], res.keys())
richvdh marked this conversation as resolved.
Show resolved Hide resolved

def test_ipaddr(self):
"""
IP addresses can be previewed directly.
Expand Down