Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Commit

Permalink
Improve URL previews for some pages (#12951)
Browse files Browse the repository at this point in the history
* Skip `og` and `meta` tags where the value is empty.
* Fall back to the favicon if there are no other images.
* Ignore tags meant for navigation.
  • Loading branch information
clokep authored Jun 3, 2022
1 parent 888a29f commit 01df5ba
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 18 deletions.
1 change: 1 addition & 0 deletions changelog.d/12951.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve URL previews for pages with empty elements.
52 changes: 35 additions & 17 deletions synapse/rest/media/v1/preview_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)

# Certain elements aren't meant for display.
ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"}


def _normalise_encoding(encoding: str) -> Optional[str]:
"""Use the Python codec's name as the normalised entry."""
Expand Down Expand Up @@ -174,13 +177,15 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",

og: Dict[str, Optional[str]] = {}
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
if "content" in tag.attrib:
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}
og[tag.attrib["property"]] = tag.attrib["content"]
for tag in tree.xpath(
"//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
):
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}

og[tag.attrib["property"]] = tag.attrib["content"]

# TODO: grab article: meta tags too, e.g.:

Expand All @@ -192,21 +197,23 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
# "article:modified_time" content="2016-04-01T18:31:53+00:00" />

if "og:title" not in og:
# do some basic spidering of the HTML
title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
if title and title[0].text is not None:
og["og:title"] = title[0].text.strip()
# Attempt to find a title from the title tag or an h1/h2/h3 header. The union is
# returned in document order, so the first of these elements with text wins.
title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
if title:
og["og:title"] = title[0].strip()
else:
og["og:title"] = None

if "og:image" not in og:
# TODO: extract a favicon failing all else
meta_image = tree.xpath(
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
)
# If a meta image is found, use it.
if meta_image:
og["og:image"] = meta_image[0]
else:
# Try to find images which are larger than 10px by 10px.
#
# TODO: consider inlined CSS styles as well as width & height attribs
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
images = sorted(
Expand All @@ -215,17 +222,24 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
-1 * float(i.attrib["width"]) * float(i.attrib["height"])
),
)
# If no images were found, try to find *any* images.
if not images:
images = tree.xpath("//img[@src]")
images = tree.xpath("//img[@src][1]")
if images:
og["og:image"] = images[0].attrib["src"]

# Finally, fall back to the favicon if no other image was found.
else:
favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
if favicons:
og["og:image"] = favicons[0]

if "og:description" not in og:
# Check the first meta description tag for content.
meta_description = tree.xpath(
"//*/meta"
"[translate(@name, 'DESCRIPTION', 'description')='description']"
"/@content"
"//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
)
# If a meta description is found with content, use it.
if meta_description:
og["og:description"] = meta_description[0]
else:
Expand Down Expand Up @@ -306,6 +320,10 @@ def _iterate_over_text(
if isinstance(el, str):
yield el
elif el.tag not in tags_to_ignore:
# If the element isn't meant for display, ignore it.
if el.get("role") in ARIA_ROLES_TO_IGNORE:
continue

# el.text is the text before the first child, so we can immediately
# return it if the text exists.
if el.text:
Expand Down
37 changes: 36 additions & 1 deletion tests/rest/media/v1/test_html_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def test_small_then_large_summarize(self) -> None:
)


class CalcOgTestCase(unittest.TestCase):
class OpenGraphFromHtmlTestCase(unittest.TestCase):
if not lxml:
skip = "url preview feature requires lxml"

Expand Down Expand Up @@ -235,6 +235,21 @@ def test_missing_title(self) -> None:

self.assertEqual(og, {"og:title": None, "og:description": "Some text."})

# Another variant is a title with no content.
html = b"""
<html>
<head><title></title></head>
<body>
<h1>Title</h1>
</body>
</html>
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": "Title", "og:description": "Title"})

def test_h1_as_title(self) -> None:
html = b"""
<html>
Expand All @@ -250,6 +265,26 @@ def test_h1_as_title(self) -> None:

self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})

def test_empty_description(self) -> None:
"""Description tags with empty content should be ignored."""
html = b"""
<html>
<meta property="og:description" content=""/>
<meta property="og:description"/>
<meta name="description" content=""/>
<meta name="description"/>
<meta name="description" content="Finally!"/>
<body>
<h1>Title</h1>
</body>
</html>
"""

tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)

self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"})

def test_missing_title_and_broken_h1(self) -> None:
html = b"""
<html>
Expand Down

0 comments on commit 01df5ba

Please sign in to comment.