Remove unnecessary html.unescape() calls in index/collector.py #10378

Merged: 1 commit, Sep 5, 2021
news/10378.bugfix.rst (1 addition, 0 deletions)

@@ -0,0 +1 @@
+Fix double unescape of HTML ``data-requires-python`` and ``data-yanked`` attributes.
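For context, the double unescape can be reproduced with the standard library alone. A minimal sketch, using `html.parser` purely for illustration (pip itself parses index pages with its vendored html5lib): any HTML parser already decodes entity references in attribute values, so a second `html.unescape()` corrupts values that legitimately contain entities.

```python
import html
from html.parser import HTMLParser


class AnchorParser(HTMLParser):
    """Collect the data-requires-python attribute of the first <a> tag."""

    def __init__(self):
        super().__init__()
        self.requires_python = None

    def handle_starttag(self, tag, attrs):
        # HTMLParser has already decoded entity references in attrs here.
        if tag == "a" and self.requires_python is None:
            self.requires_python = dict(attrs).get("data-requires-python")


parser = AnchorParser()
# The index author escaped the ampersand, intending a literal "&gt;=3.6".
parser.feed('<a href="/pkg-1.0.tar.gz" data-requires-python="&amp;gt;=3.6"></a>')

print(parser.requires_python)                 # '&gt;=3.6' -- decoded once, correct
print(html.unescape(parser.requires_python))  # '>=3.6'    -- decoded twice, the bug
```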
src/pip/_internal/index/collector.py (0 additions, 5 deletions)

@@ -5,7 +5,6 @@
 import cgi
 import collections
 import functools
-import html
 import itertools
 import logging
 import os
@@ -248,11 +247,7 @@ def _create_link_from_element(
 
     url = _clean_link(urllib.parse.urljoin(base_url, href))
     pyrequire = anchor.get("data-requires-python")
-    pyrequire = html.unescape(pyrequire) if pyrequire else None
-
     yanked_reason = anchor.get("data-yanked")
-    if yanked_reason:
-        yanked_reason = html.unescape(yanked_reason)
 
     link = Link(
         url,
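The removal is safe because html5lib, which the collector used to parse index pages, already unescapes attribute values once during parsing. A small illustration, assuming the html5lib package is installed:

```python
import html5lib

# html5lib.parse() returns an ElementTree document by default.
document = html5lib.parse(
    '<a href="/pkg-1.0.tar.gz" data-yanked="version &amp;lt; 1"></a>',
    namespaceHTMLElements=False,
)
anchor = document.find(".//a")

# The parser has already turned "&amp;lt;" into "&lt;"; a further
# html.unescape() would wrongly produce "version < 1".
print(anchor.get("data-yanked"))  # 'version &lt; 1'
```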
tests/unit/test_collector.py (45 additions, 18 deletions)

@@ -411,6 +411,45 @@ def test_clean_link(url, clean_url):
     assert _clean_link(url) == clean_url
 
 
+def _test_parse_links_data_attribute(anchor_html, attr, expected):
+    html = f'<html><head><meta charset="utf-8"><head><body>{anchor_html}</body></html>'
+    html_bytes = html.encode("utf-8")
+    page = HTMLPage(
+        html_bytes,
+        encoding=None,
+        # parse_links() is cached by url, so we inject a random uuid to ensure
+        # the page content isn't cached.
+        url=f"https://example.com/simple-{uuid.uuid4()}/",
+    )
+    links = list(parse_links(page))
+    (link,) = links
+    actual = getattr(link, attr)
+    assert actual == expected
+
+
+@pytest.mark.parametrize(
+    "anchor_html, expected",
+    [
+        # Test not present.
+        ('<a href="/pkg-1.0.tar.gz"></a>', None),
+        # Test present with no value.
+        ('<a href="/pkg-1.0.tar.gz" data-requires-python></a>', None),
+        # Test a value with an escaped character.
+        (
+            '<a href="/pkg-1.0.tar.gz" data-requires-python="&gt;=3.6"></a>',
+            ">=3.6",
+        ),
+        # Test requires python is unescaped once.
+        (
+            '<a href="/pkg-1.0.tar.gz" data-requires-python="&amp;gt;=3.6"></a>',
+            "&gt;=3.6",
+        ),
+    ],
+)
+def test_parse_links__requires_python(anchor_html, expected):
+    _test_parse_links_data_attribute(anchor_html, "requires_python", expected)
+
+
 @pytest.mark.parametrize(
     "anchor_html, expected",
     [
@@ -429,27 +468,15 @@ def test_clean_link(url, clean_url):
             '<a href="/pkg-1.0.tar.gz" data-yanked="curlyquote \u2018"></a>',
             "curlyquote \u2018",
         ),
+        # Test yanked reason is unescaped once.
+        (
+            '<a href="/pkg-1.0.tar.gz" data-yanked="version &amp;lt; 1"></a>',
+            "version &lt; 1",
+        ),
     ],
 )
 def test_parse_links__yanked_reason(anchor_html, expected):
-    html = (
-        # Mark this as a unicode string for Python 2 since anchor_html
-        # can contain non-ascii.
-        '<html><head><meta charset="utf-8"><head>'
-        "<body>{}</body></html>"
-    ).format(anchor_html)
-    html_bytes = html.encode("utf-8")
-    page = HTMLPage(
-        html_bytes,
-        encoding=None,
-        # parse_links() is cached by url, so we inject a random uuid to ensure
-        # the page content isn't cached.
-        url=f"https://example.com/simple-{uuid.uuid4()}/",
-    )
-    links = list(parse_links(page))
-    (link,) = links
-    actual = link.yanked_reason
-    assert actual == expected
+    _test_parse_links_data_attribute(anchor_html, "yanked_reason", expected)
 
 
 def test_parse_links_caches_same_page_by_url():