
Merge pull request #7729 from cosmicexplorer/cache-parse-links
Cache parse_links() by --find-links html page url
xavfernandez authored Apr 8, 2020
2 parents bd912cf + 6e7b16c commit 327a315
Showing 4 changed files with 154 additions and 13 deletions.
3 changes: 3 additions & 0 deletions news/7729.feature
@@ -0,0 +1,3 @@
Cache the result of parse_links() to avoid re-tokenizing a find-links page multiple times over a pip run.

This change significantly improves resolve performance when --find-links points to a very large html page.
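A rough sketch of the behaviour this enables, using the HTMLPage and parse_links APIs from the collector.py changes below (the URL and page content here are invented for illustration; on Python 2 the cache falls back to a no-op):

    from pip._internal.index.collector import HTMLPage, parse_links

    html = b'<html><body><a href="/pkg-1.0.tar.gz"></a></body></html>'
    page = HTMLPage(html, encoding=None, url='https://example.com/find-links/')

    first = list(parse_links(page))   # tokenizes and parses the HTML
    again = list(parse_links(page))   # same URL, so the cached result is reused
    assert first == again

Index pages are excluded from this cache: the collector constructs their Links with cache_link_parsing=False, which propagates to the HTMLPage and bypasses the memoization.
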
96 changes: 86 additions & 10 deletions src/pip/_internal/index/collector.py
@@ -3,6 +3,7 @@
"""

import cgi
import functools
import itertools
import logging
import mimetypes
@@ -25,8 +26,8 @@

if MYPY_CHECK_RUNNING:
from typing import (
Callable, Iterable, List, MutableMapping, Optional, Sequence, Tuple,
Union,
Callable, Iterable, List, MutableMapping, Optional,
Protocol, Sequence, Tuple, TypeVar, Union,
)
import xml.etree.ElementTree

@@ -38,10 +39,31 @@
HTMLElement = xml.etree.ElementTree.Element
ResponseHeaders = MutableMapping[str, str]

# Used in the @lru_cache polyfill.
F = TypeVar('F')

class LruCache(Protocol):
def __call__(self, maxsize=None):
# type: (Optional[int]) -> Callable[[F], F]
raise NotImplementedError


logger = logging.getLogger(__name__)


# Fallback to noop_lru_cache in Python 2
# TODO: this can be removed when python 2 support is dropped!
def noop_lru_cache(maxsize=None):
# type: (Optional[int]) -> Callable[[F], F]
def _wrapper(f):
# type: (F) -> F
return f
return _wrapper


_lru_cache = getattr(functools, "lru_cache", noop_lru_cache) # type: LruCache


def _match_vcs_scheme(url):
# type: (str) -> Optional[str]
"""Look for VCS schemes in the URL.
@@ -285,6 +307,48 @@ def _create_link_from_element(
return link


class CacheablePageContent(object):
def __init__(self, page):
# type: (HTMLPage) -> None
assert page.cache_link_parsing
self.page = page

def __eq__(self, other):
# type: (object) -> bool
return (isinstance(other, type(self)) and
self.page.url == other.page.url)

def __hash__(self):
# type: () -> int
return hash(self.page.url)


def with_cached_html_pages(
fn, # type: Callable[[HTMLPage], Iterable[Link]]
):
# type: (...) -> Callable[[HTMLPage], List[Link]]
"""
Given a function that parses an Iterable[Link] from an HTMLPage, cache the
function's result (keyed by CacheablePageContent), unless the HTMLPage
`page` has `page.cache_link_parsing == False`.
"""

@_lru_cache(maxsize=None)
def wrapper(cacheable_page):
# type: (CacheablePageContent) -> List[Link]
return list(fn(cacheable_page.page))

@functools.wraps(fn)
def wrapper_wrapper(page):
# type: (HTMLPage) -> List[Link]
if page.cache_link_parsing:
return wrapper(CacheablePageContent(page))
return list(fn(page))

return wrapper_wrapper


@with_cached_html_pages
def parse_links(page):
# type: (HTMLPage) -> Iterable[Link]
"""
@@ -314,18 +378,23 @@ class HTMLPage(object):

def __init__(
self,
content, # type: bytes
encoding, # type: Optional[str]
url, # type: str
content, # type: bytes
encoding, # type: Optional[str]
url, # type: str
cache_link_parsing=True, # type: bool
):
# type: (...) -> None
"""
:param encoding: the encoding to decode the given content.
:param url: the URL from which the HTML was downloaded.
:param cache_link_parsing: whether links parsed from this page's url
should be cached. PyPI index urls should
have this set to False, for example.
"""
self.content = content
self.encoding = encoding
self.url = url
self.cache_link_parsing = cache_link_parsing

def __str__(self):
# type: () -> str
@@ -343,10 +412,14 @@ def _handle_get_page_fail(
meth("Could not fetch URL %s: %s - skipping", link, reason)


def _make_html_page(response):
# type: (Response) -> HTMLPage
def _make_html_page(response, cache_link_parsing=True):
# type: (Response, bool) -> HTMLPage
encoding = _get_encoding_from_headers(response.headers)
return HTMLPage(response.content, encoding=encoding, url=response.url)
return HTMLPage(
response.content,
encoding=encoding,
url=response.url,
cache_link_parsing=cache_link_parsing)


def _get_html_page(link, session=None):
@@ -399,7 +472,8 @@ def _get_html_page(link, session=None):
except requests.Timeout:
_handle_get_page_fail(link, "timed out")
else:
return _make_html_page(resp)
return _make_html_page(resp,
cache_link_parsing=link.cache_link_parsing)
return None


@@ -562,7 +636,9 @@ def collect_links(self, project_name):
# We want to filter out anything that does not have a secure origin.
url_locations = [
link for link in itertools.chain(
(Link(url) for url in index_url_loc),
# Mark PyPI indices as "cache_link_parsing == False" -- this
# will avoid caching the result of parsing the page for links.
(Link(url, cache_link_parsing=False) for url in index_url_loc),
(Link(url) for url in fl_url_loc),
)
if self.session.is_secure_origin(link)
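The decorator added above works by wrapping the HTMLPage in a small key object that compares and hashes by URL, so that functools.lru_cache can memoize per page URL. A simplified, standalone sketch of that pattern (the names here are illustrative, not pip's own):

    import functools

    class _UrlKey(object):
        """Hashable stand-in for a page, compared and hashed by its URL."""

        def __init__(self, page):
            self.page = page

        def __eq__(self, other):
            return isinstance(other, _UrlKey) and self.page.url == other.page.url

        def __hash__(self):
            return hash(self.page.url)

    def cached_by_url(fn):
        """Memoize fn(page) -> list of links, keyed by page.url."""

        @functools.lru_cache(maxsize=None)
        def _cached(key):
            return list(fn(key.page))

        @functools.wraps(fn)
        def wrapper(page):
            return _cached(_UrlKey(page))

        return wrapper

pip's real with_cached_html_pages additionally checks page.cache_link_parsing and falls back to an uncached call for index pages, and substitutes a no-op cache on Python 2 via the LruCache polyfill.
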
8 changes: 8 additions & 0 deletions src/pip/_internal/models/link.py
@@ -30,6 +30,7 @@ def __init__(
comes_from=None, # type: Optional[Union[str, HTMLPage]]
requires_python=None, # type: Optional[str]
yanked_reason=None, # type: Optional[Text]
cache_link_parsing=True, # type: bool
):
# type: (...) -> None
"""
@@ -46,6 +47,11 @@ def __init__(
a simple repository HTML link. If the file has been yanked but
no reason was provided, this should be the empty string. See
PEP 592 for more information and the specification.
:param cache_link_parsing: A flag that is used elsewhere to determine
whether resources retrieved from this link
should be cached. PyPI index urls should
generally have this set to False, for
example.
"""

# url can be a UNC windows share
@@ -63,6 +69,8 @@

super(Link, self).__init__(key=url, defining_class=Link)

self.cache_link_parsing = cache_link_parsing

def __str__(self):
# type: () -> str
if self.requires_python:
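For context, a minimal sketch of how the new flag on Link is used by the collector change above when building candidate locations (the URLs here are illustrative):

    from pip._internal.models.link import Link

    # Index URLs are marked so their parse results are not cached.
    index_link = Link('https://pypi.org/simple/twine/', cache_link_parsing=False)

    # find-links URLs keep the default and are parsed at most once per run.
    find_links_link = Link('https://example.com/find-links/')

    assert not index_link.cache_link_parsing
    assert find_links_link.cache_link_parsing
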
60 changes: 57 additions & 3 deletions tests/unit/test_collector.py
@@ -1,5 +1,7 @@
import logging
import os.path
import re
import uuid
from textwrap import dedent

import mock
@@ -26,7 +28,7 @@
from pip._internal.models.index import PyPI
from pip._internal.models.link import Link
from pip._internal.network.session import PipSession
from tests.lib import make_test_link_collector
from tests.lib import make_test_link_collector, skip_if_python2


@pytest.mark.parametrize(
@@ -355,14 +357,61 @@ def test_parse_links__yanked_reason(anchor_html, expected):
page = HTMLPage(
html_bytes,
encoding=None,
url='https://example.com/simple/',
# parse_links() is cached by url, so we inject a random uuid to ensure
# the page content isn't cached.
url='https://example.com/simple-{}/'.format(uuid.uuid4()),
)
links = list(parse_links(page))
link, = links
actual = link.yanked_reason
assert actual == expected


@skip_if_python2
def test_parse_links_caches_same_page_by_url():
html = (
'<html><head><meta charset="utf-8"><head>'
'<body><a href="/pkg1-1.0.tar.gz"></a></body></html>'
)
html_bytes = html.encode('utf-8')

url = 'https://example.com/simple/'

page_1 = HTMLPage(
html_bytes,
encoding=None,
url=url,
)
# Make a second page with zero content, to ensure that it's not accessed,
# because the page was cached by url.
page_2 = HTMLPage(
b'',
encoding=None,
url=url,
)
# Make a third page which represents an index url, which should not be
# cached, even for the same url. We modify the page content slightly to
# verify that the result is not cached.
page_3 = HTMLPage(
re.sub(b'pkg1', b'pkg2', html_bytes),
encoding=None,
url=url,
cache_link_parsing=False,
)

parsed_links_1 = list(parse_links(page_1))
assert len(parsed_links_1) == 1
assert 'pkg1' in parsed_links_1[0].url

parsed_links_2 = list(parse_links(page_2))
assert parsed_links_2 == parsed_links_1

parsed_links_3 = list(parse_links(page_3))
assert len(parsed_links_3) == 1
assert parsed_links_3 != parsed_links_1
assert 'pkg2' in parsed_links_3[0].url


def test_request_http_error(caplog):
caplog.set_level(logging.DEBUG)
link = Link('http://localhost')
@@ -528,13 +577,14 @@ def test_fetch_page(self, mock_get_html_response):
fake_response = make_fake_html_response(url)
mock_get_html_response.return_value = fake_response

location = Link(url)
location = Link(url, cache_link_parsing=False)
link_collector = make_test_link_collector()
actual = link_collector.fetch_page(location)

assert actual.content == fake_response.content
assert actual.encoding is None
assert actual.url == url
assert actual.cache_link_parsing == location.cache_link_parsing

# Also check that the right session object was passed to
# _get_html_response().
@@ -559,8 +609,12 @@ def test_collect_links(self, caplog, data):

assert len(actual.find_links) == 1
check_links_include(actual.find_links, names=['packages'])
# Check that find-links URLs are marked as cacheable.
assert actual.find_links[0].cache_link_parsing

assert actual.project_urls == [Link('https://pypi.org/simple/twine/')]
# Check that index URLs are marked as *un*cacheable.
assert not actual.project_urls[0].cache_link_parsing

expected_message = dedent("""\
1 location(s) to search for versions of twine:
