
Merge pull request #7729 from cosmicexplorer/cache-parse-links
Cache parse_links() by --find-links html page url
xavfernandez authored Apr 8, 2020
2 parents bd912cf + 6e7b16c commit 327a315
Showing 4 changed files with 154 additions and 13 deletions.
3 changes: 3 additions & 0 deletions news/7729.feature
@@ -0,0 +1,3 @@
Cache the result of parse_links() to avoid re-tokenizing a find-links page multiple times over a pip run.

This change significantly improves resolve performance when --find-links points to a very large html page.
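A rough sketch of the behaviour this enables, using the HTMLPage and parse_links APIs from the collector.py changes below (the URL and page content here are invented for illustration; on Python 2 the cache falls back to a no-op):

    from pip._internal.index.collector import HTMLPage, parse_links

    html = b'<html><body><a href="/pkg-1.0.tar.gz"></a></body></html>'
    page = HTMLPage(html, encoding=None, url='https://example.com/find-links/')

    first = list(parse_links(page))   # tokenizes and parses the HTML
    again = list(parse_links(page))   # same URL, so the cached result is reused
    assert first == again

Index pages are excluded from this cache: the collector constructs their Links with cache_link_parsing=False, which propagates to the HTMLPage and bypasses the memoization.
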
96 changes: 86 additions & 10 deletions src/pip/_internal/index/collector.py
@@ -3,6 +3,7 @@
"""

import cgi
import functools
import itertools
import logging
import mimetypes
@@ -25,8 +26,8 @@

if MYPY_CHECK_RUNNING:
from typing import (
Callable, Iterable, List, MutableMapping, Optional, Sequence, Tuple,
Union,
Callable, Iterable, List, MutableMapping, Optional,
Protocol, Sequence, Tuple, TypeVar, Union,
)
import xml.etree.ElementTree

@@ -38,10 +39,31 @@
HTMLElement = xml.etree.ElementTree.Element
ResponseHeaders = MutableMapping[str, str]

# Used in the @lru_cache polyfill.
F = TypeVar('F')

class LruCache(Protocol):
def __call__(self, maxsize=None):
# type: (Optional[int]) -> Callable[[F], F]
raise NotImplementedError


logger = logging.getLogger(__name__)


# Fallback to noop_lru_cache in Python 2
# TODO: this can be removed when python 2 support is dropped!
def noop_lru_cache(maxsize=None):
# type: (Optional[int]) -> Callable[[F], F]
def _wrapper(f):
# type: (F) -> F
return f
return _wrapper


_lru_cache = getattr(functools, "lru_cache", noop_lru_cache) # type: LruCache


def _match_vcs_scheme(url):
# type: (str) -> Optional[str]
"""Look for VCS schemes in the URL.
@@ -285,6 +307,48 @@ def _create_link_from_element(
return link


class CacheablePageContent(object):
def __init__(self, page):
# type: (HTMLPage) -> None
assert page.cache_link_parsing
self.page = page

def __eq__(self, other):
# type: (object) -> bool
return (isinstance(other, type(self)) and
self.page.url == other.page.url)

def __hash__(self):
# type: () -> int
return hash(self.page.url)


def with_cached_html_pages(
fn, # type: Callable[[HTMLPage], Iterable[Link]]
):
# type: (...) -> Callable[[HTMLPage], List[Link]]
"""
Given a function that parses an Iterable[Link] from an HTMLPage, cache the
function's result (keyed by CacheablePageContent), unless the HTMLPage
`page` has `page.cache_link_parsing == False`.
"""

@_lru_cache(maxsize=None)
def wrapper(cacheable_page):
# type: (CacheablePageContent) -> List[Link]
return list(fn(cacheable_page.page))

@functools.wraps(fn)
def wrapper_wrapper(page):
# type: (HTMLPage) -> List[Link]
if page.cache_link_parsing:
return wrapper(CacheablePageContent(page))
return list(fn(page))

return wrapper_wrapper


@with_cached_html_pages
def parse_links(page):
# type: (HTMLPage) -> Iterable[Link]
"""
@@ -314,18 +378,23 @@ class HTMLPage(object):

def __init__(
self,
content, # type: bytes
encoding, # type: Optional[str]
url, # type: str
content, # type: bytes
encoding, # type: Optional[str]
url, # type: str
cache_link_parsing=True, # type: bool
):
# type: (...) -> None
"""
:param encoding: the encoding to decode the given content.
:param url: the URL from which the HTML was downloaded.
:param cache_link_parsing: whether links parsed from this page's url
should be cached. PyPI index urls should
have this set to False, for example.
"""
self.content = content
self.encoding = encoding
self.url = url
self.cache_link_parsing = cache_link_parsing

def __str__(self):
# type: () -> str
@@ -343,10 +412,14 @@ def _handle_get_page_fail(
meth("Could not fetch URL %s: %s - skipping", link, reason)


def _make_html_page(response):
# type: (Response) -> HTMLPage
def _make_html_page(response, cache_link_parsing=True):
# type: (Response, bool) -> HTMLPage
encoding = _get_encoding_from_headers(response.headers)
return HTMLPage(response.content, encoding=encoding, url=response.url)
return HTMLPage(
response.content,
encoding=encoding,
url=response.url,
cache_link_parsing=cache_link_parsing)


def _get_html_page(link, session=None):
@@ -399,7 +472,8 @@ def _get_html_page(link, session=None):
except requests.Timeout:
_handle_get_page_fail(link, "timed out")
else:
return _make_html_page(resp)
return _make_html_page(resp,
cache_link_parsing=link.cache_link_parsing)
return None


@@ -562,7 +636,9 @@ def collect_links(self, project_name):
# We want to filter out anything that does not have a secure origin.
url_locations = [
link for link in itertools.chain(
(Link(url) for url in index_url_loc),
# Mark PyPI indices as "cache_link_parsing == False" -- this
# will avoid caching the result of parsing the page for links.
(Link(url, cache_link_parsing=False) for url in index_url_loc),
(Link(url) for url in fl_url_loc),
)
if self.session.is_secure_origin(link)
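The decorator added above works by wrapping the HTMLPage in a small key object that compares and hashes by URL, so that functools.lru_cache can memoize per page URL. A simplified, standalone sketch of that pattern (the names here are illustrative, not pip's own):

    import functools

    class _UrlKey(object):
        """Hashable stand-in for a page, compared and hashed by its URL."""

        def __init__(self, page):
            self.page = page

        def __eq__(self, other):
            return isinstance(other, _UrlKey) and self.page.url == other.page.url

        def __hash__(self):
            return hash(self.page.url)

    def cached_by_url(fn):
        """Memoize fn(page) -> list of links, keyed by page.url."""

        @functools.lru_cache(maxsize=None)
        def _cached(key):
            return list(fn(key.page))

        @functools.wraps(fn)
        def wrapper(page):
            return _cached(_UrlKey(page))

        return wrapper

pip's real with_cached_html_pages additionally checks page.cache_link_parsing and falls back to an uncached call for index pages, and substitutes a no-op cache on Python 2 via the LruCache polyfill.
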
8 changes: 8 additions & 0 deletions src/pip/_internal/models/link.py
@@ -30,6 +30,7 @@ def __init__(
comes_from=None, # type: Optional[Union[str, HTMLPage]]
requires_python=None, # type: Optional[str]
yanked_reason=None, # type: Optional[Text]
cache_link_parsing=True, # type: bool
):
# type: (...) -> None
"""
@@ -46,6 +47,11 @@ def __init__(
a simple repository HTML link. If the file has been yanked but
no reason was provided, this should be the empty string. See
PEP 592 for more information and the specification.
:param cache_link_parsing: A flag that is used elsewhere to determine
whether resources retrieved from this link
should be cached. PyPI index urls should
generally have this set to False, for
example.
"""

# url can be a UNC windows share
@@ -63,6 +69,8 @@

super(Link, self).__init__(key=url, defining_class=Link)

self.cache_link_parsing = cache_link_parsing

def __str__(self):
# type: () -> str
if self.requires_python:
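For context, a minimal sketch of how the new flag on Link is used by the collector change above when building candidate locations (the URLs here are illustrative):

    from pip._internal.models.link import Link

    # Index URLs are marked so their parse results are not cached.
    index_link = Link('https://pypi.org/simple/twine/', cache_link_parsing=False)

    # find-links URLs keep the default and are parsed at most once per run.
    find_links_link = Link('https://example.com/find-links/')

    assert not index_link.cache_link_parsing
    assert find_links_link.cache_link_parsing
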
60 changes: 57 additions & 3 deletions tests/unit/test_collector.py
@@ -1,5 +1,7 @@
import logging
import os.path
import re
import uuid
from textwrap import dedent

import mock
@@ -26,7 +28,7 @@
from pip._internal.models.index import PyPI
from pip._internal.models.link import Link
from pip._internal.network.session import PipSession
from tests.lib import make_test_link_collector
from tests.lib import make_test_link_collector, skip_if_python2


@pytest.mark.parametrize(
@@ -355,14 +357,61 @@ def test_parse_links__yanked_reason(anchor_html, expected):
page = HTMLPage(
html_bytes,
encoding=None,
url='https://example.com/simple/',
# parse_links() is cached by url, so we inject a random uuid to ensure
# the page content isn't cached.
url='https://example.com/simple-{}/'.format(uuid.uuid4()),
)
links = list(parse_links(page))
link, = links
actual = link.yanked_reason
assert actual == expected


@skip_if_python2
def test_parse_links_caches_same_page_by_url():
html = (
'<html><head><meta charset="utf-8"><head>'
'<body><a href="/pkg1-1.0.tar.gz"></a></body></html>'
)
html_bytes = html.encode('utf-8')

url = 'https://example.com/simple/'

page_1 = HTMLPage(
html_bytes,
encoding=None,
url=url,
)
# Make a second page with zero content, to ensure that it's not accessed,
# because the page was cached by url.
page_2 = HTMLPage(
b'',
encoding=None,
url=url,
)
# Make a third page which represents an index url, which should not be
# cached, even for the same url. We modify the page content slightly to
# verify that the result is not cached.
page_3 = HTMLPage(
re.sub(b'pkg1', b'pkg2', html_bytes),
encoding=None,
url=url,
cache_link_parsing=False,
)

parsed_links_1 = list(parse_links(page_1))
assert len(parsed_links_1) == 1
assert 'pkg1' in parsed_links_1[0].url

parsed_links_2 = list(parse_links(page_2))
assert parsed_links_2 == parsed_links_1

parsed_links_3 = list(parse_links(page_3))
assert len(parsed_links_3) == 1
assert parsed_links_3 != parsed_links_1
assert 'pkg2' in parsed_links_3[0].url


def test_request_http_error(caplog):
caplog.set_level(logging.DEBUG)
link = Link('http://localhost')
@@ -528,13 +577,14 @@ def test_fetch_page(self, mock_get_html_response):
fake_response = make_fake_html_response(url)
mock_get_html_response.return_value = fake_response

location = Link(url)
location = Link(url, cache_link_parsing=False)
link_collector = make_test_link_collector()
actual = link_collector.fetch_page(location)

assert actual.content == fake_response.content
assert actual.encoding is None
assert actual.url == url
assert actual.cache_link_parsing == location.cache_link_parsing

# Also check that the right session object was passed to
# _get_html_response().
@@ -559,8 +609,12 @@ def test_collect_links(self, caplog, data):

assert len(actual.find_links) == 1
check_links_include(actual.find_links, names=['packages'])
# Check that find-links URLs are marked as cacheable.
assert actual.find_links[0].cache_link_parsing

assert actual.project_urls == [Link('https://pypi.org/simple/twine/')]
# Check that index URLs are marked as *un*cacheable.
assert not actual.project_urls[0].cache_link_parsing

expected_message = dedent("""\
1 location(s) to search for versions of twine:
