From 8809012a836a8870cf057dbe7a9891ee92640d11 Mon Sep 17 00:00:00 2001
From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com>
Date: Thu, 13 Feb 2020 15:31:23 -0800
Subject: [PATCH] cache html page fetching by link

---
 src/pip/_internal/index/collector.py | 44 ++++++++++++++++++----------
 tests/unit/test_collector.py         | 33 ++++++++++++++++-----
 2 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py
index 98921bed33e..5a2e8d4fe1c 100644
--- a/src/pip/_internal/index/collector.py
+++ b/src/pip/_internal/index/collector.py
@@ -9,6 +9,19 @@
 import os
 from collections import OrderedDict
 
+from pip._vendor import html5lib, requests
+from pip._vendor.distlib.compat import unescape
+from pip._vendor.requests.exceptions import HTTPError, RetryError, SSLError
+from pip._vendor.six.moves.urllib import parse as urllib_parse
+from pip._vendor.six.moves.urllib import request as urllib_request
+
+from pip._internal.models.link import Link
+from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
+from pip._internal.utils.misc import redact_auth_from_url
+from pip._internal.utils.typing import MYPY_CHECK_RUNNING
+from pip._internal.utils.urls import path_to_url, url_to_path
+from pip._internal.vcs import is_url, vcs
+
 try:
     from functools import lru_cache
 except ImportError:
@@ -26,18 +39,6 @@ def wrapped(arg):
             return wrapped
         return wrapper
 
-from pip._vendor import html5lib, requests
-from pip._vendor.distlib.compat import unescape
-from pip._vendor.requests.exceptions import HTTPError, RetryError, SSLError
-from pip._vendor.six.moves.urllib import parse as urllib_parse
-from pip._vendor.six.moves.urllib import request as urllib_request
-
-from pip._internal.models.link import Link
-from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
-from pip._internal.utils.misc import redact_auth_from_url
-from pip._internal.utils.typing import MYPY_CHECK_RUNNING
-from pip._internal.utils.urls import path_to_url, url_to_path
-from pip._internal.vcs import is_url, vcs
 
 if MYPY_CHECK_RUNNING:
     from typing import (
@@ -261,15 +262,17 @@ def _create_link_from_element(
     return link
 
 
-class CacheablePage(object):
+class CacheablePageContent(object):
     def __init__(self, page):
         self.page = page
 
     def __eq__(self, other):
-        return isinstance(other, type(self)) and self.page.url == other.page.url
+        return (isinstance(other, type(self)) and
+                self.page.content == other.page.content and
+                self.page.encoding == other.page.encoding)
 
     def __hash__(self):
-        return hash(self.page.url)
+        return hash((self.page.content, self.page.encoding))
 
 
 def with_cached_html_pages(fn):
@@ -278,7 +281,7 @@ def wrapper(cacheable_page):
         return list(fn(cacheable_page.page))
 
     def wrapper_wrapper(page):
-        return wrapper(CacheablePage(page))
+        return wrapper(CacheablePageContent(page))
 
     return wrapper_wrapper
 
@@ -348,6 +351,15 @@ def _make_html_page(response):
     return HTMLPage(response.content, encoding=encoding, url=response.url)
 
 
+def with_cached_link_fetch(fn):
+    @lru_cache(maxsize=None)
+    def wrapper(link, session=None):
+        return fn(link, session=session)
+
+    return wrapper
+
+
+@with_cached_link_fetch
 def _get_html_page(link, session=None):
     # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
     if session is None:
diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py
index 2dfb0026ad6..db16b17892b 100644
--- a/tests/unit/test_collector.py
+++ b/tests/unit/test_collector.py
@@ -1,6 +1,5 @@
 import logging
 import os.path
-import uuid
 from textwrap import dedent
 
 import mock
@@ -11,6 +10,7 @@
 from pip._vendor.six.moves.urllib import request as urllib_request
 
 from pip._internal.index.collector import (
+    CacheablePageContent,
     HTMLPage,
     _clean_link,
     _determine_base_url,
@@ -270,7 +270,7 @@ def test_parse_links__yanked_reason(anchor_html, expected):
     page = HTMLPage(
         html_bytes,
         encoding=None,
-        url='https://example.com/find-links-{}'.format(uuid.uuid4()),
+        url='https://example.com/simple/',
     )
     links = list(parse_links(page))
     link, = links
@@ -287,19 +287,19 @@ def test_parse_links_caches_same_page():
     )
     html_bytes = html.encode('utf-8')
 
-    # The caching is only keyed on having the same `url`.
     page_1 = HTMLPage(
         html_bytes,
         encoding=None,
-        url='https://example.com/some-find-links-url/',
+        url='https://example.com/simple/',
     )
     page_2 = HTMLPage(
         html_bytes,
         encoding=None,
-        url='https://example.com/some-find-links-url/',
+        url='https://example.com/simple/',
     )
 
-    with mock.patch("pip._internal.index.collector.html5lib.parse") as mock_parse:
+    mock_parse = mock.patch("pip._internal.index.collector.html5lib.parse")
+    with mock_parse as mock_parse:
         mock_parse.return_value = html5lib.parse(
             page_1.content,
             transport_encoding=page_1.encoding,
@@ -308,7 +308,7 @@ def test_parse_links_caches_same_page():
         parsed_links_1 = list(parse_links(page_1))
         mock_parse.assert_called()
 
-    with mock.patch("pip._internal.index.collector.html5lib.parse") as mock_parse:
+    with mock_parse as mock_parse:
         parsed_links_2 = list(parse_links(page_2))
         assert parsed_links_2 == parsed_links_1
         mock_parse.assert_not_called()
@@ -378,6 +378,25 @@ def test_get_html_page_invalid_scheme(caplog, url, vcs_scheme):
     ]
 
 
+def test_get_html_page_caches_same_link():
+    link = Link('https://example.com/link-1/')
+    session = mock.Mock(PipSession)
+
+    fake_response = make_fake_html_response(link.url)
+    mock_func = mock.patch("pip._internal.index.collector._get_html_response")
+    with mock_func as mock_func:
+        mock_func.return_value = fake_response
+        page_1 = _get_html_page(link, session=session)
+        mock_func.assert_called_once()
+
+    with mock_func as mock_func:
+        page_2 = _get_html_page(link, session=session)
+        # Assert that the result of the cached html page fetch will also then
+        # be cached by parse_links() and @with_cached_html_pages.
+        assert CacheablePageContent(page_1) == CacheablePageContent(page_2)
+        mock_func.assert_not_called()
+
+
 def make_fake_html_response(url):
     """
     Create a fake requests.Response object.
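
The core technique in this patch is memoization via functools.lru_cache, which keys on argument equality and hash. By default a page object hashes by identity, so two pages holding identical bytes would miss each other's cache entry; the patch therefore wraps the page in CacheablePageContent, which compares and hashes by (content, encoding). A minimal self-contained sketch of that pattern, outside the patch (SimplePage, CacheableKey, with_cached_pages, and parse_anchors are hypothetical stand-ins, not pip APIs):

# Sketch of the content-keyed caching pattern used by parse_links() above.
from functools import lru_cache


class SimplePage(object):
    """Hypothetical stand-in for HTMLPage: raw bytes plus an encoding."""

    def __init__(self, content, encoding=None):
        self.content = content
        self.encoding = encoding


class CacheableKey(object):
    """Hashable adapter playing the role of CacheablePageContent."""

    def __init__(self, page):
        self.page = page

    def __eq__(self, other):
        return (isinstance(other, type(self)) and
                self.page.content == other.page.content and
                self.page.encoding == other.page.encoding)

    def __hash__(self):
        return hash((self.page.content, self.page.encoding))


def with_cached_pages(fn):
    @lru_cache(maxsize=None)
    def wrapper(key):
        # Materialize a list: a generator would be exhausted after the
        # first read, so cache hits must return a reusable result.
        return list(fn(key.page))

    def wrapper_wrapper(page):
        return wrapper(CacheableKey(page))

    return wrapper_wrapper


@with_cached_pages
def parse_anchors(page):
    # Hypothetical parser: one "link" per line of the page body.
    for line in page.content.decode(page.encoding or 'utf-8').splitlines():
        yield line.strip()


page_1 = SimplePage(b'a.whl\nb.whl\n')
page_2 = SimplePage(b'a.whl\nb.whl\n')  # equal bytes, distinct object
assert parse_anchors(page_1) == parse_anchors(page_2)  # second call hits the cache

Keying on (content, encoding) rather than url is the point of the rename from CacheablePage to CacheablePageContent in the diff: two pages fetched from different URLs but carrying identical bytes now share a single parsed result.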
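
The new fetch-level cache (with_cached_link_fetch) uses the same decorator-plus-lru_cache technique but keys directly on the decorated function's (link, session) arguments, so both must be hashable; pip's Link compares by its URL. A hypothetical reduction, again with stand-in names (with_cached_fetch and fetch_page are not pip APIs):

from functools import lru_cache


def with_cached_fetch(fn):
    @lru_cache(maxsize=None)
    def wrapper(link, session=None):
        # lru_cache keys on both arguments: the same link fetched with a
        # different session object gets its own cache entry.
        return fn(link, session=session)

    return wrapper


@with_cached_fetch
def fetch_page(link, session=None):
    # Stand-in for the network round trip done by _get_html_page().
    return 'contents of {}'.format(link)


assert fetch_page('https://example.com/simple/') is fetch_page('https://example.com/simple/')

This is what test_get_html_page_caches_same_link exercises: _get_html_response is observed exactly once, and the second _get_html_page call returns a page whose CacheablePageContent wrapper compares equal to the first.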