Commit
cache html page fetching by link
cosmicexplorer committed Feb 13, 2020
1 parent c399ba2 commit 8809012
Showing 2 changed files with 54 additions and 23 deletions.
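The change builds on the parse_links() caching already present in collector.py: _get_html_page() is now memoized with lru_cache, keyed on the Link being fetched, so a link that appears on several index pages is downloaded at most once per process. A rough, self-contained sketch of the pattern (FakeLink and fetch are illustrative stand-ins, not pip's real classes) shows why this works: lru_cache only requires its argument to be hashable and comparable.

    from functools import lru_cache

    class FakeLink(object):
        """Illustrative stand-in for pip's Link: compared and hashed by URL."""

        def __init__(self, url):
            self.url = url

        def __eq__(self, other):
            return isinstance(other, FakeLink) and self.url == other.url

        def __hash__(self):
            return hash(self.url)

    @lru_cache(maxsize=None)
    def fetch(link):
        print('fetching %s' % link.url)  # runs once per distinct link
        return b'<html></html>'

    fetch(FakeLink('https://example.com/simple/'))  # miss: prints and "fetches"
    fetch(FakeLink('https://example.com/simple/'))  # hit: answered from the cache

pip's Link is likewise hashable, and the new test at the bottom of this commit checks exactly this behavior for _get_html_page().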
44 changes: 28 additions & 16 deletions src/pip/_internal/index/collector.py
@@ -9,6 +9,19 @@
import os
from collections import OrderedDict

+from pip._vendor import html5lib, requests
+from pip._vendor.distlib.compat import unescape
+from pip._vendor.requests.exceptions import HTTPError, RetryError, SSLError
+from pip._vendor.six.moves.urllib import parse as urllib_parse
+from pip._vendor.six.moves.urllib import request as urllib_request
+
+from pip._internal.models.link import Link
+from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
+from pip._internal.utils.misc import redact_auth_from_url
+from pip._internal.utils.typing import MYPY_CHECK_RUNNING
+from pip._internal.utils.urls import path_to_url, url_to_path
+from pip._internal.vcs import is_url, vcs
+
try:
    from functools import lru_cache
except ImportError:
@@ -26,18 +26,6 @@ def wrapped(arg):
        return wrapped
    return wrapper

-from pip._vendor import html5lib, requests
-from pip._vendor.distlib.compat import unescape
-from pip._vendor.requests.exceptions import HTTPError, RetryError, SSLError
-from pip._vendor.six.moves.urllib import parse as urllib_parse
-from pip._vendor.six.moves.urllib import request as urllib_request
-
-from pip._internal.models.link import Link
-from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
-from pip._internal.utils.misc import redact_auth_from_url
-from pip._internal.utils.typing import MYPY_CHECK_RUNNING
-from pip._internal.utils.urls import path_to_url, url_to_path
-from pip._internal.vcs import is_url, vcs

if MYPY_CHECK_RUNNING:
    from typing import (
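The two hunks above only move code: the vendored and pip-internal imports are hoisted above the try/except block that supplies an lru_cache fallback on Python 2, where functools.lru_cache does not exist. The body of that fallback is collapsed here; the visible fragment (def wrapped(arg): ... return wrapped ... return wrapper) suggests a decorator factory that memoizes a single positional argument, along these hypothetical, reconstructed lines:

    def lru_cache(maxsize=None):
        # Hypothetical reconstruction; only the last lines of the real shim
        # are visible in the hunk above.
        def wrapper(fn):
            cache = {}

            def wrapped(arg):
                if arg not in cache:
                    cache[arg] = fn(arg)
                return cache[arg]

            return wrapped
        return wrapper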
@@ -261,15 +262,17 @@ def _create_link_from_element(
    return link


-class CacheablePage(object):
+class CacheablePageContent(object):
    def __init__(self, page):
        self.page = page

    def __eq__(self, other):
-        return isinstance(other, type(self)) and self.page.url == other.page.url
+        return (isinstance(other, type(self)) and
+                self.page.content == other.page.content and
+                self.page.encoding == other.page.encoding)

    def __hash__(self):
-        return hash(self.page.url)
+        return hash((self.page.content, self.page.encoding))


def with_cached_html_pages(fn):
@@ -278,7 +281,7 @@ def wrapper(cacheable_page):
        return list(fn(cacheable_page.page))

    def wrapper_wrapper(page):
-        return wrapper(CacheablePage(page))
+        return wrapper(CacheablePageContent(page))

    return wrapper_wrapper
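The rename from CacheablePage to CacheablePageContent tracks a real change in meaning: equality and hashing are now derived from the page's content and encoding rather than its url, so two HTMLPage objects holding identical bytes share one parse_links() cache entry even when they came from different addresses. A short sketch of the new semantics, assuming the HTMLPage constructor shown further down in this diff:

    html_bytes = b'<html><body></body></html>'
    page_a = HTMLPage(html_bytes, encoding='utf-8', url='https://example.com/a/')
    page_b = HTMLPage(html_bytes, encoding='utf-8', url='https://example.com/b/')

    # Same content and encoding: equal keys and one cache entry, despite the URLs.
    assert CacheablePageContent(page_a) == CacheablePageContent(page_b)
    assert hash(CacheablePageContent(page_a)) == hash(CacheablePageContent(page_b))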

@@ -348,6 +351,15 @@ def _make_html_page(response):
    return HTMLPage(response.content, encoding=encoding, url=response.url)


+def with_cached_link_fetch(fn):
+    @lru_cache(maxsize=None)
+    def wrapper(link, session=None):
+        return fn(link, session=session)
+
+    return wrapper
+
+
+@with_cached_link_fetch
def _get_html_page(link, session=None):
    # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
    if session is None:
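with_cached_link_fetch() applies the same memoization pattern to _get_html_page() itself. One subtlety: lru_cache builds its key from every argument, keyword arguments included, so a cache hit requires both an equal link and the same session value. A toy demonstration (strings stand in for the real Link and PipSession objects, and Python 3's functools is assumed):

    from functools import lru_cache

    @lru_cache(maxsize=None)
    def get_page(link, session=None):
        return 'page for %s' % link

    get_page('https://example.com/simple/', session='session-1')  # miss
    get_page('https://example.com/simple/', session='session-1')  # hit
    get_page('https://example.com/simple/', session='session-2')  # miss: new key
    print(get_page.cache_info())  # CacheInfo(hits=1, misses=2, maxsize=None, currsize=2)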
33 changes: 26 additions & 7 deletions tests/unit/test_collector.py
@@ -1,6 +1,5 @@
import logging
import os.path
-import uuid
from textwrap import dedent

import mock
@@ -11,6 +10,7 @@
from pip._vendor.six.moves.urllib import request as urllib_request

from pip._internal.index.collector import (
+    CacheablePageContent,
    HTMLPage,
    _clean_link,
    _determine_base_url,
@@ -270,7 +270,7 @@ def test_parse_links__yanked_reason(anchor_html, expected):
    page = HTMLPage(
        html_bytes,
        encoding=None,
-        url='https://example.com/find-links-{}'.format(uuid.uuid4()),
+        url='https://example.com/simple/',
    )
    links = list(parse_links(page))
    link, = links
@@ -287,19 +287,19 @@ def test_parse_links_caches_same_page():
    )
    html_bytes = html.encode('utf-8')

-    # The caching is only keyed on having the same `url`.
    page_1 = HTMLPage(
        html_bytes,
        encoding=None,
-        url='https://example.com/some-find-links-url/',
+        url='https://example.com/simple/',
    )
    page_2 = HTMLPage(
        html_bytes,
        encoding=None,
-        url='https://example.com/some-find-links-url/',
+        url='https://example.com/simple/',
    )

-    with mock.patch("pip._internal.index.collector.html5lib.parse") as mock_parse:
+    mock_parse = mock.patch("pip._internal.index.collector.html5lib.parse")
+    with mock_parse as mock_parse:
        mock_parse.return_value = html5lib.parse(
            page_1.content,
            transport_encoding=page_1.encoding,
@@ -308,7 +308,7 @@ def test_parse_links_caches_same_page():
        parsed_links_1 = list(parse_links(page_1))
        mock_parse.assert_called()

-    with mock.patch("pip._internal.index.collector.html5lib.parse") as mock_parse:
+    with mock_parse as mock_parse:
        parsed_links_2 = list(parse_links(page_2))
        assert parsed_links_2 == parsed_links_1
        mock_parse.assert_not_called()
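The rework above creates a single mock.patch patcher and enters it for both phases: the first parse_links() call runs under the mocked html5lib.parse and populates the cache, and the second call, on a distinct page object with identical content, must be answered from that cache without invoking the parser again. Stripped of the mocking, the behavior under test reduces to this sketch, reusing the test's own setup:

    page_1 = HTMLPage(html_bytes, encoding=None, url='https://example.com/simple/')
    page_2 = HTMLPage(html_bytes, encoding=None, url='https://example.com/simple/')

    links_1 = list(parse_links(page_1))
    links_2 = list(parse_links(page_2))  # identical content: served from the cache
    assert links_1 == links_2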
@@ -378,6 +378,25 @@ def test_get_html_page_invalid_scheme(caplog, url, vcs_scheme):
    ]


+def test_get_html_page_caches_same_link():
+    link = Link('https://example.com/link-1/')
+    session = mock.Mock(PipSession)
+
+    fake_response = make_fake_html_response(link.url)
+    mock_func = mock.patch("pip._internal.index.collector._get_html_response")
+    with mock_func as mock_func:
+        mock_func.return_value = fake_response
+        page_1 = _get_html_page(link, session=session)
+        mock_func.assert_called_once()
+
+    with mock_func as mock_func:
+        page_2 = _get_html_page(link, session=session)
+        # Assert that the result of the cached html page fetch will also then
+        # be cached by parse_links() and @with_cached_html_pages.
+        assert CacheablePageContent(page_1) == CacheablePageContent(page_2)
+        mock_func.assert_not_called()
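This new test drives _get_html_page() twice with the same Link while the underlying _get_html_response() is mocked out, asserting that the network layer is hit exactly once; wrapping both results in CacheablePageContent then ties the two cache layers together, since equal content is what parse_links() keys on. Note that mock.Mock(PipSession) passes PipSession as the mock's spec (its first positional parameter), which restricts the fake session to attributes that really exist on PipSession. A small illustration of that spec behavior, using a hypothetical Dummy class:

    import mock

    class Dummy(object):
        def get(self, url):
            raise NotImplementedError

    session = mock.Mock(Dummy)           # first positional argument is the spec
    session.get('https://example.com/')  # fine: get() exists on the spec
    try:
        session.post('https://example.com/')
    except AttributeError:
        print('post() is not part of the spec')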


def make_fake_html_response(url):
    """
    Create a fake requests.Response object.
