Commit
This reworks http handling in pex to be more performant and to allow for alternate implementations and connection disciplines. It also fixes the general flakiness around untranslatable packages.

The pex.http submodule is gone and each of its packages is moved into pex directly: pex.crawler, pex.link, pex.http.

Crawler is out of the business of caching -- that is now handed off to the http layer. Link is out of the business of fetching -- it is now only a wrapper around a URL. Web/CachedWeb is killed in favor of a new class, pex.http.Context. Subclasses need only implement 'open(link)' and return a file-like object. There are three concrete implementations:

- UrllibContext (python standard library http context)
- RequestsContext (requests-based http context)
- CachingRequestsContext (a requests-based http context with CacheControl, if available)

The requests-based contexts also support https cert validation and hash fragment verification (via StreamFilelike), bringing them up to security parity with pip.

The rest of the API is modified as minimally as possible to accommodate the above. Users consuming the 'pex' binary and those who just use 'resolve' with default implementations will be unaffected.

Changes that will break pants:

- Obtainer now takes a context instead of a crawler. (Don't dwell on this too much -- Obtainer will be deleted altogether in the next review.)
- Translators no longer take conn_timeout since they no longer do any fetching -- this responsibility is delegated to the Context implementations.

Increments to 0.8.0-rc0.

Testing Done:
pex.{crawler,link,http} have improved coverage over their predecessors. The only thing I can think of that might be worse is that UrllibContext does nothing to try to recover from errors -- it's mostly assumed that people will use the RequestsContext.

Reviewed at https://rbcommons.com/s/twitter/r/778/
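For readers unfamiliar with the new layering, here is a minimal sketch of what a custom Context might look like, assuming only the contract stated above: a subclass implements open(link) and returns a file-like object. The subclass name, the logging, and the urllib fallback are illustrative and not part of this change; the crawler added below consumes pages through context.read(link), which the base class presumably builds on top of open().

# Hypothetical sketch, not part of this commit. Assumes only the stated contract:
# a Context subclass implements open(link) and returns a file-like object.
from pex.http import Context

try:
  from urllib.request import urlopen  # Python 3
except ImportError:
  from urllib2 import urlopen  # Python 2


class LoggingUrllibContext(Context):
  """Fetch links with urllib and print each URL as it is opened (illustrative only)."""

  def open(self, link):
    print('fetching %s' % link.url)  # Link is just a wrapper around a URL in this commit
    return urlopen(link.url)         # any file-like object satisfies the contract

Such a context can be passed directly to the new Crawler (see the usage sketch after the crawler.py listing below) instead of relying on the default chosen by Context.get().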
Showing 28 changed files with 749 additions and 882 deletions.
@@ -6,4 +6,3 @@ PEX API Reference

   pex
   pex.bin
   pex.http
@@ -0,0 +1,138 @@
# Copyright 2014 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

import os
import re
import threading

from .compatibility import PY3
from .link import Link
from .http import Context
from .tracer import TRACER

if PY3:
  from queue import Empty, Queue
  from urllib.parse import urlparse
else:
  from Queue import Empty, Queue
  from urlparse import urlparse


class PageParser(object):
  HREF_RE = re.compile(r"""href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))""", re.I | re.S)
  REL_RE = re.compile(r"""<[^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*>""", re.I)
  REL_SKIP_EXTENSIONS = frozenset(['.zip', '.tar', '.tar.gz', '.tar.bz2', '.tgz', '.exe'])
  REL_TYPES = frozenset(['homepage', 'download'])

  @classmethod
  def href_match_to_url(cls, match):
    def pick(group):
      return '' if group is None else group
    return pick(match.group(1)) or pick(match.group(2)) or pick(match.group(3))

  @classmethod
  def rel_links(cls, page):
    """return rel= links that should be scraped, skipping obviously data links."""
    for match in cls.REL_RE.finditer(page):
      href, rel = match.group(0), match.group(1)
      if rel not in cls.REL_TYPES:
        continue
      href_match = cls.HREF_RE.search(href)
      if href_match:
        href = cls.href_match_to_url(href_match)
        parsed_href = urlparse(href)
        if any(parsed_href.path.endswith(ext) for ext in cls.REL_SKIP_EXTENSIONS):
          continue
        yield href

  @classmethod
  def links(cls, page):
    """return all links on a page, including potentially rel= links."""
    for match in cls.HREF_RE.finditer(page):
      yield cls.href_match_to_url(match)


def partition(L, pred):
  return filter(lambda v: not pred(v), L), filter(lambda v: pred(v), L)


class Crawler(object):
  @classmethod
  def crawl_local(cls, link):
    try:
      dirents = os.listdir(link.path)
    # except OSError as e:
    except Exception as e:
      TRACER.log('Failed to read %s: %s' % (link.path, e), V=1)
      return set(), set()
    files, dirs = partition([os.path.join(link.path, fn) for fn in dirents], os.path.isdir)
    return set(map(Link.from_filename, files)), set(map(Link.from_filename, dirs))

  @classmethod
  def crawl_remote(cls, context, link):
    try:
      content = context.read(link)
    # except context.Error as e:
    except Exception as e:
      TRACER.log('Failed to read %s: %s' % (link.url, e), V=1)
      return set(), set()
    links = set(link.join(href) for href in PageParser.links(content))
    rel_links = set(link.join(href) for href in PageParser.rel_links(content))
    return links, rel_links

  @classmethod
  def crawl_link(cls, context, link):
    if link.local:
      return cls.crawl_local(link)
    elif link.remote:
      return cls.crawl_remote(context, link)
    else:
      TRACER.log('Failed to crawl %s: unknown scheme %s' % (link.url, link.scheme))
      return set(), set()

  def __init__(self, context=None, threads=1):
    self._threads = threads
    self.context = context or Context.get()

  def crawl(self, link_or_links, follow_links=False):
    links, seen = set(), set()
    queue = Queue()
    converged = threading.Event()

    def execute():
      while not converged.is_set():
        try:
          link = queue.get(timeout=0.1)
        except Empty:
          continue
        if link not in seen:
          seen.add(link)
          try:
            roots, rels = self.crawl_link(self.context, link)
          except Exception as e:
            TRACER.log('Unknown exception encountered: %s' % e)
            continue
          links.update(roots)
          if follow_links:
            for rel in rels:
              if rel not in seen:
                queue.put(rel)
        queue.task_done()

    for link in Link.wrap_iterable(link_or_links):
      queue.put(link)

    workers = []
    for _ in range(self._threads):
      worker = threading.Thread(target=execute)
      workers.append(worker)
      worker.daemon = True
      worker.start()

    queue.join()
    converged.set()

    for worker in workers:
      worker.join()

    return links
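A short, hypothetical usage sketch of the class above (not part of the commit): the crawler takes a Context (defaulting to Context.get()) and a thread count, and crawl() returns the set of Links discovered. The index URL and thread count are arbitrary, and it assumes Link.wrap_iterable accepts a plain URL string, as its name suggests.

# Illustrative only -- not part of this commit.
from pex.crawler import Crawler
from pex.http import Context

crawler = Crawler(context=Context.get(), threads=4)
links = crawler.crawl('https://pypi.python.org/simple/', follow_links=True)
for link in links:
  print(link.url)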