From 6f56613441ec019903022dc30a521e881a46cd80 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 6 Nov 2012 13:55:39 +0100 Subject: [PATCH] Do not require HtmlElement. * Do not use element.base_url which only exists in lxml.html.HtmlElement * Use lxml.etree.HtmlParser instead of lxml.html This is one step toward using the html5lib parser, but see https://github.com/Kozea/WeasyPrint/pull/12 --- weasyprint/__init__.py | 8 ++++---- weasyprint/css/__init__.py | 6 +++--- weasyprint/tests/test_api.py | 10 ++++++---- weasyprint/urls.py | 35 ++++++++++++++++++++++++----------- 4 files changed, 37 insertions(+), 22 deletions(-) diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py index e0d9578bb..63c8afe3f 100644 --- a/weasyprint/__init__.py +++ b/weasyprint/__init__.py @@ -69,7 +69,7 @@ class HTML(object): def __init__(self, guess=None, filename=None, url=None, file_obj=None, string=None, tree=None, encoding=None, base_url=None, url_fetcher=default_url_fetcher, media_type='print'): - import lxml.html + import lxml.etree from .html import find_base_url from .urls import wrap_url_fetcher url_fetcher = wrap_url_fetcher(url_fetcher) @@ -82,12 +82,12 @@ def __init__(self, guess=None, filename=None, url=None, file_obj=None, result = source else: if source_type == 'string': - parse = lxml.html.document_fromstring + parse = lxml.etree.fromstring else: - parse = lxml.html.parse + parse = lxml.etree.parse if not encoding: encoding = protocol_encoding - parser = lxml.html.HTMLParser(encoding=encoding) + parser = lxml.etree.HTMLParser(encoding=encoding) result = parse(source, parser=parser) if result is None: raise ValueError('Error while parsing HTML') diff --git a/weasyprint/css/__init__.py b/weasyprint/css/__init__.py index 7d805e1e5..b9f8c27ba 100644 --- a/weasyprint/css/__init__.py +++ b/weasyprint/css/__init__.py @@ -29,7 +29,7 @@ from . import properties from . 
import computed_values from .validation import preprocess_declarations -from ..urls import get_url_attribute, url_join +from ..urls import element_base_url, get_url_attribute, url_join from ..logger import LOGGER from ..compat import iteritems from .. import CSS @@ -171,7 +171,7 @@ def find_stylesheets(element_tree, device_media_type, url_fetcher): content = ''.join(content) # lxml should give us either unicode or ASCII-only bytestrings, so # we don't need `encoding` here. - css = CSS(string=content, base_url=element.base_url, + css = CSS(string=content, base_url=element_base_url(element), url_fetcher=url_fetcher, media_type=device_media_type) yield css elif element.tag == 'link' and element.get('href'): @@ -196,7 +196,7 @@ def find_style_attributes(element_tree): declarations, errors = parser.parse_style_attr(style_attribute) for error in errors: LOGGER.warn(error) - yield element, declarations, element.base_url + yield element, declarations, element_base_url(element) def evaluate_media_query(query_list, device_media_type): diff --git a/weasyprint/tests/test_api.py b/weasyprint/tests/test_api.py index 4ffc5cf76..401665289 100644 --- a/weasyprint/tests/test_api.py +++ b/weasyprint/tests/test_api.py @@ -21,7 +21,8 @@ import shutil import tempfile -import lxml.html, lxml.etree +import lxml.html +import lxml.etree import cairo import pytest @@ -38,6 +39,7 @@ CHDIR_LOCK = threading.Lock() + @contextlib.contextmanager def chdir(path): """Change the current directory in a context manager.""" @@ -100,7 +102,7 @@ def _test_resource(class_, basename, check, **kwargs): check(class_(string=content, base_url=relative_filename, **kwargs)) encoding = kwargs.get('encoding') or 'utf8' check(class_(string=content.decode(encoding), # unicode - base_url=relative_filename, **kwargs)) + base_url=relative_filename, **kwargs)) with pytest.raises(TypeError): class_(filename='foo', url='bar') @@ -117,11 +119,11 @@ def check_doc1(html, has_base_url=True): h1 = body[0] assert h1.text == 
'WeasyPrint test document (with Ünicōde)' if has_base_url: - url = urljoin(h1.base_url, 'pattern.png') + url = urljoin(html.base_url, 'pattern.png') assert url.startswith('file:') assert url.endswith('weasyprint/tests/resources/pattern.png') else: - assert h1.base_url is None + assert html.base_url is None _test_resource(TestHTML, 'doc1.html', check_doc1) _test_resource(TestHTML, 'doc1_UTF-16BE.html', check_doc1, diff --git a/weasyprint/urls.py b/weasyprint/urls.py index ec9d503d2..ccd15b59c 100644 --- a/weasyprint/urls.py +++ b/weasyprint/urls.py @@ -92,6 +92,16 @@ def url_is_absolute(url): .match(url)) +def element_base_url(element): + """Return the URL associated with an lxml document. + + This is the same as the HtmlElement.base_url property, but we don’t want + to require HtmlElement. + + """ + return element.getroottree().docinfo.URL + + def get_url_attribute(element, attr_name): """Get the URI corresponding to the ``attr_name`` attribute. @@ -105,8 +115,9 @@ def get_url_attribute(element, attr_name): """ value = element.get(attr_name, '').strip() if value: - return url_join(element.base_url, value, '<%s %s="%s"> at line %d', - element.tag, attr_name, value, element.sourceline) + return url_join(element_base_url(element), value, + '<%s %s="%s"> at line %s', element.tag, attr_name, + value, element.sourceline) def url_join(base_url, url, context, *args): @@ -135,10 +146,11 @@ def get_link_attribute(element, attr_name): return 'internal', unquote(attr_value[1:]) else: uri = get_url_attribute(element, attr_name) - if uri and element.base_url: + document_url = element_base_url(element) + if uri and document_url: parsed = urlsplit(uri) # Compare with fragments removed - if parsed[:-1] == urlsplit(element.base_url)[:-1]: + if parsed[:-1] == urlsplit(document_url)[:-1]: return 'internal', unquote(parsed.fragment) else: return 'external', uri @@ -169,7 +181,7 @@ def safe_base64_decode(data): """ missing_padding = 4 - len(data) % 4 if missing_padding: - data += b'='* 
missing_padding + data += b'=' * missing_padding return base64_decode(data) @@ -194,7 +206,7 @@ def open_data_url(url): semi = header.rfind(';') if semi >= 0 and '=' not in header[semi:]: content_type = header[:semi] - encoding = header[semi+1:] + encoding = header[semi + 1:] else: content_type = header encoding = '' @@ -228,17 +240,18 @@ def default_url_fetcher(url): in the message. :returns: In case of success, a dict with the following keys: - * One of ``string`` (a byte string) or ``file_obj`` (a file-like object) + * One of ``string`` (a byte string) or ``file_obj`` + (a file-like object) * Optionally: ``mime_type``, a MIME type extracted eg. from a *Content-Type* header. If not provided, the type is guessed from the file extension in the URL. * Optionally: ``encoding``, a character encoding extracted eg. from a *charset* parameter in a *Content-Type* header - * Optionally: ``redirected_url``, the actual URL of the ressource in case - there were eg. HTTP redirects. + * Optionally: ``redirected_url``, the actual URL of the resource + in case there were eg. HTTP redirects. - If a ``file_obj`` key is given, it is the caller’s responsability to call - ``file_obj.close()``. + If a ``file_obj`` key is given, it is the caller’s responsibility + to call ``file_obj.close()``. """ if url.startswith('data:'):