Skip to content

Commit

Permalink
Do not require HtmlElement.
Browse files Browse the repository at this point in the history
* Do not use element.base_url which only exists in lxml.html.HtmlElement
* Use lxml.etree.HtmlParser instead of lxml.html

This is one step toward using the html5lib parser, but see
Kozea#12
  • Loading branch information
SimonSapin committed Nov 6, 2012
1 parent 63a2f35 commit 6f56613
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 22 deletions.
8 changes: 4 additions & 4 deletions weasyprint/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class HTML(object):
def __init__(self, guess=None, filename=None, url=None, file_obj=None,
string=None, tree=None, encoding=None, base_url=None,
url_fetcher=default_url_fetcher, media_type='print'):
import lxml.html
import lxml.etree
from .html import find_base_url
from .urls import wrap_url_fetcher
url_fetcher = wrap_url_fetcher(url_fetcher)
Expand All @@ -82,12 +82,12 @@ def __init__(self, guess=None, filename=None, url=None, file_obj=None,
result = source
else:
if source_type == 'string':
parse = lxml.html.document_fromstring
parse = lxml.etree.fromstring
else:
parse = lxml.html.parse
parse = lxml.etree.parse
if not encoding:
encoding = protocol_encoding
parser = lxml.html.HTMLParser(encoding=encoding)
parser = lxml.etree.HTMLParser(encoding=encoding)
result = parse(source, parser=parser)
if result is None:
raise ValueError('Error while parsing HTML')
Expand Down
6 changes: 3 additions & 3 deletions weasyprint/css/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from . import properties
from . import computed_values
from .validation import preprocess_declarations
from ..urls import get_url_attribute, url_join
from ..urls import element_base_url, get_url_attribute, url_join
from ..logger import LOGGER
from ..compat import iteritems
from .. import CSS
Expand Down Expand Up @@ -171,7 +171,7 @@ def find_stylesheets(element_tree, device_media_type, url_fetcher):
content = ''.join(content)
# lxml should give us either unicode or ASCII-only bytestrings, so
# we don't need `encoding` here.
css = CSS(string=content, base_url=element.base_url,
css = CSS(string=content, base_url=element_base_url(element),
url_fetcher=url_fetcher, media_type=device_media_type)
yield css
elif element.tag == 'link' and element.get('href'):
Expand All @@ -196,7 +196,7 @@ def find_style_attributes(element_tree):
declarations, errors = parser.parse_style_attr(style_attribute)
for error in errors:
LOGGER.warn(error)
yield element, declarations, element.base_url
yield element, declarations, element_base_url(element)


def evaluate_media_query(query_list, device_media_type):
Expand Down
10 changes: 6 additions & 4 deletions weasyprint/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
import shutil
import tempfile

import lxml.html, lxml.etree
import lxml.html
import lxml.etree
import cairo
import pytest

Expand All @@ -38,6 +39,7 @@

CHDIR_LOCK = threading.Lock()


@contextlib.contextmanager
def chdir(path):
"""Change the current directory in a context manager."""
Expand Down Expand Up @@ -100,7 +102,7 @@ def _test_resource(class_, basename, check, **kwargs):
check(class_(string=content, base_url=relative_filename, **kwargs))
encoding = kwargs.get('encoding') or 'utf8'
check(class_(string=content.decode(encoding), # unicode
base_url=relative_filename, **kwargs))
base_url=relative_filename, **kwargs))
with pytest.raises(TypeError):
class_(filename='foo', url='bar')

Expand All @@ -117,11 +119,11 @@ def check_doc1(html, has_base_url=True):
h1 = body[0]
assert h1.text == 'WeasyPrint test document (with Ünicōde)'
if has_base_url:
url = urljoin(h1.base_url, 'pattern.png')
url = urljoin(html.base_url, 'pattern.png')
assert url.startswith('file:')
assert url.endswith('weasyprint/tests/resources/pattern.png')
else:
assert h1.base_url is None
assert html.base_url is None

_test_resource(TestHTML, 'doc1.html', check_doc1)
_test_resource(TestHTML, 'doc1_UTF-16BE.html', check_doc1,
Expand Down
35 changes: 24 additions & 11 deletions weasyprint/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,16 @@ def url_is_absolute(url):
.match(url))


def element_base_url(element):
    """Return the base URL of the lxml document that contains *element*.

    Equivalent to the ``lxml.html.HtmlElement.base_url`` property, but
    implemented through ``getroottree()`` so it works on any lxml element
    and does not require ``HtmlElement``.
    """
    tree = element.getroottree()
    return tree.docinfo.URL


def get_url_attribute(element, attr_name):
"""Get the URI corresponding to the ``attr_name`` attribute.
Expand All @@ -105,8 +115,9 @@ def get_url_attribute(element, attr_name):
"""
value = element.get(attr_name, '').strip()
if value:
return url_join(element.base_url, value, '<%s %s="%s"> at line %d',
element.tag, attr_name, value, element.sourceline)
return url_join(element_base_url(element), value,
'<%s %s="%s"> at line %s', element.tag, attr_name,
value, element.sourceline)


def url_join(base_url, url, context, *args):
Expand Down Expand Up @@ -135,10 +146,11 @@ def get_link_attribute(element, attr_name):
return 'internal', unquote(attr_value[1:])
else:
uri = get_url_attribute(element, attr_name)
if uri and element.base_url:
document_url = element_base_url(element)
if uri and document_url:
parsed = urlsplit(uri)
# Compare with fragments removed
if parsed[:-1] == urlsplit(element.base_url)[:-1]:
if parsed[:-1] == urlsplit(document_url)[:-1]:
return 'internal', unquote(parsed.fragment)
else:
return 'external', uri
Expand Down Expand Up @@ -169,7 +181,7 @@ def safe_base64_decode(data):
"""
missing_padding = 4 - len(data) % 4
if missing_padding:
data += b'='* missing_padding
data += b'=' * missing_padding
return base64_decode(data)


Expand All @@ -194,7 +206,7 @@ def open_data_url(url):
semi = header.rfind(';')
if semi >= 0 and '=' not in header[semi:]:
content_type = header[:semi]
encoding = header[semi+1:]
encoding = header[semi + 1:]
else:
content_type = header
encoding = ''
Expand Down Expand Up @@ -228,17 +240,18 @@ def default_url_fetcher(url):
in the message.
:returns: In case of success, a dict with the following keys:
* One of ``string`` (a byte string) or ``file_obj`` (a file-like object)
* One of ``string`` (a byte string) or ``file_obj``
(a file-like object)
* Optionally: ``mime_type``, a MIME type extracted eg. from a
*Content-Type* header. If not provided, the type is guessed from the
file extension in the URL.
* Optionally: ``encoding``, a character encoding extracted eg. from a
*charset* parameter in a *Content-Type* header
* Optionally: ``redirected_url``, the actual URL of the ressource in case
there were eg. HTTP redirects.
* Optionally: ``redirected_url``, the actual URL of the resource
in case there were eg. HTTP redirects.
If a ``file_obj`` key is given, it is the caller’s responsability to call
``file_obj.close()``.
If a ``file_obj`` key is given, it is the caller’s responsibility
to call ``file_obj.close()``.
"""
if url.startswith('data:'):
Expand Down

0 comments on commit 6f56613

Please sign in to comment.