Skip to content

Commit

Permalink
Do not require HtmlElement.
Browse files Browse the repository at this point in the history
* Do not use element.base_url which only exists in lxml.html.HtmlElement
* Use lxml.etree.HtmlParser instead of lxml.html

This is one step toward using the html5lib parser, but see
Kozea#12
  • Loading branch information
SimonSapin committed Nov 6, 2012
1 parent 63a2f35 commit 6f56613
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 22 deletions.
8 changes: 4 additions & 4 deletions weasyprint/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class HTML(object):
def __init__(self, guess=None, filename=None, url=None, file_obj=None,
string=None, tree=None, encoding=None, base_url=None,
url_fetcher=default_url_fetcher, media_type='print'):
import lxml.html
import lxml.etree
from .html import find_base_url
from .urls import wrap_url_fetcher
url_fetcher = wrap_url_fetcher(url_fetcher)
Expand All @@ -82,12 +82,12 @@ def __init__(self, guess=None, filename=None, url=None, file_obj=None,
result = source
else:
if source_type == 'string':
parse = lxml.html.document_fromstring
parse = lxml.etree.fromstring
else:
parse = lxml.html.parse
parse = lxml.etree.parse
if not encoding:
encoding = protocol_encoding
parser = lxml.html.HTMLParser(encoding=encoding)
parser = lxml.etree.HTMLParser(encoding=encoding)
result = parse(source, parser=parser)
if result is None:
raise ValueError('Error while parsing HTML')
Expand Down
6 changes: 3 additions & 3 deletions weasyprint/css/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from . import properties
from . import computed_values
from .validation import preprocess_declarations
from ..urls import get_url_attribute, url_join
from ..urls import element_base_url, get_url_attribute, url_join
from ..logger import LOGGER
from ..compat import iteritems
from .. import CSS
Expand Down Expand Up @@ -171,7 +171,7 @@ def find_stylesheets(element_tree, device_media_type, url_fetcher):
content = ''.join(content)
# lxml should give us either unicode or ASCII-only bytestrings, so
# we don't need `encoding` here.
css = CSS(string=content, base_url=element.base_url,
css = CSS(string=content, base_url=element_base_url(element),
url_fetcher=url_fetcher, media_type=device_media_type)
yield css
elif element.tag == 'link' and element.get('href'):
Expand All @@ -196,7 +196,7 @@ def find_style_attributes(element_tree):
declarations, errors = parser.parse_style_attr(style_attribute)
for error in errors:
LOGGER.warn(error)
yield element, declarations, element.base_url
yield element, declarations, element_base_url(element)


def evaluate_media_query(query_list, device_media_type):
Expand Down
10 changes: 6 additions & 4 deletions weasyprint/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
import shutil
import tempfile

import lxml.html, lxml.etree
import lxml.html
import lxml.etree
import cairo
import pytest

Expand All @@ -38,6 +39,7 @@

CHDIR_LOCK = threading.Lock()


@contextlib.contextmanager
def chdir(path):
"""Change the current directory in a context manager."""
Expand Down Expand Up @@ -100,7 +102,7 @@ def _test_resource(class_, basename, check, **kwargs):
check(class_(string=content, base_url=relative_filename, **kwargs))
encoding = kwargs.get('encoding') or 'utf8'
check(class_(string=content.decode(encoding), # unicode
base_url=relative_filename, **kwargs))
base_url=relative_filename, **kwargs))
with pytest.raises(TypeError):
class_(filename='foo', url='bar')

Expand All @@ -117,11 +119,11 @@ def check_doc1(html, has_base_url=True):
h1 = body[0]
assert h1.text == 'WeasyPrint test document (with Ünicōde)'
if has_base_url:
url = urljoin(h1.base_url, 'pattern.png')
url = urljoin(html.base_url, 'pattern.png')
assert url.startswith('file:')
assert url.endswith('weasyprint/tests/resources/pattern.png')
else:
assert h1.base_url is None
assert html.base_url is None

_test_resource(TestHTML, 'doc1.html', check_doc1)
_test_resource(TestHTML, 'doc1_UTF-16BE.html', check_doc1,
Expand Down
35 changes: 24 additions & 11 deletions weasyprint/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,16 @@ def url_is_absolute(url):
.match(url))


def element_base_url(element):
    """Return the base URL of the lxml document that contains *element*.

    Equivalent to the ``lxml.html.HtmlElement.base_url`` property, but
    implemented through ``getroottree()`` so it works on any lxml element
    and does not require ``HtmlElement``.
    """
    tree = element.getroottree()
    return tree.docinfo.URL


def get_url_attribute(element, attr_name):
"""Get the URI corresponding to the ``attr_name`` attribute.
Expand All @@ -105,8 +115,9 @@ def get_url_attribute(element, attr_name):
"""
value = element.get(attr_name, '').strip()
if value:
return url_join(element.base_url, value, '<%s %s="%s"> at line %d',
element.tag, attr_name, value, element.sourceline)
return url_join(element_base_url(element), value,
'<%s %s="%s"> at line %s', element.tag, attr_name,
value, element.sourceline)


def url_join(base_url, url, context, *args):
Expand Down Expand Up @@ -135,10 +146,11 @@ def get_link_attribute(element, attr_name):
return 'internal', unquote(attr_value[1:])
else:
uri = get_url_attribute(element, attr_name)
if uri and element.base_url:
document_url = element_base_url(element)
if uri and document_url:
parsed = urlsplit(uri)
# Compare with fragments removed
if parsed[:-1] == urlsplit(element.base_url)[:-1]:
if parsed[:-1] == urlsplit(document_url)[:-1]:
return 'internal', unquote(parsed.fragment)
else:
return 'external', uri
Expand Down Expand Up @@ -169,7 +181,7 @@ def safe_base64_decode(data):
"""
missing_padding = 4 - len(data) % 4
if missing_padding:
data += b'='* missing_padding
data += b'=' * missing_padding
return base64_decode(data)


Expand All @@ -194,7 +206,7 @@ def open_data_url(url):
semi = header.rfind(';')
if semi >= 0 and '=' not in header[semi:]:
content_type = header[:semi]
encoding = header[semi+1:]
encoding = header[semi + 1:]
else:
content_type = header
encoding = ''
Expand Down Expand Up @@ -228,17 +240,18 @@ def default_url_fetcher(url):
in the message.
:returns: In case of success, a dict with the following keys:
* One of ``string`` (a byte string) or ``file_obj`` (a file-like object)
* One of ``string`` (a byte string) or ``file_obj``
(a file-like object)
* Optionally: ``mime_type``, a MIME type extracted eg. from a
*Content-Type* header. If not provided, the type is guessed from the
file extension in the URL.
* Optionally: ``encoding``, a character encoding extracted eg. from a
*charset* parameter in a *Content-Type* header
* Optionally: ``redirected_url``, the actual URL of the ressource in case
there were eg. HTTP redirects.
* Optionally: ``redirected_url``, the actual URL of the resource
in case there were eg. HTTP redirects.
If a ``file_obj`` key is given, it is the caller’s responsability to call
``file_obj.close()``.
If a ``file_obj`` key is given, it is the caller’s responsibility
to call ``file_obj.close()``.
"""
if url.startswith('data:'):
Expand Down

0 comments on commit 6f56613

Please sign in to comment.