diff --git a/docs/install.rst b/docs/install.rst index f6df8ca1d..0d7982243 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -8,6 +8,7 @@ WeasyPrint |version| depends on: * Pango_ * CFFI_ ≥ 0.5 * lxml_ +* html5lib ≥ 1.0b3 * cairocffi_ ≥ 0.3 * tinycss_ = 0.3 * cssselect_ ≥ 0.6 diff --git a/setup.py b/setup.py index 403bb7dd6..e2da1a73e 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,7 @@ REQUIREMENTS = [ # XXX: Keep this in sync with docs/install.rst 'lxml', + 'html5lib>=1.0b3', 'tinycss==0.3', 'cssselect>=0.6', 'CairoSVG>=0.4.1', diff --git a/weasyprint/__init__.py b/weasyprint/__init__.py index 62274a54b..4bf783de1 100644 --- a/weasyprint/__init__.py +++ b/weasyprint/__init__.py @@ -27,10 +27,11 @@ import contextlib -import lxml.etree +import html5lib from .urls import (fetch, default_url_fetcher, path2url, ensure_url, url_is_absolute) +from .compat import unicode from .logger import LOGGER # Some import are at the end of the file (after the CSS class) is defined # to work around circular imports. 
@@ -81,16 +82,15 @@ def __init__(self, guess=None, filename=None, url=None, file_obj=None, if source_type == 'tree': result = source else: - if source_type == 'string': - parse = lxml.etree.fromstring - else: - parse = lxml.etree.parse if not encoding: encoding = protocol_encoding - parser = lxml.etree.HTMLParser(encoding=encoding) - result = parse(source, parser=parser) - if result is None: - raise ValueError('Error while parsing HTML') + if isinstance(source, unicode): + encoding = None + result = html5lib.parse( + source, treebuilder='lxml', encoding=encoding, + namespaceHTMLElements=False) + if result is None: + raise ValueError('Error while parsing HTML') base_url = find_base_url(result, base_url) if hasattr(result, 'getroot'): result.docinfo.URL = base_url diff --git a/weasyprint/css/tests_ua.css b/weasyprint/css/tests_ua.css index 765da4ac0..e387edbe0 100644 --- a/weasyprint/css/tests_ua.css +++ b/weasyprint/css/tests_ua.css @@ -10,16 +10,16 @@ br:before { content: '\A'; white-space: pre-line } ol { list-style-type: decimal } ol, ul { counter-reset: list-item } -table { display: table; +table, x-table { display: table; box-sizing: border-box } -tr { display: table-row } -thead { display: table-header-group } -tbody { display: table-row-group } -tfoot { display: table-footer-group } -col { display: table-column } -colgroup { display: table-column-group } -td, th { display: table-cell } -caption { display: table-caption } +tr, x-tr { display: table-row } +thead, x-thead { display: table-header-group } +tbody, x-tbody { display: table-row-group } +tfoot, x-tfoot { display: table-footer-group } +col, x-col { display: table-column } +colgroup, x-colgroup { display: table-column-group } +td, th, x-td, x-th { display: table-cell } +caption, x-caption { display: table-caption } *[lang] { -weasy-lang: attr(lang); } a[href] { -weasy-link: attr(href); } diff --git a/weasyprint/tests/test_boxes.py b/weasyprint/tests/test_boxes.py index 6ed840eb9..196f2f95e 100644 --- a/weasyprint/tests/test_boxes.py +++ 
b/weasyprint/tests/test_boxes.py @@ -13,6 +13,8 @@ from __future__ import division, unicode_literals import functools +import pprint +import difflib from .testing_utils import ( resource_filename, TestHTML, assert_no_logs, capture_logs) @@ -116,7 +118,13 @@ def assert_tree(box, expected): expected: a list of serialized children as returned by to_lists(). """ - assert to_lists(box) == expected + lists = to_lists(box) + if lists != expected: + print(''.join(difflib.unified_diff( + *(pprint.pformat(v).splitlines(True) + for v in [lists, expected]), + n=9999))) + assert lists == expected def sanity_checks(box): @@ -251,10 +259,10 @@ def test_block_in_inline(): box = parse('''

Lorem ipsum dolor sit - amet,conse

''') + amet,conse

''') box = build.inline_in_block(box) assert_tree(box, [ ('body', 'Line', [ @@ -277,7 +285,7 @@ def test_block_in_inline(): ('span', 'Line', [ ('em', 'Inline', [ ('em', 'Text', 'conse'), - ('div', 'Block', [])])])])])])])])]) + ('i', 'Block', [])])])])])])])])]) box = build.block_in_inline(box) assert_tree(box, [ @@ -312,7 +320,7 @@ def test_block_in_inline(): ('span', 'Line', [ ('em', 'Inline', [ ('em', 'Text', 'conse')])])]), - ('div', 'Block', []), + ('i', 'Block', []), ('span', 'AnonBlock', [ ('span', 'Line', [ ('em', 'Inline', [])])])]), @@ -481,48 +489,48 @@ def test_tables(): # Rule 1.3 # Also table model: http://www.w3.org/TR/CSS21/tables.html#model assert_tree(parse_all(''' - - - - - - - - - - - - - - -
foobar
top caption
baz
+ + + foo + bar + + + + + + + top caption + + baz + + '''), [ - ('table', 'AnonBlock', [ - ('caption', 'TableCaption', [ - ('caption', 'Line', [ - ('caption', 'Text', 'top caption')])]), - ('table', 'Table', [ - ('table', 'AnonTableColumnGroup', [ - ('col', 'TableColumn', [])]), - ('thead', 'TableRowGroup', [ - ('thead', 'AnonTableRow', [ - ('th', 'TableCell', [])])]), - ('table', 'AnonTableRowGroup', [ - ('tr', 'TableRow', [ - ('th', 'TableCell', [ - ('th', 'Line', [ - ('th', 'Text', 'foo')])]), - ('th', 'TableCell', [ - ('th', 'Line', [ - ('th', 'Text', 'bar')])])])]), - ('thead', 'TableRowGroup', []), - ('table', 'AnonTableRowGroup', [ - ('tr', 'TableRow', [ - ('td', 'TableCell', [ - ('td', 'Line', [ - ('td', 'Text', 'baz')])])])]), - ('tfoot', 'TableRowGroup', [])]), - ('caption', 'TableCaption', [])])]) + ('x-table', 'AnonBlock', [ + ('x-caption', 'TableCaption', [ + ('x-caption', 'Line', [ + ('x-caption', 'Text', 'top caption')])]), + ('x-table', 'Table', [ + ('x-table', 'AnonTableColumnGroup', [ + ('x-col', 'TableColumn', [])]), + ('x-thead', 'TableRowGroup', [ + ('x-thead', 'AnonTableRow', [ + ('x-th', 'TableCell', [])])]), + ('x-table', 'AnonTableRowGroup', [ + ('x-tr', 'TableRow', [ + ('x-th', 'TableCell', [ + ('x-th', 'Line', [ + ('x-th', 'Text', 'foo')])]), + ('x-th', 'TableCell', [ + ('x-th', 'Line', [ + ('x-th', 'Text', 'bar')])])])]), + ('x-thead', 'TableRowGroup', []), + ('x-table', 'AnonTableRowGroup', [ + ('x-tr', 'TableRow', [ + ('x-td', 'TableCell', [ + ('x-td', 'Line', [ + ('x-td', 'Text', 'baz')])])])]), + ('x-tfoot', 'TableRowGroup', [])]), + ('x-caption', 'TableCaption', [])])]) # Rules 1.4 and 3.1 assert_tree(parse_all(''' @@ -562,72 +570,74 @@ def test_tables(): ('ins', 'AnonTableColumn', [])])])])]) # Rules 2.1 then 2.3 - assert_tree(parse_all('foo
'), [ - ('table', 'AnonBlock', [ - ('table', 'Table', [ - ('table', 'AnonTableRowGroup', [ - ('table', 'AnonTableRow', [ - ('table', 'AnonTableCell', [ - ('table', 'AnonBlock', [ - ('table', 'Line', [ - ('table', 'Text', 'foo ')])]), + assert_tree(parse_all('foo
'), [ + ('x-table', 'AnonBlock', [ + ('x-table', 'Table', [ + ('x-table', 'AnonTableRowGroup', [ + ('x-table', 'AnonTableRow', [ + ('x-table', 'AnonTableCell', [ + ('x-table', 'AnonBlock', [ + ('x-table', 'Line', [ + ('x-table', 'Text', 'foo ')])]), ('div', 'Block', [])])])])])])]) # Rule 2.2 - assert_tree(parse_all('
'), [ + assert_tree(parse_all('' + '
'), [ ('body', 'AnonBlock', [ ('body', 'AnonTable', [ - ('thead', 'TableRowGroup', [ - ('thead', 'AnonTableRow', [ - ('thead', 'AnonTableCell', [ + ('x-thead', 'TableRowGroup', [ + ('x-thead', 'AnonTableRow', [ + ('x-thead', 'AnonTableCell', [ ('div', 'Block', [])]), - ('td', 'TableCell', [])])])])])]) + ('x-td', 'TableCell', [])])])])])]) # TODO: re-enable this once we support inline-table -# # Rule 3.2 -# assert_tree(parse_all(''), [ -# ('body', 'Line', [ -# ('span', 'Inline', [ -# ('span', 'AnonInlineBlock', [ -# ('span', 'AnonInlineTable', [ -# ('span', 'AnonTableRowGroup', [ -# ('tr', 'TableRow', [])])])])])])]) - -# # Rule 3.1 -# # Also, rule 1.3 does not apply: whitespace before and after is preserved -# assert_tree(parse_all(''' -# -# -# -# -# '''), [ -# ('body', 'Line', [ -# ('span', 'Inline', [ -# # Whitespace is preserved in table handling, then collapsed -# # into a single space. -# ('span', 'Text', ' '), -# ('span', 'AnonInlineBlock', [ -# ('span', 'AnonInlineTable', [ -# ('span', 'AnonTableRowGroup', [ -# ('span', 'AnonTableRow', [ -# ('em', 'TableCell', []), -# ('em', 'TableCell', [])])])])]), -# ('span', 'Text', ' ')])])]) + # Rule 3.2 + assert_tree(parse_all(''), [ + ('body', 'Line', [ + ('span', 'Inline', [ + ('span', 'AnonInlineBlock', [ + ('span', 'AnonInlineTable', [ + ('span', 'AnonTableRowGroup', [ + ('x-tr', 'TableRow', [])])])])])])]) + + # Rule 3.1 + # Also, rule 1.3 does not apply: whitespace before and after is preserved + assert_tree(parse_all(''' + + + + + '''), [ + ('body', 'Line', [ + ('span', 'Inline', [ + # Whitespace is preserved in table handling, then collapsed + # into a single space. 
+ ('span', 'Text', ' '), + ('span', 'AnonInlineBlock', [ + ('span', 'AnonInlineTable', [ + ('span', 'AnonTableRowGroup', [ + ('span', 'AnonTableRow', [ + ('em', 'TableCell', []), + ('em', 'TableCell', [])])])])]), + ('span', 'Text', ' ')])])]) # Rule 3.2 - assert_tree(parse_all('\t'), [ + assert_tree(parse_all('\t'), [ ('body', 'AnonBlock', [ ('body', 'AnonTable', [ ('body', 'AnonTableRowGroup', [ - ('tr', 'TableRow', []), - ('tr', 'TableRow', [])])])])]) - assert_tree(parse_all('\n'), [ + ('x-tr', 'TableRow', []), + ('x-tr', 'TableRow', [])])])])]) + + assert_tree(parse_all('\n'), [ ('body', 'AnonBlock', [ ('body', 'AnonTable', [ ('body', 'AnonTableColumnGroup', [ - ('col', 'TableColumn', [])]), - ('colgroup', 'TableColumnGroup', [ - ('colgroup', 'AnonTableColumn', [])])])])]) + ('x-col', 'TableColumn', [])]), + ('x-colgroup', 'TableColumnGroup', [ + ('x-colgroup', 'AnonTableColumn', [])])])])]) @assert_no_logs diff --git a/weasyprint/tests/test_draw.py b/weasyprint/tests/test_draw.py index 14fcbd519..65f23281a 100644 --- a/weasyprint/tests/test_draw.py +++ b/weasyprint/tests/test_draw.py @@ -24,6 +24,7 @@ from ..compat import xrange, izip, ints_from_bytes from ..urls import ensure_url +from ..html import HTML_HANDLERS from .. import HTML from .testing_utils import ( resource_filename, TestHTML, FONTS, assert_no_logs, capture_logs) @@ -1459,37 +1460,42 @@ def test_visibility(): @assert_no_logs @requires_cairo_1_12 def test_tables(): + # TODO: refactor colspan/rowspan into CSS: + # td, th { column-span: attr(colspan integer) } + HTML_HANDLERS['x-td'] = HTML_HANDLERS['td'] + HTML_HANDLERS['x-th'] = HTML_HANDLERS['th'] + source = ''' - - - - - - - - - - - - - - - - - - - - - -
+ + + + + + + + + + + + + + + + + + + + + + ''' r = as_pixel(b'\xff\x7f\x7f\xff') # rgba(255, 0, 0, 0.5) above #fff R = as_pixel(b'\xff\x3f\x3f\xff') # r above r above #fff @@ -1526,8 +1532,8 @@ def test_tables(): _+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+_, _+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_, ], source % {'extra_css': ''' - table { border-color: #00f; table-layout: fixed } - td { border-color: rgba(255, 0, 0, 0.5) } + x-table { border-color: #00f; table-layout: fixed } + x-td { border-color: rgba(255, 0, 0, 0.5) } '''}) assert_pixels('table_collapsed_borders', 28, 28, [ @@ -1560,9 +1566,9 @@ def test_tables(): _+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_, _+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_, ], source % {'extra_css': ''' - table { border: 2px solid #00f; table-layout: fixed; - border-collapse: collapse } - td { border-color: #ff7f7f } + x-table { border: 2px solid #00f; table-layout: fixed; + border-collapse: collapse } + x-td { border-color: #ff7f7f } '''}) assert_pixels('table_collapsed_borders_paged', 28, 52, [ @@ -1619,9 +1625,9 @@ def test_tables(): _+g+g+g+g+g+g+g+g+g+g+g+g+g+g+g+g+g+g+g+g+g+g+g+g+g+g+_, _+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_, ], source % {'extra_css': ''' - table { border: solid #00f; border-width: 8px 2px; - table-layout: fixed; border-collapse: collapse } - td { border-color: #ff7f7f } + x-table { border: solid #00f; border-width: 8px 2px; + table-layout: fixed; border-collapse: collapse } + x-td { border-color: #ff7f7f } @page { size: 28px 26px; margin: 1px; border: 1px solid rgba(0, 255, 0, 0.5); } '''}) @@ -1656,8 +1662,8 @@ def test_tables(): _+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+_, _+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_, ], source % {'extra_css': ''' - table { border-color: #00f; table-layout: fixed } - td { background: rgba(255, 0, 0, 0.5) } + x-table { border-color: #00f; table-layout: fixed } + x-td { background: 
rgba(255, 0, 0, 0.5) } '''}) assert_pixels('table_column_backgrounds', 28, 28, [ @@ -1690,9 +1696,9 @@ def test_tables(): _+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+_, _+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_, ], source % {'extra_css': ''' - table { border-color: #00f; table-layout: fixed } - colgroup { background: rgba(255, 0, 0, 0.5) } - col { background: rgba(0, 255, 0, 0.5) } + x-table { border-color: #00f; table-layout: fixed } + x-colgroup { background: rgba(255, 0, 0, 0.5) } + x-col { background: rgba(0, 255, 0, 0.5) } '''}) assert_pixels('table_row_backgrounds', 28, 28, [ @@ -1725,9 +1731,9 @@ def test_tables(): _+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+B+_, _+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_+_, ], source % {'extra_css': ''' - table { border-color: #00f; table-layout: fixed } - tbody { background: rgba(255, 0, 0, 0.5) } - tr { background: rgba(0, 255, 0, 0.5) } + x-table { border-color: #00f; table-layout: fixed } + x-tbody { background: rgba(255, 0, 0, 0.5) } + x-tr { background: rgba(0, 255, 0, 0.5) } '''}) r = as_pixel(b'\xff\x00\x00\xff') @@ -1842,7 +1848,7 @@ def test_before_after(): body { margin: 0; background: #fff } a[href]:before { content: '[' attr(href) '] ' } -

some content

+

some content

'''), ('pseudo_before_reference', ''' -
+
''') page_divs = [] for page in pages: @@ -2603,10 +2603,10 @@ def test_text_align_justify(): @page { size: 300px 1000px } body { text-align: justify } -

 +

-  -  + + ''') @@ -2655,8 +2655,7 @@ def test_word_spacing(): # (Not a string.) page, = parse(''' - Lorem ipsum dolorsit amet - ''') + Lorem ipsum dolorsit amet''') html, = page.children body, = html.children line, = body.children @@ -2667,8 +2666,7 @@ def test_word_spacing(): # of a TextBox. Is this what we want? page, = parse(''' - Lorem ipsum dolorsit amet - ''') + Lorem ipsum dolorsit amet''') html, = page.children body, = html.children line, = body.children @@ -2680,8 +2678,7 @@ def test_word_spacing(): def test_letter_spacing(): """Test letter-spacing.""" page, = parse(''' - Supercalifragilisticexpialidocious - ''') + Supercalifragilisticexpialidocious''') html, = page.children body, = html.children line, = body.children @@ -2690,8 +2687,7 @@ def test_letter_spacing(): page, = parse(''' - Supercalifragilisticexpialidocious - ''') + Supercalifragilisticexpialidocious''') html, = page.children body, = html.children line, = body.children @@ -2851,8 +2847,8 @@ def test_table_column_width(): with capture_logs() as logs: page, = parse(source) assert len(logs) == 1 - assert logs[0] == ('WARNING: This table row has more columns than ' - 'the table, ignored 1 cells: (,)') + assert logs[0].startswith('WARNING: This table row has more columns than ' + 'the table, ignored 1 cell') html, = page.children body, = html.children wrapper, = body.children @@ -2975,8 +2971,8 @@ def test_table_row_height(): X
X
X - X
+ +
X
X X X diff --git a/weasyprint/tests/test_stacking.py b/weasyprint/tests/test_stacking.py index 8f4e7decc..17f87aacc 100644 --- a/weasyprint/tests/test_stacking.py +++ b/weasyprint/tests/test_stacking.py @@ -21,14 +21,10 @@ def to_lists(page): return serialize_stacking(StackingContext.from_box(html, page)) -def serialize_box(box): - return '%s %s' % (box.element_tag, box.sourceline) - - def serialize_stacking(context): return ( - serialize_box(context.box), - [serialize_box(b) for b in context.blocks_and_cells], + context.box.element_tag, + [b.element_tag for b in context.blocks_and_cells], [serialize_stacking(c) for c in context.zero_z_contexts], ) @@ -39,14 +35,14 @@ def test_nested():

-

+
''') assert to_lists(page) == ( - 'html 1', - ['body 1', 'p 1'], + 'html', + ['body', 'p'], [( - 'div 2', - ['p 3'], + 'div', + ['p'], [])]) page, = parse('''\ @@ -55,10 +51,10 @@ def test_nested():
''') assert to_lists(page) == ( - 'html 1', - ['body 1'], - [('div 1', [], []), # In this order - ('p 2', [], [])]) + 'html', + ['body'], + [('div', [], []), # In this order + ('p', [], [])]) @assert_no_logs