Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update html5 tests #2281

Merged
merged 3 commits into from
Jun 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 17 additions & 13 deletions gumbo-parser/src/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -4418,6 +4418,7 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
|| token_has_attribute(token, "size")
)
)
|| tag_in(token, kEndTag, &(const TagSet) { TAG(BR), TAG(P) })
) {
/* Parse error */
parser_add_parse_error(parser, token);
Expand All @@ -4427,20 +4428,23 @@ static void handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
* fragment parsing algorithm, then act as described in the "any other
* start tag" entry below.
*/
if (!is_fragment_parser(parser)) {
do {
pop_current_node(parser);
} while (
!(
is_mathml_integration_point(get_current_node(parser))
|| is_html_integration_point(get_current_node(parser))
|| get_current_node(parser)->v.element.tag_namespace == GUMBO_NAMESPACE_HTML
)
);
parser->_parser_state->_reprocess_current_token = true;
return;
while (
!(
is_mathml_integration_point(get_current_node(parser))
|| is_html_integration_point(get_current_node(parser))
|| get_current_node(parser)->v.element.tag_namespace == GUMBO_NAMESPACE_HTML
)
) {
pop_current_node(parser);
}
// This is a start tag so the next if's then branch will be taken.
// XXX: The spec currently says to handle this using the in body insertion
// mode rules. That seems wrong. See
// <https://github.com/whatwg/html/issues/6808>. Instead, use the current
// insertion mode which seems like it works.
//
// handle_in_body(parser, token);
handle_html_content(parser, token);
return;
}

if (token->type == GUMBO_TOKEN_START_TAG) {
Expand Down
92 changes: 92 additions & 0 deletions gumbo-parser/test/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2220,4 +2220,96 @@ TEST_F(GumboParserTest, FragmentWithoutForm) {
EXPECT_EQ(0, GetChildCount(span));
}

// Fragment parsing with a foreign (SVG) context element.
//
// The input "</p><foo>" is parsed as a fragment whose context element is
// "svg" in the SVG namespace, so the stray </p> end tag arrives while the
// parser is handling foreign content. The expected tree is an <html> root
// holding an HTML-namespace <p> followed by an SVG-namespace <foo>.
TEST_F(GumboParserTest, ForeignFragment) {
  ParseFragment("</p><foo>", "svg", GUMBO_NAMESPACE_SVG);

  // Fatal check: GetChild(root_, 0) below indexes into root_'s children, so
  // there is no point continuing if the count is wrong.
  ASSERT_EQ(1, GetChildCount(root_));
  GumboNode* html = GetChild(root_, 0);
  ASSERT_EQ(GUMBO_NODE_ELEMENT, html->type);
  EXPECT_EQ(GUMBO_TAG_HTML, html->v.element.tag);

  // Fatal check: both children of <html> are fetched and dereferenced below.
  ASSERT_EQ(2, GetChildCount(html));

  // First child: the <p> produced by the stray </p>, in the HTML namespace.
  GumboNode* p = GetChild(html, 0);
  ASSERT_EQ(GUMBO_NODE_ELEMENT, p->type);
  ASSERT_EQ(GUMBO_TAG_P, p->v.element.tag);
  ASSERT_EQ(GUMBO_NAMESPACE_HTML, p->v.element.tag_namespace);

  // Second child: <foo>, which is expected to be in the SVG namespace.
  GumboNode* foo = GetChild(html, 1);
  ASSERT_EQ(GUMBO_NODE_ELEMENT, foo->type);
  ASSERT_EQ(std::string("foo"), foo->v.element.name);
  ASSERT_EQ(GUMBO_NAMESPACE_SVG, foo->v.element.tag_namespace);
}

// Foreign content started inside a table's <colgroup>.
//
// Expected shape of <body> for the input below (four children, in order):
//   1. <svg> (SVG namespace) with two <g> children containing "foo" and "bar"
//   2. <p> (HTML namespace) containing "baz"
//   3. <table> containing a single, empty <colgroup>
//   4. <p> (HTML namespace) containing "quux"
//
// Child counts are checked with ASSERT_EQ (fatal) wherever the following
// lines index into that node's children with GetChild; a wrong count then
// fails the test immediately instead of continuing with a child that may not
// exist. Counts that guard no later indexing stay EXPECT_EQ.
TEST_F(GumboParserTest, FosterParenting) {
  Parse("<!doctype><body><table><colgroup><svg><g>foo</g><g>bar</g><p>baz</table><p>quux");

  ASSERT_EQ(1, GetChildCount(root_));
  GumboNode* html = GetChild(root_, 0);
  ASSERT_EQ(GUMBO_NODE_ELEMENT, html->type);
  EXPECT_EQ(GUMBO_TAG_HTML, html->v.element.tag);
  ASSERT_EQ(2, GetChildCount(html));

  // <body> is the second child of <html>.
  GumboNode* body = GetChild(html, 1);
  ASSERT_EQ(GUMBO_NODE_ELEMENT, body->type);
  EXPECT_EQ(GUMBO_TAG_BODY, body->v.element.tag);
  ASSERT_EQ(4, GetChildCount(body));

  // body[0]: the <svg> element, expected ahead of the <table> in <body>.
  GumboNode* svg = GetChild(body, 0);
  ASSERT_EQ(GUMBO_NODE_ELEMENT, svg->type);
  EXPECT_EQ(GUMBO_TAG_SVG, svg->v.element.tag);
  EXPECT_EQ(GUMBO_NAMESPACE_SVG, svg->v.element.tag_namespace);
  ASSERT_EQ(2, GetChildCount(svg));

  // svg[0]: <g>foo</g>
  GumboNode* g = GetChild(svg, 0);
  ASSERT_EQ(GUMBO_NODE_ELEMENT, g->type);
  EXPECT_EQ(std::string("g"), g->v.element.name);
  EXPECT_EQ(GUMBO_NAMESPACE_SVG, g->v.element.tag_namespace);
  ASSERT_EQ(1, GetChildCount(g));

  GumboNode* text = GetChild(g, 0);
  ASSERT_EQ(GUMBO_NODE_TEXT, text->type);
  EXPECT_EQ(std::string("foo"), text->v.text.text);

  // svg[1]: <g>bar</g>
  g = GetChild(svg, 1);
  ASSERT_EQ(GUMBO_NODE_ELEMENT, g->type);
  EXPECT_EQ(std::string("g"), g->v.element.name);
  EXPECT_EQ(GUMBO_NAMESPACE_SVG, g->v.element.tag_namespace);
  ASSERT_EQ(1, GetChildCount(g));

  text = GetChild(g, 0);
  ASSERT_EQ(GUMBO_NODE_TEXT, text->type);
  EXPECT_EQ(std::string("bar"), text->v.text.text);

  // body[1]: <p>baz</p> in the HTML namespace, a sibling of <svg> rather
  // than one of its children.
  GumboNode* p = GetChild(body, 1);
  ASSERT_EQ(GUMBO_NODE_ELEMENT, p->type);
  EXPECT_EQ(GUMBO_TAG_P, p->v.element.tag);
  EXPECT_EQ(GUMBO_NAMESPACE_HTML, p->v.element.tag_namespace);
  ASSERT_EQ(1, GetChildCount(p));

  text = GetChild(p, 0);
  ASSERT_EQ(GUMBO_NODE_TEXT, text->type);
  EXPECT_EQ(std::string("baz"), text->v.text.text);

  // body[2]: the <table>, whose only child is the <colgroup>.
  GumboNode* table = GetChild(body, 2);
  ASSERT_EQ(GUMBO_NODE_ELEMENT, table->type);
  EXPECT_EQ(GUMBO_TAG_TABLE, table->v.element.tag);
  EXPECT_EQ(GUMBO_NAMESPACE_HTML, table->v.element.tag_namespace);
  ASSERT_EQ(1, GetChildCount(table));

  // table[0]: <colgroup>, expected to be empty. Nothing indexes into it
  // afterwards, so a non-fatal check is sufficient here.
  GumboNode* colgroup = GetChild(table, 0);
  ASSERT_EQ(GUMBO_NODE_ELEMENT, colgroup->type);
  EXPECT_EQ(GUMBO_TAG_COLGROUP, colgroup->v.element.tag);
  EXPECT_EQ(GUMBO_NAMESPACE_HTML, colgroup->v.element.tag_namespace);
  EXPECT_EQ(0, GetChildCount(colgroup));

  // body[3]: <p>quux</p>, parsed after the </table> end tag.
  p = GetChild(body, 3);
  ASSERT_EQ(GUMBO_NODE_ELEMENT, p->type);
  EXPECT_EQ(GUMBO_TAG_P, p->v.element.tag);
  EXPECT_EQ(GUMBO_NAMESPACE_HTML, p->v.element.tag_namespace);
  ASSERT_EQ(1, GetChildCount(p));

  text = GetChild(p, 0);
  ASSERT_EQ(GUMBO_NODE_TEXT, text->type);
  EXPECT_EQ(std::string("quux"), text->v.text.text);
}

} // namespace
20 changes: 13 additions & 7 deletions test/html5/test_tree-construction.rb
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,14 @@ def parse_test(test_data)
node[:name] = $~[1]
node[:public_id] = $~[2].nil? || $~[2].empty? ? nil : $~[2]
node[:system_id] = $~[3].nil? || $~[3].empty? ? nil : $~[3]
elsif /^<!-- (.*) -->$/ =~ node_text
elsif node_text.start_with?('<!-- ')
loop do
break if lines[index].end_with?(' -->')
index += 1
node_text << "\n" + lines[index]
end
node[:type] = :comment
node[:contents] = $~[1]
node[:contents] = node_text[5..-5]
elsif /^<(svg |math )?(.+)>$/ =~ node_text
node[:type] = :element
node[:ns] = $~[1].nil? ? nil : $~[1].rstrip
Expand Down Expand Up @@ -154,7 +159,7 @@ def compare_nodes(node, ng_node)
assert_equal(attr[:value], value)
end
assert_equal(node[:children].length, ng_node.children.length,
"Element <#{node[:tag]}> has wrong number of children: #{ng_node.children.map { |c| c.name }}")
"Element <#{node[:tag]}> has wrong number of children #{ng_node.children.map { |c| c.name }} in #{@test[:data]}")
when Nokogiri::XML::Node::TEXT_NODE, Nokogiri::XML::Node::CDATA_SECTION_NODE
# We preserve the CDATA in the tree, but the tests represent it as text.
assert_equal(node[:type], :text)
Expand All @@ -167,7 +172,8 @@ def compare_nodes(node, ng_node)
assert_equal(node[:children].length, ng_node.children.length)
when Nokogiri::XML::Node::DOCUMENT_FRAG_NODE
assert_equal(node[:type], :fragment)
assert_equal(node[:children].length, ng_node.children.length)
assert_equal(node[:children].length, ng_node.children.length,
"Fragment node has wrong number of children #{ng_node.children.map { |c| c.name }} in #{@test[:data]}")
when Nokogiri::XML::Node::DTD_NODE
assert_equal(node[:type], :doctype)
assert_equal(node[:name], ng_node.name)
Expand Down Expand Up @@ -212,7 +218,7 @@ def run_test
end

# Test the errors.
assert_equal(@test[:errors].length, doc.errors.length)
assert_equal(@test[:errors].length, doc.errors.length, "Wrong number of errors for #{@test[:data]}")

# The new, standardized tokenizer errors live in @test[:new_errors]. Let's
# match each one to exactly one error in doc.errors. Unfortunately, the
Expand All @@ -224,7 +230,7 @@ def run_test
errors.reject! { |err| err[:code] == "generic-parser" }
error_regex = /^\((?<line>\d+):(?<column>\d+)(?:-\d+:\d+)?\) (?<code>.*)$/
@test[:new_errors].each do |err|
assert_match(error_regex, err)
assert_match(error_regex, err, "New error format does not match: #{mu_pp(err)}")
m = err.match(error_regex)
line = m[:line].to_i
column = m[:column].to_i
Expand All @@ -236,7 +242,7 @@ def run_test
end
# This error should be the first error in the list.
# refute_nil(idx, "Expected to find error #{code} at #{line}:#{column}")
assert_equal(0, idx, "Expected to find error #{code} at #{line}:#{column}")
assert_equal(0, idx, "Expected to find error #{code} at #{line}:#{column} in #{@test[:data]}")
errors.delete_at(idx)
end
end
Expand Down