Skip to content

Commit

Permalink
Added support for svg and math foreign elements (#2008)
Browse files Browse the repository at this point in the history
Added support for svg and math foreign elements

Includes namespace support in tags.
  • Loading branch information
jhy authored Oct 12, 2023
1 parent afc38d8 commit 6d48703
Show file tree
Hide file tree
Showing 18 changed files with 572 additions and 78 deletions.
6 changes: 5 additions & 1 deletion CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@ Release 1.16.2 [PENDING]
matching process by ensuring that simpler evaluations (such as a tag name match) are conducted prior to more
complex evaluations (such as an attribute regex, or a deep child scan with a :has).

* Improvement: added support for <svg> and <math> tags (and their children). This includes tag namespaces and case
preservation on applicable tags and attributes.
<https://github.com/jhy/jsoup/pull/2008>

* Improvement: when converting jsoup Documents to W3C Documents in W3CDom, HTML documents will be placed in the
`http://www.w3.org/1999/xhtml` namespace by default, per the HTML5 spec. This can be controlled by setting
`W3CDom#namespaceAware(false)`.
<https://github.com/jhy/jsoup/pull/1848>

* Improvement: speed optimized the Structural Evaluators by memoizing previous evaluations. Particularly the `~`
(any preceeding sibling) and `:nth-of-type` selectors are improved.
(any preceding sibling) and `:nth-of-type` selectors are improved.
<https://github.com/jhy/jsoup/issues/1956>

* Improvement: tweaked the performance of the Element nextElementSibling, previousElementSibling, firstElementSibling,
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/org/jsoup/helper/W3CDom.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.parser.HtmlTreeBuilder;
import org.jsoup.parser.Parser;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.jsoup.select.Selector;
Expand Down Expand Up @@ -339,9 +340,9 @@ public String asString(Document doc) {
* Implements the conversion by walking the input.
*/
protected static class W3CBuilder implements NodeVisitor {
// TODO: move the namespace handling stuff into XmlTreeBuilder / HtmlTreeBuilder, now that Tags have namespaces
private static final String xmlnsKey = "xmlns";
private static final String xmlnsPrefix = "xmlns:";
private static final String xhtmlNs = "http://www.w3.org/1999/xhtml";

private final Document doc;
private boolean namespaceAware = true;
Expand All @@ -358,7 +359,7 @@ public W3CBuilder(Document doc) {
final org.jsoup.nodes.Document inDoc = contextElement.ownerDocument();
if (namespaceAware && inDoc != null && inDoc.parser().getTreeBuilder() instanceof HtmlTreeBuilder) {
// as per the WHATWG HTML5 spec § 2.1.3, elements are in the HTML namespace by default
namespacesStack.peek().put("", xhtmlNs);
namespacesStack.peek().put("", Parser.NamespaceHtml);
}
}

Expand Down
21 changes: 16 additions & 5 deletions src/main/java/org/jsoup/nodes/Document.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,28 @@ public class Document extends Element {
private boolean updateMetaCharset = false;

/**
Create a new, empty Document.
Create a new, empty Document, in the specified namespace.
@param namespace the namespace of this Document's root node.
@param baseUri base URI of document
@see org.jsoup.Jsoup#parse
@see #createShell
*/
public Document(String baseUri) {
super(Tag.valueOf("#root", ParseSettings.htmlDefault), baseUri);
public Document(String namespace, String baseUri) {
super(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), baseUri);
this.location = baseUri;
this.parser = Parser.htmlParser(); // default, but overridable
}

/**
Create a new, empty Document, in the HTML namespace. This is a convenience overload: it delegates to the
two-argument constructor, passing {@link Parser#NamespaceHtml} as the namespace of the root node.
@param baseUri base URI of document
@see org.jsoup.Jsoup#parse
@see #Document(String namespace, String baseUri)
*/
public Document(String baseUri) {
this(Parser.NamespaceHtml, baseUri);
}

/**
Create a valid, empty shell of a document, suitable for adding more elements to.
@param baseUri baseUri of document
Expand Down Expand Up @@ -207,7 +218,7 @@ public void title(String title) {
@return new element
*/
public Element createElement(String tagName) {
return new Element(Tag.valueOf(tagName, ParseSettings.preserveCase), this.baseUri());
return new Element(Tag.valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri());
}

@Override
Expand Down Expand Up @@ -311,7 +322,7 @@ public Document clone() {

@Override
public Document shallowClone() {
Document clone = new Document(baseUri());
Document clone = new Document(this.tag().namespace(), baseUri());
if (attributes != null)
clone.attributes = attributes.clone();
clone.outputSettings = this.outputSettings.clone();
Expand Down
43 changes: 38 additions & 5 deletions src/main/java/org/jsoup/nodes/Element.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import org.jsoup.internal.NonnullByDefault;
import org.jsoup.internal.StringUtil;
import org.jsoup.parser.ParseSettings;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.select.Collector;
import org.jsoup.select.Elements;
Expand Down Expand Up @@ -51,11 +52,21 @@ public class Element extends Node {
@Nullable Attributes attributes; // field is nullable but all methods for attributes are non-null

/**
* Create a new, standalone element.
* Create a new, standalone element, in the specified namespace.
* @param tag tag name
* @param namespace namespace for this element
*/
public Element(String tag, String namespace) {
// preserveCase: the tag is resolved with case-preserving settings; the null second argument is the base URI (none supplied)
this(Tag.valueOf(tag, namespace, ParseSettings.preserveCase), null);
}

/**
* Create a new, standalone element, in the HTML namespace.
* @param tag tag name
* @see #Element(String tag, String namespace)
*/
public Element(String tag) {
this(Tag.valueOf(tag), "", null);
this(Tag.valueOf(tag, Parser.NamespaceHtml, ParseSettings.preserveCase), "", null);
}

/**
Expand Down Expand Up @@ -172,8 +183,22 @@ public String normalName() {
* @see Elements#tagName(String)
*/
public Element tagName(String tagName) {
// rename only: the element stays in the namespace of its current tag
return tagName(tagName, tag.namespace());
}

/**
* Change (rename) the tag of this element. For example, convert a {@code <span>} to a {@code <div>} with
* {@code el.tagName("div");}.
*
* @param tagName new tag name for this element
* @param namespace the new namespace for this element
* @return this element, for chaining
* @see Elements#tagName(String)
*/
public Element tagName(String tagName, String namespace) {
Validate.notEmptyParam(tagName, "tagName");
tag = Tag.valueOf(tagName, NodeUtils.parser(this).settings()); // maintains the case option of the original parse
Validate.notEmptyParam(namespace, "namespace");
tag = Tag.valueOf(tagName, namespace, NodeUtils.parser(this).settings()); // maintains the case option of the original parse
return this;
}

Expand Down Expand Up @@ -679,7 +704,11 @@ public Element insertChildren(int index, Node... children) {
* {@code parent.appendElement("h1").attr("id", "header").text("Welcome");}
*/
public Element appendElement(String tagName) {
Element child = new Element(Tag.valueOf(tagName, NodeUtils.parser(this).settings()), baseUri());
return appendElement(tagName, tag.namespace());
}

/**
 * Create a new element with the given tag name and namespace, and append it as a child of this element.
 * The tag is resolved using the settings of this element's parser, and the new element is given this
 * element's base URI.
 *
 * @param tagName the name of the tag to create
 * @param namespace the namespace for the new element's tag
 * @return the newly created child element, for chaining
 */
public Element appendElement(String tagName, String namespace) {
    final ParseSettings parseSettings = NodeUtils.parser(this).settings();
    final Tag childTag = Tag.valueOf(tagName, namespace, parseSettings);
    final Element created = new Element(childTag, baseUri());
    appendChild(created);
    return created;
}
Expand All @@ -692,7 +721,11 @@ public Element appendElement(String tagName) {
* {@code parent.prependElement("h1").attr("id", "header").text("Welcome");}
*/
public Element prependElement(String tagName) {
Element child = new Element(Tag.valueOf(tagName, NodeUtils.parser(this).settings()), baseUri());
return prependElement(tagName, tag.namespace());
}

/**
 * Create a new element with the given tag name and namespace, and prepend it as a child of this element.
 * The tag is resolved using the settings of this element's parser, and the new element is given this
 * element's base URI.
 *
 * @param tagName the name of the tag to create
 * @param namespace the namespace for the new element's tag
 * @return the newly created child element, for chaining
 */
public Element prependElement(String tagName, String namespace) {
    final ParseSettings parseSettings = NodeUtils.parser(this).settings();
    final Tag childTag = Tag.valueOf(tagName, namespace, parseSettings);
    final Element created = new Element(childTag, baseUri());
    prependChild(created);
    return created;
}
Expand Down
119 changes: 112 additions & 7 deletions src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Comment;
Expand All @@ -21,6 +22,7 @@

import static org.jsoup.internal.StringUtil.inSorted;
import static org.jsoup.parser.HtmlTreeBuilderState.Constants.InTableFoster;
import static org.jsoup.parser.HtmlTreeBuilderState.ForeignContent;

/**
* HTML Tree Builder; creates a DOM from Tokens.
Expand All @@ -42,6 +44,8 @@ public class HtmlTreeBuilder extends TreeBuilder {
"noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script",
"section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead",
"title", "tr", "ul", "wbr", "xmp"};
// Names of the MathML text integration point elements. Keep this array sorted: it is searched with
// StringUtil.inSorted (see isMathmlTextIntegration), which expects sorted input.
static final String[] TagMathMlTextIntegration = new String[]{"mi", "mn", "mo", "ms", "mtext"};
// Names of the SVG elements that are HTML integration points. Compared against Element.tagName() with
// StringUtil.in (see isHtmlIntegration), so the match is case-sensitive and "foreignObject" must keep its camel case.
static final String[] TagSvgHtmlIntegration = new String[]{"desc", "foreignObject", "title"};

public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages

Expand Down Expand Up @@ -165,7 +169,86 @@ List<Node> parseFragment(String inputFragment, @Nullable Element context, String
@Override
protected boolean process(Token token) {
currentToken = token;
return this.state.process(token, this);

if (shouldDispatchToCurrentInsertionMode(token)) {
return this.state.process(token, this);
} else {
return ForeignContent.process(token, this);
}
}

/**
 Decides whether a token is handled by the current insertion mode (returns true) or by the foreign content
 rules (returns false), following the tree construction dispatcher in the HTML parsing spec:
 https://html.spec.whatwg.org/multipage/parsing.html#tree-construction
 @param token the token about to be processed
 @return true if the current insertion mode should process the token
 */
boolean shouldDispatchToCurrentInsertionMode(Token token) {
    // No open elements: nothing foreign can be in play.
    if (stack.isEmpty())
        return true;

    final Element current = currentElement();
    final String namespace = current.tag().namespace();

    // The current node is an element in the HTML namespace.
    if (Parser.NamespaceHtml.equals(namespace))
        return true;

    // Classify the token once; start stays null unless the token is a start tag.
    final Token.StartTag start = token.isStartTag() ? token.asStartTag() : null;
    final boolean characterToken = token.isCharacter();

    // MathML text integration point: any start tag other than mglyph / malignmark, or a character token.
    if (isMathmlTextIntegration(current)) {
        final boolean dispatchableStart = start != null
            && !"mglyph".equals(start.normalName)
            && !"malignmark".equals(start.normalName);
        if (dispatchableStart || characterToken)
            return true;
    }

    // MathML annotation-xml element receiving an <svg> start tag.
    final boolean svgIntoAnnotationXml = Parser.NamespaceMathml.equals(namespace)
        && current.normalName().equals("annotation-xml")
        && start != null
        && "svg".equals(start.normalName);
    if (svgIntoAnnotationXml)
        return true;

    // HTML integration point: any start tag, or a character token.
    if (isHtmlIntegration(current) && (start != null || characterToken))
        return true;

    // Otherwise only an end-of-file token goes to the current insertion mode.
    return token.isEOF();
}

/**
 Tests if the element is a MathML text integration point: an element in the MathML namespace named
 mi, mo, mn, ms, or mtext.
 @param el the element to test
 @return true if the element is a MathML text integration point
 */
boolean isMathmlTextIntegration(Element el) {
    final String namespace = el.tag().namespace();
    if (!Parser.NamespaceMathml.equals(namespace))
        return false;

    return StringUtil.inSorted(el.normalName(), TagMathMlTextIntegration);
}

/**
 Tests if the element is an HTML integration point. That is one of:
 <ul>
 <li>a MathML annotation-xml element whose "encoding" attribute, once normalized, is "text/html" or
 "application/xhtml+xml"</li>
 <li>an SVG foreignObject, desc, or title element</li>
 </ul>
 @param el the element to test
 @return true if the element is an HTML integration point
 */
boolean isHtmlIntegration(Element el) {
    final String namespace = el.tag().namespace();

    final boolean mathmlAnnotationXml = Parser.NamespaceMathml.equals(namespace)
        && el.normalName().equals("annotation-xml");
    if (mathmlAnnotationXml) {
        final String encoding = Normalizer.normalize(el.attr("encoding"));
        switch (encoding) {
            case "text/html":
            case "application/xhtml+xml":
                return true;
            default:
                break; // some other encoding: fall through to the SVG check below
        }
    }

    // tagName() rather than normalName(): foreignObject must be matched case-sensitively
    return Parser.NamespaceSvg.equals(namespace)
        && StringUtil.in(el.tagName(), TagSvgHtmlIntegration);
}

boolean process(Token token, HtmlTreeBuilderState state) {
Expand Down Expand Up @@ -245,6 +328,23 @@ Element insert(final Token.StartTag startTag) {
return el;
}

/**
 Inserts a foreign element for the supplied start tag, in the given namespace. The tag name and the attributes
 are processed with {@link ParseSettings#preserveCase}, so their case is retained rather than normalized.
 @param startTag the start tag token to create the element from
 @param namespace the namespace of the new element's tag
 @return the inserted element
 */
Element insertForeign(final Token.StartTag startTag, String namespace) {
    dedupeAttributes(startTag);

    final ParseSettings keepCase = ParseSettings.preserveCase;
    final Tag foreignTag = tagFor(startTag.name(), namespace, keepCase);
    final Element inserted = new Element(foreignTag, null, keepCase.normalizeAttributes(startTag.attributes));
    insert(inserted, startTag);

    if (!startTag.isSelfClosing())
        return inserted;

    foreignTag.setSelfClosing(); // remember this is self-closing for output
    pop(); // a self-closing element takes no children, so it does not stay current
    return inserted;
}

Element insertStartTag(String startTagName) {
Element el = new Element(tagFor(startTagName, settings), null);
insert(el);
Expand Down Expand Up @@ -272,7 +372,7 @@ Element insertEmpty(Token.StartTag startTag) {
if (!tag.isEmpty())
tokeniser.error("Tag [%s] cannot be self closing; not a void tag", tag.normalName());
}
else // unknown tag, remember this is self closing for output
else // unknown tag, remember this is self-closing for output
tag.setSelfClosing();
}
return el;
Expand Down Expand Up @@ -306,6 +406,7 @@ void insert(Token.Character characterToken) {
insert(characterToken, el);
}

/** Inserts the provided character token into the provided element. */
void insert(Token.Character characterToken, Element el) {
final Node node;
final String tagName = el.normalName();
Expand All @@ -321,7 +422,7 @@ else if (isContentForTagData(tagName))
onNodeInserted(node, characterToken);
}

/** Inserts the provided character token into the provided element. Use when not going onto stack element */
/** Inserts the provided Node into the current element. */
private void insertNode(Node node, @Nullable Token token) {
// if the stack hasn't been set up yet, elements (doctype, comments) go into the doc
if (stack.isEmpty())
Expand All @@ -331,10 +432,14 @@ else if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(),
else
currentElement().appendChild(node);

// connect form controls to their form element
if (node instanceof Element && ((Element) node).tag().isFormListed()) {
if (formElement != null)
formElement.addElement((Element) node);
if (node instanceof Element) {
Element el = (Element) node;
if (el.tag().isFormListed() && formElement != null)
formElement.addElement(el); // connect form controls to their form element

// in HTML, the xmlns attribute if set must match what the parser set the tag's namespace to
if (el.hasAttr("xmlns") && !el.attr("xmlns").equals(el.tag().namespace()))
error("Invalid xmlns attribute [%s] on tag [%s]", el.attr("xmlns"), el.tagName());
}
onNodeInserted(node, token);
}
Expand Down
Loading

0 comments on commit 6d48703

Please sign in to comment.