Skip to content

Commit

Permalink
parsing numeric character references needs an update of nekohtml
Browse files Browse the repository at this point in the history
The new version of nekohtml brought a few regressions. This commit fixes all
of them except two that concern error warnings.

It avoids autocompleting the tbody tag around the tr tags of a table. The check
for unknown HTML elements changed upstream and was adjusted accordingly.

fixes #1113

Sponsored by Lookout Inc.
  • Loading branch information
mkristian committed Feb 21, 2015
1 parent 3a2542b commit 2f43a0c
Show file tree
Hide file tree
Showing 7 changed files with 47 additions and 7 deletions.
3 changes: 2 additions & 1 deletion ext/java/nokogiri/HtmlElementDescription.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,11 @@ protected static List<String> findSubElements(HTMLElements.Element elem) {
public static IRubyObject get(ThreadContext context,
IRubyObject klazz, IRubyObject name) {

HTMLElements.Element elem = HTMLElements.getElement(name.toString());
HTMLElements.Element elem = HTMLElements.getElement(name.asJavaString(), HTMLElements.NO_SUCH_ELEMENT);
if (elem == HTMLElements.NO_SUCH_ELEMENT)
return context.getRuntime().getNil();

elem = HTMLElements.getElement(name.toString());
HtmlElementDescription desc =
new HtmlElementDescription(context.getRuntime(), (RubyClass)klazz);
desc.element = elem;
Expand Down
19 changes: 19 additions & 0 deletions ext/java/nokogiri/NokogiriService.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import java.util.HashMap;
import java.util.Map;

import org.cyberneko.html.HTMLElements;
import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyClass;
Expand All @@ -53,6 +54,24 @@
* @author Yoko Harada <[email protected]>
*/
public class NokogiriService implements BasicLibraryService {

/**
 * Since version 1.9.13 nekohtml autocompletes a tbody element around the
 * tr tags of a table - http://sourceforge.net/p/nekohtml/code/241/
 * This monkey patch undoes that autocompletion by replacing the TR entry
 * in the element table with a new definition.
 */
static class MonkeyPatchHTMLElements extends HTMLElements {
    static void patchIt() {
        // 'T' - 'A' picks the ELEMENTS_ARRAY slot that is scanned for "TR"
        final Element[] candidates = ELEMENTS_ARRAY['T' - 'A'];
        int index = 0;
        while (index < candidates.length) {
            if (candidates[index].name.equals("TR")) {
                // swap the stock definition for the patched one, in place
                candidates[index] = new Element(TR, "TR", Element.BLOCK, TABLE,
                        new short[] {TD, TH, TR, COLGROUP, DIV});
            }
            index++;
        }
    }
}
// Apply the HTMLElements monkey patch once, when this class is initialized.
static {
MonkeyPatchHTMLElements.patchIt();
}

public static final String nokogiriClassCacheGvarName = "$NOKOGIRI_CLASS_CACHE";

public boolean basicLoad(Ruby ruby) {
Expand Down
8 changes: 5 additions & 3 deletions ext/java/nokogiri/internals/NokogiriErrorHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,10 @@ public List<IRubyObject> getErrorsReadyForRuby(ThreadContext context) {
return res;
}

protected boolean usesNekoHtml(String domain) {
if ("http://cyberneko.org/html".equals(domain)) return true;
else return false;
/**
 * Records an exception in the error list, except for the
 * "No character encoding indicator" notice, which is dropped.
 *
 * @param e the exception to record
 */
protected void add(Exception e){
    // this message might be bound to the nekohtml version 1.9.21
    // Compare constant-first: getMessage() may return null, and a null
    // message must not raise a NullPointerException here; such exceptions
    // are simply recorded.
    if (!"No character encoding indicator at beginning of document.".equals(e.getMessage())) {
        errors.add(e);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ public void fatalError(String domain, String key, XMLParseException e) {
* @param e Exception.
*/
public void warning(String domain, String key, XMLParseException e) {
//noop. NekoHtml adds too many warnings.
add(e);
}

}
3 changes: 1 addition & 2 deletions ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ public void fatalError(String domain, String key, XMLParseException e) throws XM

public void warning(String domain, String key, XMLParseException e) throws XMLParseException {
if (!nowarning) throw e;
if (!usesNekoHtml(domain)) errors.add(e);
else add(e);
}

}
Binary file modified lib/nekohtml.jar
Binary file not shown.
19 changes: 19 additions & 0 deletions test/html/test_node_encoding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,25 @@ def test_inner_html
contents = doc.at('h2').inner_html
assert_match hello, contents
end

# GH #1113: numeric character references (hex and decimal) and the literal
# character must serialize consistently for a given document encoding.
def test_encoding_GH_1113
  doc = Nokogiri::HTML::Document.new
  hex = '<p>&#x1f340;</p>'
  decimal = '<p>&#127808;</p>'
  encoded = '<p>🍀</p>'
  inputs = [hex, decimal, encoded]

  # UTF-8 output: every input form is expected to come back as the literal character.
  doc.encoding = 'UTF-8'
  inputs.each do |markup|
    assert_equal encoded, doc.fragment(markup).to_s
  end

  # US-ASCII output: a numeric reference is expected; JRuby yields the hex
  # form, other platforms the decimal form.
  doc.encoding = 'US-ASCII'
  expected = defined?(JRUBY_VERSION) ? hex : decimal
  inputs.each do |markup|
    assert_equal expected, doc.fragment(markup).to_s
  end
end
end
end
end
Expand Down

0 comments on commit 2f43a0c

Please sign in to comment.