From 043acfe4fbb3d08d50df4abf1e08ac62d55c7a89 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 7 Apr 2019 11:31:15 -0400 Subject: [PATCH 01/11] fix test from #1124 to use a StringIO object because implementing read like this can result in nondeterministic behavior. see related #1821 and #1888. --- test/xml/test_document.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/xml/test_document.rb b/test/xml/test_document.rb index fe81b048527..9b5cb2ab1a9 100644 --- a/test/xml/test_document.rb +++ b/test/xml/test_document.rb @@ -707,8 +707,12 @@ def test_parsing_empty_io def test_parse_works_with_an_object_that_responds_to_read klass = Class.new do - def read *args - "
foo
" + def initialize + @contents = StringIO.new("
foo
") + end + + def read(*args) + @contents.read(*args) end end From dd0a03e4ab5fe53d8db87cb20d96a4ad8a37b2c0 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sun, 7 Apr 2019 11:33:16 -0400 Subject: [PATCH 02/11] add failing test to reproduce #1888 related to incomplete application of fix from #1124 --- test/html/test_document.rb | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/html/test_document.rb b/test/html/test_document.rb index 62b3fc6b1f3..2c21f1203cc 100644 --- a/test/html/test_document.rb +++ b/test/html/test_document.rb @@ -355,6 +355,21 @@ def test_parse_io } end + def test_parse_works_with_an_object_that_responds_to_read + klass = Class.new do + def initialize + @contents = StringIO.new("
foo
") + end + + def read(*args) + @contents.read(*args) + end + end + + doc = Nokogiri::HTML.parse klass.new + doc.at_css("div").content.must_equal("foo") + end + def test_parse_temp_file temp_html_file = Tempfile.new("TEMP_HTML_FILE") File.open(HTML_FILE, 'rb') { |f| temp_html_file.write f.read } From 203d7c41efe5f1ce8089ef4dfaeab19154893e67 Mon Sep 17 00:00:00 2001 From: John Shahid Date: Tue, 16 Apr 2019 19:46:33 -0400 Subject: [PATCH 03/11] Remove unused new_from_str method --- ext/java/nokogiri/XmlNode.java | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/ext/java/nokogiri/XmlNode.java b/ext/java/nokogiri/XmlNode.java index d94921fc4e0..ad29d2c443d 100644 --- a/ext/java/nokogiri/XmlNode.java +++ b/ext/java/nokogiri/XmlNode.java @@ -64,8 +64,8 @@ import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; import org.jruby.exceptions.RaiseException; -import org.jruby.runtime.Helpers; import org.jruby.runtime.Block; +import org.jruby.runtime.Helpers; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.Visibility; import org.jruby.runtime.builtin.IRubyObject; @@ -1316,14 +1316,6 @@ public IRubyObject previous_sibling(ThreadContext context) { return getCachedNodeOrCreate(context.getRuntime(), node.getPreviousSibling()); } - @JRubyMethod(meta = true, rest = true) - public static IRubyObject new_from_str(ThreadContext context, - IRubyObject cls, - IRubyObject[] args) { - XmlDocument doc = (XmlDocument) XmlDocument.read_memory(context, args); - return doc.root(context); - } - @JRubyMethod(name = {"node_name", "name"}) public IRubyObject node_name(ThreadContext context) { return getNodeName(context); From b327c87a9a00c2120dbb4a48c3bd01d7473be789 Mon Sep 17 00:00:00 2001 From: John Shahid Date: Tue, 16 Apr 2019 20:26:42 -0400 Subject: [PATCH 04/11] Wrap objects that respond_to?("read") in an IOInputStream Fixes #1888 --- ext/java/nokogiri/internals/ParserContext.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ext/java/nokogiri/internals/ParserContext.java b/ext/java/nokogiri/internals/ParserContext.java index eb97c65fb01..ce9e07ed455 100644 --- a/ext/java/nokogiri/internals/ParserContext.java +++ b/ext/java/nokogiri/internals/ParserContext.java @@ -53,6 +53,7 @@ import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; import org.jruby.util.ByteList; +import org.jruby.util.IOInputStream; import org.jruby.util.TypeConverter; import org.xml.sax.InputSource; @@ -107,7 +108,7 @@ public void setInputSource(ThreadContext context, IRubyObject data, IRubyObject source.setByteStream(new UncloseableInputStream(io.getInStream())); } else if (invoke(context, data, "respond_to?", ruby.newSymbol("read")).isTrue()) { - stringData = invoke(context, data, "read").convertToString(); + source.setByteStream(new UncloseableInputStream(new IOInputStream(data))); } else if (invoke(context, data, "respond_to?", ruby.newSymbol("string")).isTrue()) { stringData = invoke(context, data, "string").convertToString(); From f3325c10f102c07592e9d30f6f42ebbaf60c8787 Mon Sep 17 00:00:00 2001 From: John Shahid Date: Tue, 16 Apr 2019 20:28:44 -0400 Subject: [PATCH 05/11] Use the encoding that was passed to read_memory and read_io We don't have to figure out the encoding again. This was already figured out in the Ruby code. fixes #1888 --- .../NokogiriEncodingReaderWrapper.java | 107 ------------------ .../nokogiri/internals/ParserContext.java | 28 ++--- .../internals/XmlDomParserContext.java | 11 +- 3 files changed, 12 insertions(+), 134 deletions(-) delete mode 100644 ext/java/nokogiri/internals/NokogiriEncodingReaderWrapper.java diff --git a/ext/java/nokogiri/internals/NokogiriEncodingReaderWrapper.java b/ext/java/nokogiri/internals/NokogiriEncodingReaderWrapper.java deleted file mode 100644 index 10f15519b2f..00000000000 --- a/ext/java/nokogiri/internals/NokogiriEncodingReaderWrapper.java +++ /dev/null @@ -1,107 +0,0 @@ -package nokogiri.internals; - -import java.io.InputStream; - -import org.jruby.Ruby; -import org.jruby.RubyObject; -import org.jruby.exceptions.RaiseException; -import org.jruby.runtime.Helpers; -import org.jruby.runtime.ThreadContext; -import org.jruby.runtime.builtin.IRubyObject; -import org.jruby.util.ByteList; - -/** - * This class wraps the EncodingReader which act like a rewinding input stream, - * it tries to read the first 1K of data to detect the encoding, but save - * this data in a buffer for the subsequent read. Unfortunately, the EncodingReader - * will behave as expected only if encoding was detected, otherwise, the read data - * won't be stored and EncodingReader will fallback to read directory from the io stream. - * this is kind of lame, since we need to have similar logic in both layers. The alternative - * is to implement the encoding detection similar to the way C-Nokogiri does it; it starts - * parsing assuming encoding is unknown and if encoding is detected it will throw an exception - * causing parsing to stop, in which case we have to intercept the exception and set the encoding. - * Also in this case we don't have to restart the parsing since html/document.rb does that for us. - * - * @author John Shahid - * - */ -public class NokogiriEncodingReaderWrapper extends InputStream { - private final ThreadContext context; - private final IRubyObject encodingReader; - private final Ruby ruby; - private IRubyObject detectedEncoding; - private final byte[] firstChunk = new byte[1024]; - private int firstChunkOff = 0; - private int firstChunkLength = 0; - - public NokogiriEncodingReaderWrapper(ThreadContext context, RubyObject encodingReader) { - this.context = context; - this.encodingReader = encodingReader; - this.ruby = context.getRuntime(); - - if (!Helpers.invoke(context, encodingReader, "respond_to?", ruby.newSymbol("read")).isTrue() - || encodingReader.getInstanceVariable("@io") == null) { - throw ruby.newArgumentError("Argument doesn't respond to read or doesn't have instance variable @io"); - } - } - - public boolean detectEncoding() { - try { - firstChunkLength = read(firstChunk); - } catch (RaiseException e) { - detectedEncoding = e.getException().getInstanceVariable("@found_encoding"); - return true; - } - detectedEncoding = context.nil; - return false; - } - - public IRubyObject getEncoding() { - return detectedEncoding; - } - - @Override - public int read(byte b[]) { - return read(b, 0, b.length); - } - - @Override - public int read(byte b[], int off, int len) { - if (b == null) { - throw new NullPointerException(); - } else if (off < 0 || len < 0 || len > b.length - off) { - throw new IndexOutOfBoundsException(); - } else if (len == 0) { - return 0; - } - - int copyLength = Math.min(firstChunkLength - firstChunkOff, len); - if (copyLength > 0) { - System.arraycopy(firstChunk, firstChunkOff, b, off, copyLength); - len -= copyLength; - firstChunkOff += copyLength; - } - - if (len <= 0) - return copyLength; - - IRubyObject returnValue = encodingReader.callMethod(context, "read", ruby.newFixnum(len)); - if (returnValue.isNil()) - return -1; - - ByteList bytes = returnValue.asString().getByteList(); - int length = bytes.length(); - System.arraycopy(bytes.unsafeBytes(), bytes.getBegin(), b, off + copyLength, length); - return length + copyLength; - } - - @Override - public int read() { - byte[] bytes = new byte[1]; - int count = read(bytes, 0, 1); - if (count < 1) - return count; - return bytes[0]; - } - -} diff --git a/ext/java/nokogiri/internals/ParserContext.java b/ext/java/nokogiri/internals/ParserContext.java index ce9e07ed455..6b6422a895c 100644 --- a/ext/java/nokogiri/internals/ParserContext.java +++ b/ext/java/nokogiri/internals/ParserContext.java @@ -68,6 +68,7 @@ public abstract class ParserContext extends RubyObject { protected InputSource source = null; protected IRubyObject detected_encoding = null; protected int stringDataSize = -1; + protected String java_encoding; public ParserContext(Ruby runtime) { // default to class 'Object' because this class isn't exposed to Ruby @@ -93,11 +94,6 @@ public void setInputSource(ThreadContext context, IRubyObject data, IRubyObject ParserContext.setUrl(context, source, url); - // if setEncoding returned true, then the stream is set - // to the EncodingReaderInputStream - if (setEncoding(context, data)) - return; - RubyString stringData = null; if (invoke(context, data, "respond_to?", ruby.newSymbol("to_io")).isTrue()) { RubyIO io = @@ -106,9 +102,15 @@ public void setInputSource(ThreadContext context, IRubyObject data, IRubyObject "to_io"); // use unclosedable input stream to fix #495 source.setByteStream(new UncloseableInputStream(io.getInStream())); + if (java_encoding != null) { + source.setEncoding(java_encoding); + } } else if (invoke(context, data, "respond_to?", ruby.newSymbol("read")).isTrue()) { source.setByteStream(new UncloseableInputStream(new IOInputStream(data))); + if (java_encoding != null) { + source.setEncoding(java_encoding); + } } else if (invoke(context, data, "respond_to?", ruby.newSymbol("string")).isTrue()) { stringData = invoke(context, data, "string").convertToString(); @@ -172,22 +174,6 @@ public static void setUrl(ThreadContext context, InputSource source, IRubyObject } } - private boolean setEncoding(ThreadContext context, IRubyObject data) { - if (data.getType().respondsTo("detect_encoding")) { - // in case of EncodingReader is used - // since EncodingReader won't respond to :to_io - NokogiriEncodingReaderWrapper reader = new NokogiriEncodingReaderWrapper(context, (RubyObject) data); - source.setByteStream(reader); - // data is EnocodingReader - if(reader.detectEncoding()) { - detected_encoding = reader.getEncoding(); - source.setEncoding(detected_encoding.asJavaString()); - } - return true; - } - return false; - } - protected void setEncoding(String encoding) { source.setEncoding(encoding); } diff --git a/ext/java/nokogiri/internals/XmlDomParserContext.java b/ext/java/nokogiri/internals/XmlDomParserContext.java index 89af2bc1629..8264c18ab69 100644 --- a/ext/java/nokogiri/internals/XmlDomParserContext.java +++ b/ext/java/nokogiri/internals/XmlDomParserContext.java @@ -39,11 +39,6 @@ import java.util.ArrayList; import java.util.List; -import nokogiri.NokogiriService; -import nokogiri.XmlDocument; -import nokogiri.XmlDtd; -import nokogiri.XmlSyntaxError; - import org.apache.xerces.parsers.DOMParser; import org.jruby.Ruby; import org.jruby.RubyArray; @@ -57,6 +52,11 @@ import org.w3c.dom.NodeList; import org.xml.sax.SAXException; +import nokogiri.NokogiriService; +import nokogiri.XmlDocument; +import nokogiri.XmlDtd; +import nokogiri.XmlSyntaxError; + /** * Parser class for XML DOM processing. This class actually parses XML document * and creates DOM tree in Java side. However, DOM tree in Ruby side is not since @@ -84,7 +84,6 @@ public class XmlDomParserContext extends ParserContext { protected ParserContext.Options options; protected DOMParser parser; protected NokogiriErrorHandler errorHandler; - protected String java_encoding; protected IRubyObject ruby_encoding; public XmlDomParserContext(Ruby runtime, IRubyObject options) { From 8c8a952f0dba15cf6193d481a77dbcdfc56c4feb Mon Sep 17 00:00:00 2001 From: John Shahid Date: Tue, 16 Apr 2019 20:58:08 -0400 Subject: [PATCH 06/11] Remove unused code path We know data can only be a String or IO like object. --- ext/java/nokogiri/internals/ParserContext.java | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/ext/java/nokogiri/internals/ParserContext.java b/ext/java/nokogiri/internals/ParserContext.java index 6b6422a895c..f038495344f 100644 --- a/ext/java/nokogiri/internals/ParserContext.java +++ b/ext/java/nokogiri/internals/ParserContext.java @@ -47,14 +47,12 @@ import org.jruby.Ruby; import org.jruby.RubyClass; -import org.jruby.RubyIO; import org.jruby.RubyObject; import org.jruby.RubyString; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; import org.jruby.util.ByteList; import org.jruby.util.IOInputStream; -import org.jruby.util.TypeConverter; import org.xml.sax.InputSource; /** @@ -95,26 +93,12 @@ public void setInputSource(ThreadContext context, IRubyObject data, IRubyObject ParserContext.setUrl(context, source, url); RubyString stringData = null; - if (invoke(context, data, "respond_to?", ruby.newSymbol("to_io")).isTrue()) { - RubyIO io = - (RubyIO) TypeConverter.convertToType(data, - ruby.getIO(), - "to_io"); - // use unclosedable input stream to fix #495 - source.setByteStream(new UncloseableInputStream(io.getInStream())); - if (java_encoding != null) { - source.setEncoding(java_encoding); - } - - } else if (invoke(context, data, "respond_to?", ruby.newSymbol("read")).isTrue()) { + if (invoke(context, data, "respond_to?", ruby.newSymbol("read")).isTrue()) { source.setByteStream(new UncloseableInputStream(new IOInputStream(data))); if (java_encoding != null) { source.setEncoding(java_encoding); } - } else if (invoke(context, data, "respond_to?", ruby.newSymbol("string")).isTrue()) { - stringData = invoke(context, data, "string").convertToString(); - } else if (data instanceof RubyString) { stringData = (RubyString) data; From 76e3c7efa0fe7dc49d38a85a1a3f96605d5fe91e Mon Sep 17 00:00:00 2001 From: John Shahid Date: Wed, 17 Apr 2019 07:06:29 -0400 Subject: [PATCH 07/11] Remove unecessary UncloseableInputStream The tests are passing without it. --- ext/java/nokogiri/XmlReader.java | 21 ++-- .../nokogiri/internals/ParserContext.java | 2 +- .../internals/UncloseableInputStream.java | 102 ------------------ 3 files changed, 11 insertions(+), 114 deletions(-) delete mode 100644 ext/java/nokogiri/internals/UncloseableInputStream.java diff --git a/ext/java/nokogiri/XmlReader.java b/ext/java/nokogiri/XmlReader.java index 7b93b62e569..1c42a641295 100644 --- a/ext/java/nokogiri/XmlReader.java +++ b/ext/java/nokogiri/XmlReader.java @@ -41,15 +41,6 @@ import java.util.List; import java.util.Stack; -import nokogiri.internals.NokogiriEntityResolver; -import nokogiri.internals.ParserContext; -import nokogiri.internals.ParserContext.Options; -import nokogiri.internals.ReaderNode; -import nokogiri.internals.ReaderNode.ClosingNode; -import nokogiri.internals.ReaderNode.ElementNode; -import nokogiri.internals.ReaderNode.TextNode; -import nokogiri.internals.UncloseableInputStream; - import org.apache.xerces.impl.Constants; import org.apache.xerces.impl.xs.opti.DefaultXMLDocumentHandler; import org.apache.xerces.parsers.StandardParserConfiguration; @@ -81,6 +72,14 @@ import org.jruby.util.IOInputStream; import org.xml.sax.InputSource; +import nokogiri.internals.NokogiriEntityResolver; +import nokogiri.internals.ParserContext; +import nokogiri.internals.ParserContext.Options; +import nokogiri.internals.ReaderNode; +import nokogiri.internals.ReaderNode.ClosingNode; +import nokogiri.internals.ReaderNode.ElementNode; +import nokogiri.internals.ReaderNode.TextNode; + /** * Class for Nokogiri:XML::Reader * @@ -217,7 +216,7 @@ public static IRubyObject from_io(ThreadContext context, IRubyObject cls, IRubyO options = new ParserContext.Options(2048 | 1); } - InputStream in = new UncloseableInputStream(new IOInputStream(args[0])); + InputStream in = new IOInputStream(args[0]); reader.setInput(context, in, url, options); return reader; } @@ -245,7 +244,7 @@ public static IRubyObject from_memory(ThreadContext context, IRubyObject cls, IR options = new ParserContext.Options(2048 | 1); } IRubyObject stringIO = runtime.getClass("StringIO").newInstance(context, args[0], Block.NULL_BLOCK); - InputStream in = new UncloseableInputStream(new IOInputStream(stringIO)); + InputStream in = new IOInputStream(stringIO); reader.setInput(context, in, url, options); return reader; } diff --git a/ext/java/nokogiri/internals/ParserContext.java b/ext/java/nokogiri/internals/ParserContext.java index f038495344f..4529e7f0249 100644 --- a/ext/java/nokogiri/internals/ParserContext.java +++ b/ext/java/nokogiri/internals/ParserContext.java @@ -94,7 +94,7 @@ public void setInputSource(ThreadContext context, IRubyObject data, IRubyObject RubyString stringData = null; if (invoke(context, data, "respond_to?", ruby.newSymbol("read")).isTrue()) { - source.setByteStream(new UncloseableInputStream(new IOInputStream(data))); + source.setByteStream(new IOInputStream(data)); if (java_encoding != null) { source.setEncoding(java_encoding); } diff --git a/ext/java/nokogiri/internals/UncloseableInputStream.java b/ext/java/nokogiri/internals/UncloseableInputStream.java deleted file mode 100644 index e285e595dfa..00000000000 --- a/ext/java/nokogiri/internals/UncloseableInputStream.java +++ /dev/null @@ -1,102 +0,0 @@ -/** - * (The MIT License) - * - * Copyright (c) 2008 - 2012: - * - * * {Aaron Patterson}[http://tenderlovemaking.com] - * * {Mike Dalessio}[http://mike.daless.io] - * * {Charles Nutter}[http://blog.headius.com] - * * {Sergio Arbeo}[http://www.serabe.com] - * * {Patrick Mahoney}[http://polycrystal.org] - * * {Yoko Harada}[http://yokolet.blogspot.com] - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * 'Software'), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package nokogiri.internals; - -import java.io.IOException; -import java.io.InputStream; - -/** - * Delegates all the methods to another InputStream except the - * close() method, which is ignored. This is used to fix #495. - * - * @author John Shahid - */ -public class UncloseableInputStream extends InputStream { - private final InputStream delegate; - - /** - * Create a new uncloseable stream. - * - * @param delegate The InputStream to which all methods (except close) - * will be delegated. - */ - public UncloseableInputStream(InputStream delegate) { - this.delegate = delegate; - } - - @Override - public int read() throws IOException { - return delegate.read(); - } - - @Override - public int read(byte []b) throws IOException { - return delegate.read(b); - } - - @Override - public int read(byte []b, int offset, int len) throws IOException { - return delegate.read(b, offset, len); - } - - @Override - public long skip(long n) throws IOException { - return delegate.skip(n); - } - - @Override - public int available() throws IOException { - return delegate.available(); - } - - @Override - public void close() { - // don't forward this to the InputStream we're delegating from - // we don't want the InputStream of the RubyIO to be closed - } - - @Override - public void mark(int readlimit) { - delegate.mark(readlimit); - } - - @Override - public void reset() throws IOException { - delegate.reset(); - } - - @Override - public boolean markSupported() { - return delegate.markSupported(); - } -} From ba16682aae2cbd42c196bd8afdfcfe8a5d82fbdb Mon Sep 17 00:00:00 2001 From: John Shahid Date: Thu, 18 Apr 2019 13:58:18 -0400 Subject: [PATCH 08/11] Split setInputSource into setIOInputSource and setStringInputSource This commit also consolidates some of the encoding handling logic that was repeated in multiple places. --- ext/java/nokogiri/HtmlDocument.java | 25 +++-- ext/java/nokogiri/HtmlSaxParserContext.java | 100 +++++------------- ext/java/nokogiri/XmlDocument.java | 31 ++---- ext/java/nokogiri/XmlSaxParserContext.java | 14 +-- .../nokogiri/internals/NokogiriHelpers.java | 12 +++ .../nokogiri/internals/ParserContext.java | 67 +++++------- 6 files changed, 91 insertions(+), 158 deletions(-) diff --git a/ext/java/nokogiri/HtmlDocument.java b/ext/java/nokogiri/HtmlDocument.java index ff5245434fe..3d66ff68730 100644 --- a/ext/java/nokogiri/HtmlDocument.java +++ b/ext/java/nokogiri/HtmlDocument.java @@ -108,17 +108,6 @@ public IRubyObject getInternalSubset(ThreadContext context) { return internalSubset; } - - public static IRubyObject do_parse(ThreadContext context, - IRubyObject klass, - IRubyObject[] args) { - Ruby ruby = context.getRuntime(); - Arity.checkArgumentCount(ruby, args, 4, 4); - HtmlDomParserContext ctx = - new HtmlDomParserContext(ruby, args[2], args[3]); - ctx.setInputSource(context, args[0], args[1]); - return ctx.parse(context, klass, args[1]); - } public void setDocumentNode(ThreadContext context, Node node) { super.setNode(context, node); @@ -171,7 +160,12 @@ public String getPraedEncoding() { public static IRubyObject read_io(ThreadContext context, IRubyObject cls, IRubyObject[] args) { - return do_parse(context, cls, args); + Ruby ruby = context.getRuntime(); + Arity.checkArgumentCount(ruby, args, 4, 4); + HtmlDomParserContext ctx = + new HtmlDomParserContext(ruby, args[2], args[3]); + ctx.setIOInputSource(context, args[0], args[1]); + return ctx.parse(context, cls, args[1]); } /* @@ -185,6 +179,11 @@ public static IRubyObject read_io(ThreadContext context, public static IRubyObject read_memory(ThreadContext context, IRubyObject cls, IRubyObject[] args) { - return do_parse(context, cls, args); + Ruby ruby = context.getRuntime(); + Arity.checkArgumentCount(ruby, args, 4, 4); + HtmlDomParserContext ctx = + new HtmlDomParserContext(ruby, args[2], args[3]); + ctx.setStringInputSource(context, args[0], args[1]); + return ctx.parse(context, cls, args[1]); } } diff --git a/ext/java/nokogiri/HtmlSaxParserContext.java b/ext/java/nokogiri/HtmlSaxParserContext.java index 2fa4ee71c5c..c64e080e046 100644 --- a/ext/java/nokogiri/HtmlSaxParserContext.java +++ b/ext/java/nokogiri/HtmlSaxParserContext.java @@ -32,28 +32,22 @@ package nokogiri; -import static nokogiri.internals.NokogiriHelpers.rubyStringToString; - -import java.io.ByteArrayInputStream; import java.io.InputStream; -import java.nio.charset.Charset; -import java.nio.charset.IllegalCharsetNameException; -import java.nio.charset.UnsupportedCharsetException; -import java.util.EnumSet; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import nokogiri.internals.NokogiriHandler; import org.apache.xerces.parsers.AbstractSAXParser; import org.cyberneko.html.parsers.SAXParser; -import org.jruby.*; +import org.jruby.Ruby; +import org.jruby.RubyClass; +import org.jruby.RubyFixnum; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; import org.xml.sax.SAXException; +import nokogiri.internals.NokogiriHandler; +import nokogiri.internals.NokogiriHelpers; + /** * Class for Nokogiri::HTML::SAX::ParserContext. * @@ -78,6 +72,11 @@ protected AbstractSAXParser createParser() throws SAXException { "http://cyberneko.org/html/properties/names/elems", "lower"); parser.setProperty( "http://cyberneko.org/html/properties/names/attrs", "lower"); + + // NekoHTML should not try to guess the encoding based on the meta + // tags or other information in the document. This is already + // handled by the EncodingReader. + parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true); return parser; } catch(SAXException ex) { throw new SAXException( @@ -92,16 +91,11 @@ public static IRubyObject parse_memory(ThreadContext context, IRubyObject encoding) { HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz); ctx.initialize(context.getRuntime()); - String javaEncoding = findEncoding(context, encoding); - if (javaEncoding != null) { - String input = applyEncoding(rubyStringToString(data), javaEncoding); - ByteArrayInputStream istream = new ByteArrayInputStream(input.getBytes()); - ctx.setInputSource(istream); - ctx.getInputSource().setEncoding(javaEncoding); - } + ctx.java_encoding = NokogiriHelpers.getValidEncodingOrNull(context.runtime, encoding); + ctx.setStringInputSource(context, data, context.nil); return ctx; } - + public enum EncodingType { NONE(0, "NONE"), UTF_8(1, "UTF-8"), @@ -152,44 +146,15 @@ private static String findName(final int value) { } private static String findEncoding(ThreadContext context, IRubyObject encoding) { - String rubyEncoding = null; - if (encoding instanceof RubyString) { - rubyEncoding = rubyStringToString(encoding); - } - else if (encoding instanceof RubyFixnum) { + // HTML::Sax::Parser leaks a libxml implementation detail and passes an + // Encoding integer to parse_io. We have to reverse map the integer + // into a name. + if (encoding instanceof RubyFixnum) { int value = RubyFixnum.fix2int((RubyFixnum) encoding); - rubyEncoding = findName(value); - } - if (rubyEncoding == null) return null; - try { - return Charset.forName(rubyEncoding).displayName(); - } - catch (UnsupportedCharsetException e) { - throw context.getRuntime().newEncodingCompatibilityError(rubyEncoding + "is not supported"); + return findName(value); } - catch (IllegalCharsetNameException e) { - throw context.getRuntime().newInvalidEncoding(e.getMessage()); - } - } - private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+"); - - private static String applyEncoding(String input, String enc) { - String str = input.toLowerCase(); - int start_pos = 0; - int end_pos = 0; - if (input.contains("meta") && input.contains("charset")) { - Matcher m = CHARSET_PATTERN.matcher(str); - while (m.find()) { - start_pos = m.start(); - end_pos = m.end(); - } - } - if (start_pos != end_pos) { - String substr = input.substring(start_pos, end_pos); - input = input.replace(substr, "charset=" + enc); - } - return input; + return NokogiriHelpers.getValidEncodingOrNull(context.runtime, encoding); } @JRubyMethod(name="file", meta=true) @@ -199,11 +164,8 @@ public static IRubyObject parse_file(ThreadContext context, IRubyObject encoding) { HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz); ctx.initialize(context.getRuntime()); + ctx.java_encoding = NokogiriHelpers.getValidEncodingOrNull(context.runtime, encoding); ctx.setInputSourceFile(context, data); - String javaEncoding = findEncoding(context, encoding); - if (javaEncoding != null) { - ctx.getInputSource().setEncoding(javaEncoding); - } return ctx; } @@ -214,11 +176,8 @@ public static IRubyObject parse_io(ThreadContext context, IRubyObject encoding) { HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz); ctx.initialize(context.getRuntime()); - ctx.setInputSource(context, data, context.getRuntime().getNil()); - String javaEncoding = findEncoding(context, encoding); - if (javaEncoding != null) { - ctx.getInputSource().setEncoding(javaEncoding); - } + ctx.java_encoding = findEncoding(context, encoding); + ctx.setIOInputSource(context, data, context.getRuntime().getNil()); return ctx; } @@ -235,18 +194,7 @@ static HtmlSaxParserContext parse_stream(final Ruby runtime, RubyClass klazz, In @Override protected void preParse(final Ruby runtime, IRubyObject handlerRuby, NokogiriHandler handler) { - // final String path = "Nokogiri::XML::FragmentHandler"; - // final String docFrag = - // "http://cyberneko.org/html/features/balance-tags/document-fragment"; - // RubyObjectAdapter adapter = JavaEmbedUtils.newObjectAdapter(); - // IRubyObject doc = adapter.getInstanceVariable(handlerRuby, "@document"); - // RubyModule mod = runtime.getClassFromPath(path); - // try { - // if (doc != null && !doc.isNil() && adapter.isKindOf(doc, mod)) - // parser.setFeature(docFrag, true); - // } catch (Exception e) { - // // ignore - // } + // this function is meant to be empty. It overrides the one in XmlSaxParserContext } } diff --git a/ext/java/nokogiri/XmlDocument.java b/ext/java/nokogiri/XmlDocument.java index 152d2e9efd5..1530b75a082 100644 --- a/ext/java/nokogiri/XmlDocument.java +++ b/ext/java/nokogiri/XmlDocument.java @@ -310,37 +310,28 @@ public static IRubyObject load_external_subsets_set(ThreadContext context, IRuby return context.getRuntime().getNil(); } - /** - * TODO: handle encoding? - * - * @param args[0] a Ruby IO or StringIO - * @param args[1] url or nil - * @param args[2] encoding - * @param args[3] bitset of parser options - */ - public static IRubyObject newFromData(ThreadContext context, - IRubyObject klass, - IRubyObject[] args) { + @JRubyMethod(meta = true, rest = true) + public static IRubyObject read_io(ThreadContext context, + IRubyObject klass, + IRubyObject[] args) { Ruby ruby = context.getRuntime(); Arity.checkArgumentCount(ruby, args, 4, 4); XmlDomParserContext ctx = new XmlDomParserContext(ruby, args[2], args[3]); - ctx.setInputSource(context, args[0], args[1]); + ctx.setIOInputSource(context, args[0], args[1]); return ctx.parse(context, klass, args[1]); } - @JRubyMethod(meta = true, rest = true) - public static IRubyObject read_io(ThreadContext context, - IRubyObject klass, - IRubyObject[] args) { - return newFromData(context, klass, args); - } - @JRubyMethod(meta = true, rest = true) public static IRubyObject read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args) { - return newFromData(context, klass, args); + Ruby ruby = context.getRuntime(); + Arity.checkArgumentCount(ruby, args, 4, 4); + XmlDomParserContext ctx = + new XmlDomParserContext(ruby, args[2], args[3]); + ctx.setStringInputSource(context, args[0], args[1]); + return ctx.parse(context, klass, args[1]); } /** not a JRubyMethod */ diff --git a/ext/java/nokogiri/XmlSaxParserContext.java b/ext/java/nokogiri/XmlSaxParserContext.java index 5537619022f..a3fe8f8c99c 100644 --- a/ext/java/nokogiri/XmlSaxParserContext.java +++ b/ext/java/nokogiri/XmlSaxParserContext.java @@ -37,11 +37,6 @@ import java.io.IOException; import java.io.InputStream; -import nokogiri.internals.NokogiriHandler; -import nokogiri.internals.NokogiriHelpers; -import nokogiri.internals.ParserContext; -import nokogiri.internals.XmlSaxParser; - import org.apache.xerces.parsers.AbstractSAXParser; import org.jruby.Ruby; import org.jruby.RubyClass; @@ -61,6 +56,11 @@ import org.xml.sax.SAXNotSupportedException; import org.xml.sax.SAXParseException; +import nokogiri.internals.NokogiriHandler; +import nokogiri.internals.NokogiriHelpers; +import nokogiri.internals.ParserContext; +import nokogiri.internals.XmlSaxParser; + /** * Base class for the SAX parsers. * @@ -126,7 +126,7 @@ public static IRubyObject parse_memory(ThreadContext context, final Ruby runtime = context.runtime; XmlSaxParserContext ctx = newInstance(runtime, (RubyClass) klazz); ctx.initialize(runtime); - ctx.setInputSource(context, data, runtime.getNil()); + ctx.setStringInputSource(context, data, runtime.getNil()); return ctx; } @@ -160,7 +160,7 @@ public static IRubyObject parse_io(ThreadContext context, final Ruby runtime = context.runtime; XmlSaxParserContext ctx = newInstance(runtime, (RubyClass) klazz); ctx.initialize(runtime); - ctx.setInputSource(context, data, runtime.getNil()); + ctx.setIOInputSource(context, data, runtime.getNil()); return ctx; } diff --git a/ext/java/nokogiri/internals/NokogiriHelpers.java b/ext/java/nokogiri/internals/NokogiriHelpers.java index c642ede8623..b9c7e5247f6 100644 --- a/ext/java/nokogiri/internals/NokogiriHelpers.java +++ b/ext/java/nokogiri/internals/NokogiriHelpers.java @@ -657,6 +657,18 @@ public static RubyArray namedNodeMapToRubyArray(Ruby ruby, NamedNodeMap map) { return n; } + public static String getValidEncodingOrNull(Ruby runtime, IRubyObject encoding) { + if (encoding.isNil()) { + return null; + } + + String givenEncoding = rubyStringToString(encoding); + if (charsetNames.contains(givenEncoding)) { + return givenEncoding; + } + return null; + } + public static String getValidEncoding(Ruby runtime, IRubyObject encoding) { if (encoding.isNil()) { return guessEncoding(); diff --git a/ext/java/nokogiri/internals/ParserContext.java b/ext/java/nokogiri/internals/ParserContext.java index 4529e7f0249..a79023c7e1b 100644 --- a/ext/java/nokogiri/internals/ParserContext.java +++ b/ext/java/nokogiri/internals/ParserContext.java @@ -33,16 +33,12 @@ package nokogiri.internals; import static nokogiri.internals.NokogiriHelpers.rubyStringToString; -import static org.jruby.runtime.Helpers.invoke; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.io.StringReader; import java.net.URI; -import java.nio.charset.Charset; -import java.nio.charset.UnsupportedCharsetException; import java.util.concurrent.Callable; import org.jruby.Ruby; @@ -81,55 +77,42 @@ protected InputSource getInputSource() { return source; } - /** - * Set the InputSource from url or data, - * which may be an IO object, a String, or a StringIO. - */ - public void setInputSource(ThreadContext context, IRubyObject data, IRubyObject url) { + public void setIOInputSource(ThreadContext context, IRubyObject data, IRubyObject url) { source = new InputSource(); + ParserContext.setUrl(context, source, url); - Ruby ruby = context.getRuntime(); + source.setByteStream(new IOInputStream(data)); + if (java_encoding != null) { + source.setEncoding(java_encoding); + } + } + public void setStringInputSource(ThreadContext context, IRubyObject data, IRubyObject url) { + source = new InputSource(); ParserContext.setUrl(context, source, url); - RubyString stringData = null; - if (invoke(context, data, "respond_to?", ruby.newSymbol("read")).isTrue()) { - source.setByteStream(new IOInputStream(data)); - if (java_encoding != null) { - source.setEncoding(java_encoding); - } - - } else if (data instanceof RubyString) { - stringData = (RubyString) data; + Ruby ruby = context.getRuntime(); - } else { - throw ruby.newArgumentError("must be kind_of String or respond to :to_io, :read, or :string"); + if (!(data instanceof RubyString)) { + throw ruby.newArgumentError("must be kind_of String"); } - if (stringData != null) { - String encName = null; - if (stringData.encoding(context) != null) { - encName = stringData.encoding(context).toString(); - } - Charset charset = null; + RubyString stringData = (RubyString) data; + + if (stringData.encoding(context) != null) { + RubyString stringEncoding = stringData.encoding(context).asString(); + String encName = NokogiriHelpers.getValidEncodingOrNull(context.runtime, stringEncoding); if (encName != null) { - try { - charset = Charset.forName(encName); - } catch (UnsupportedCharsetException e) { - // do nothing; - } - } - ByteList bytes = stringData.getByteList(); - if (charset != null) { - StringReader reader = new StringReader(new String(bytes.unsafeBytes(), bytes.begin(), bytes.length(), charset)); - source.setCharacterStream(reader); - source.setEncoding(charset.name()); - } else { - stringDataSize = bytes.length() - bytes.begin(); - ByteArrayInputStream stream = new ByteArrayInputStream(bytes.unsafeBytes(), bytes.begin(), bytes.length()); - source.setByteStream(stream); + java_encoding = encName; } } + + ByteList bytes = stringData.getByteList(); + + stringDataSize = bytes.length() - bytes.begin(); + ByteArrayInputStream stream = new ByteArrayInputStream(bytes.unsafeBytes(), bytes.begin(), bytes.length()); + source.setByteStream(stream); + source.setEncoding(java_encoding); } public static void setUrl(ThreadContext context, InputSource source, IRubyObject url) { From 0ed37e2cdc2b5e886c7a2b36f21f6233aafb664d Mon Sep 17 00:00:00 2001 From: John Shahid Date: Sat, 20 Apr 2019 09:38:15 -0400 Subject: [PATCH 09/11] Use JRubyMethod to specify the required number of arguments --- ext/java/nokogiri/HtmlDocument.java | 7 ++----- ext/java/nokogiri/XmlDocument.java | 15 ++------------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/ext/java/nokogiri/HtmlDocument.java b/ext/java/nokogiri/HtmlDocument.java index 3d66ff68730..aa5029a14d4 100644 --- a/ext/java/nokogiri/HtmlDocument.java +++ b/ext/java/nokogiri/HtmlDocument.java @@ -36,7 +36,6 @@ import org.jruby.RubyClass; import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; -import org.jruby.runtime.Arity; import org.jruby.runtime.Helpers; import org.jruby.runtime.ThreadContext; import org.jruby.runtime.builtin.IRubyObject; @@ -156,12 +155,11 @@ public String getPraedEncoding() { * Read the HTML document from +io+ with given +url+, +encoding+, * and +options+. See Nokogiri::HTML.parse */ - @JRubyMethod(meta = true, rest = true) + @JRubyMethod(meta = true, required = 4) public static IRubyObject read_io(ThreadContext context, IRubyObject cls, IRubyObject[] args) { Ruby ruby = context.getRuntime(); - Arity.checkArgumentCount(ruby, args, 4, 4); HtmlDomParserContext ctx = new HtmlDomParserContext(ruby, args[2], args[3]); ctx.setIOInputSource(context, args[0], args[1]); @@ -175,12 +173,11 @@ public static IRubyObject read_io(ThreadContext context, * Read the HTML document contained in +string+ with given +url+, +encoding+, * and +options+. See Nokogiri::HTML.parse */ - @JRubyMethod(meta = true, rest = true) + @JRubyMethod(meta = true, required = 4) public static IRubyObject read_memory(ThreadContext context, IRubyObject cls, IRubyObject[] args) { Ruby ruby = context.getRuntime(); - Arity.checkArgumentCount(ruby, args, 4, 4); HtmlDomParserContext ctx = new HtmlDomParserContext(ruby, args[2], args[3]); ctx.setStringInputSource(context, args[0], args[1]); diff --git a/ext/java/nokogiri/XmlDocument.java b/ext/java/nokogiri/XmlDocument.java index 1530b75a082..53ca88698f0 100644 --- a/ext/java/nokogiri/XmlDocument.java +++ b/ext/java/nokogiri/XmlDocument.java @@ -53,7 +53,6 @@ import org.jruby.anno.JRubyClass; import org.jruby.anno.JRubyMethod; import org.jruby.javasupport.JavaUtil; -import org.jruby.runtime.Arity; import org.jruby.runtime.Block; import org.jruby.runtime.Helpers; import org.jruby.runtime.ThreadContext; @@ -310,38 +309,28 @@ public static IRubyObject load_external_subsets_set(ThreadContext context, IRuby return context.getRuntime().getNil(); } - @JRubyMethod(meta = true, rest = true) + @JRubyMethod(meta = true, required = 4) public static IRubyObject read_io(ThreadContext context, IRubyObject klass, IRubyObject[] args) { Ruby ruby = context.getRuntime(); - Arity.checkArgumentCount(ruby, args, 4, 4); XmlDomParserContext ctx = new XmlDomParserContext(ruby, args[2], args[3]); ctx.setIOInputSource(context, args[0], args[1]); return ctx.parse(context, klass, args[1]); } - @JRubyMethod(meta = true, rest = true) + @JRubyMethod(meta = true, required = 4) public static IRubyObject read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args) { Ruby ruby = context.getRuntime(); - Arity.checkArgumentCount(ruby, args, 4, 4); XmlDomParserContext ctx = new XmlDomParserContext(ruby, args[2], args[3]); ctx.setStringInputSource(context, args[0], args[1]); return ctx.parse(context, klass, args[1]); } - /** not a JRubyMethod */ - public static IRubyObject read_memory(ThreadContext context, - IRubyObject[] args) { - return read_memory(context, - getNokogiriClass(context.getRuntime(), "Nokogiri::XML::Document"), - args); - } - @JRubyMethod(name="remove_namespaces!") public IRubyObject remove_namespaces(ThreadContext context) { removeNamespceRecursively(context, this); From b08976887444c1344d701938515aad4fc8a73658 Mon Sep 17 00:00:00 2001 From: John Shahid Date: Sat, 20 Apr 2019 21:43:05 -0400 Subject: [PATCH 10/11] Use Charset.defaultCharset instead the file.encoding property The file.encoding property could have an invalid encoding name because it is not normalized, as opposed to Charset.defaultCharset which is. --- .../nokogiri/internals/NokogiriHelpers.java | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/ext/java/nokogiri/internals/NokogiriHelpers.java b/ext/java/nokogiri/internals/NokogiriHelpers.java index b9c7e5247f6..10b2caafa43 100644 --- a/ext/java/nokogiri/internals/NokogiriHelpers.java +++ b/ext/java/nokogiri/internals/NokogiriHelpers.java @@ -670,27 +670,16 @@ public static String getValidEncodingOrNull(Ruby runtime, IRubyObject encoding) } public static String getValidEncoding(Ruby runtime, IRubyObject encoding) { - if (encoding.isNil()) { - return guessEncoding(); - } else { - return ignoreInvalidEncoding(runtime, encoding); + String validEncoding = getValidEncodingOrNull(runtime, encoding); + if (validEncoding != null) { + return validEncoding; } - } - private static String guessEncoding() { - String name = System.getProperty("file.encoding"); - if (name == null) name = "UTF-8"; - return name; + return Charset.defaultCharset().name(); } private static Set charsetNames = Charset.availableCharsets().keySet(); - private static String ignoreInvalidEncoding(Ruby runtime, IRubyObject encoding) { - String givenEncoding = rubyStringToString(encoding); - if (charsetNames.contains(givenEncoding)) return givenEncoding; - else return guessEncoding(); - } - public static String adjustSystemIdIfNecessary(String currentDir, String scriptFileName, String baseURI, String systemId) { if (systemId == null) return systemId; File file = new File(systemId); From bf2c54704beae88fe56ace9497829dae4e87aaec Mon Sep 17 00:00:00 2001 From: John Shahid Date: Sun, 21 Apr 2019 15:53:57 -0400 Subject: [PATCH 11/11] Remove UncloseableInputStream and NokogiriEncodingReaderWrapper from the manifest --- Manifest.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/Manifest.txt b/Manifest.txt index c623d2bafb3..97c07f425f8 100644 --- a/Manifest.txt +++ b/Manifest.txt @@ -40,7 +40,6 @@ ext/java/nokogiri/internals/HtmlDomParserContext.java ext/java/nokogiri/internals/IgnoreSchemaErrorsErrorHandler.java ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java ext/java/nokogiri/internals/NokogiriDomParser.java -ext/java/nokogiri/internals/NokogiriEncodingReaderWrapper.java ext/java/nokogiri/internals/NokogiriEntityResolver.java ext/java/nokogiri/internals/NokogiriErrorHandler.java ext/java/nokogiri/internals/NokogiriHandler.java @@ -58,7 +57,6 @@ ext/java/nokogiri/internals/ParserContext.java ext/java/nokogiri/internals/ReaderNode.java ext/java/nokogiri/internals/SaveContextVisitor.java ext/java/nokogiri/internals/SchemaErrorHandler.java -ext/java/nokogiri/internals/UncloseableInputStream.java ext/java/nokogiri/internals/XalanDTMManagerPatch.java ext/java/nokogiri/internals/XmlDeclHandler.java ext/java/nokogiri/internals/XmlDomParserContext.java