Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Standardize reading from IO-like objects, including StringIO #1897

Merged
merged 11 commits into from
Apr 23, 2019
Merged
2 changes: 0 additions & 2 deletions Manifest.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ ext/java/nokogiri/internals/HtmlDomParserContext.java
ext/java/nokogiri/internals/IgnoreSchemaErrorsErrorHandler.java
ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java
ext/java/nokogiri/internals/NokogiriDomParser.java
ext/java/nokogiri/internals/NokogiriEncodingReaderWrapper.java
ext/java/nokogiri/internals/NokogiriEntityResolver.java
ext/java/nokogiri/internals/NokogiriErrorHandler.java
ext/java/nokogiri/internals/NokogiriHandler.java
Expand All @@ -58,7 +57,6 @@ ext/java/nokogiri/internals/ParserContext.java
ext/java/nokogiri/internals/ReaderNode.java
ext/java/nokogiri/internals/SaveContextVisitor.java
ext/java/nokogiri/internals/SchemaErrorHandler.java
ext/java/nokogiri/internals/UncloseableInputStream.java
ext/java/nokogiri/internals/XalanDTMManagerPatch.java
ext/java/nokogiri/internals/XmlDeclHandler.java
ext/java/nokogiri/internals/XmlDomParserContext.java
Expand Down
28 changes: 12 additions & 16 deletions ext/java/nokogiri/HtmlDocument.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
import org.jruby.RubyClass;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.Arity;
import org.jruby.runtime.Helpers;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
Expand Down Expand Up @@ -108,17 +107,6 @@ public IRubyObject getInternalSubset(ThreadContext context) {

return internalSubset;
}

public static IRubyObject do_parse(ThreadContext context,
IRubyObject klass,
IRubyObject[] args) {
Ruby ruby = context.getRuntime();
Arity.checkArgumentCount(ruby, args, 4, 4);
HtmlDomParserContext ctx =
new HtmlDomParserContext(ruby, args[2], args[3]);
ctx.setInputSource(context, args[0], args[1]);
return ctx.parse(context, klass, args[1]);
}

public void setDocumentNode(ThreadContext context, Node node) {
super.setNode(context, node);
Expand Down Expand Up @@ -167,11 +155,15 @@ public String getPraedEncoding() {
* Read the HTML document from +io+ with given +url+, +encoding+,
* and +options+. See Nokogiri::HTML.parse
*/
@JRubyMethod(meta = true, rest = true)
@JRubyMethod(meta = true, required = 4)
public static IRubyObject read_io(ThreadContext context,
IRubyObject cls,
IRubyObject[] args) {
return do_parse(context, cls, args);
Ruby ruby = context.getRuntime();
HtmlDomParserContext ctx =
new HtmlDomParserContext(ruby, args[2], args[3]);
ctx.setIOInputSource(context, args[0], args[1]);
return ctx.parse(context, cls, args[1]);
}

/*
Expand All @@ -181,10 +173,14 @@ public static IRubyObject read_io(ThreadContext context,
* Read the HTML document contained in +string+ with given +url+, +encoding+,
* and +options+. See Nokogiri::HTML.parse
*/
@JRubyMethod(meta = true, rest = true)
@JRubyMethod(meta = true, required = 4)
public static IRubyObject read_memory(ThreadContext context,
IRubyObject cls,
IRubyObject[] args) {
return do_parse(context, cls, args);
Ruby ruby = context.getRuntime();
HtmlDomParserContext ctx =
new HtmlDomParserContext(ruby, args[2], args[3]);
ctx.setStringInputSource(context, args[0], args[1]);
return ctx.parse(context, cls, args[1]);
}
}
100 changes: 24 additions & 76 deletions ext/java/nokogiri/HtmlSaxParserContext.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,28 +32,22 @@

package nokogiri;

import static nokogiri.internals.NokogiriHelpers.rubyStringToString;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.EnumSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import nokogiri.internals.NokogiriHandler;

import org.apache.xerces.parsers.AbstractSAXParser;
import org.cyberneko.html.parsers.SAXParser;
import org.jruby.*;
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyFixnum;
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.xml.sax.SAXException;

import nokogiri.internals.NokogiriHandler;
import nokogiri.internals.NokogiriHelpers;

/**
* Class for Nokogiri::HTML::SAX::ParserContext.
*
Expand All @@ -78,6 +72,11 @@ protected AbstractSAXParser createParser() throws SAXException {
"http://cyberneko.org/html/properties/names/elems", "lower");
parser.setProperty(
"http://cyberneko.org/html/properties/names/attrs", "lower");

// NekoHTML should not try to guess the encoding based on the meta
// tags or other information in the document. This is already
// handled by the EncodingReader.
parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
return parser;
} catch(SAXException ex) {
throw new SAXException(
Expand All @@ -92,16 +91,11 @@ public static IRubyObject parse_memory(ThreadContext context,
IRubyObject encoding) {
HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
ctx.initialize(context.getRuntime());
String javaEncoding = findEncoding(context, encoding);
if (javaEncoding != null) {
String input = applyEncoding(rubyStringToString(data), javaEncoding);
ByteArrayInputStream istream = new ByteArrayInputStream(input.getBytes());
ctx.setInputSource(istream);
ctx.getInputSource().setEncoding(javaEncoding);
}
ctx.java_encoding = NokogiriHelpers.getValidEncodingOrNull(context.runtime, encoding);
ctx.setStringInputSource(context, data, context.nil);
return ctx;
}

public enum EncodingType {
NONE(0, "NONE"),
UTF_8(1, "UTF-8"),
Expand Down Expand Up @@ -152,44 +146,15 @@ private static String findName(final int value) {
}

private static String findEncoding(ThreadContext context, IRubyObject encoding) {
String rubyEncoding = null;
if (encoding instanceof RubyString) {
rubyEncoding = rubyStringToString(encoding);
}
else if (encoding instanceof RubyFixnum) {
// HTML::Sax::Parser leaks a libxml implementation detail and passes an
// Encoding integer to parse_io. We have to reverse map the integer
// into a name.
if (encoding instanceof RubyFixnum) {
int value = RubyFixnum.fix2int((RubyFixnum) encoding);
rubyEncoding = findName(value);
}
if (rubyEncoding == null) return null;
try {
return Charset.forName(rubyEncoding).displayName();
}
catch (UnsupportedCharsetException e) {
throw context.getRuntime().newEncodingCompatibilityError(rubyEncoding + "is not supported");
return findName(value);
}
catch (IllegalCharsetNameException e) {
throw context.getRuntime().newInvalidEncoding(e.getMessage());
}
}

private static final Pattern CHARSET_PATTERN = Pattern.compile("charset(()|\\s)=(()|\\s)([a-z]|-|_|\\d)+");

private static String applyEncoding(String input, String enc) {
String str = input.toLowerCase();
int start_pos = 0;
int end_pos = 0;
if (input.contains("meta") && input.contains("charset")) {
Matcher m = CHARSET_PATTERN.matcher(str);
while (m.find()) {
start_pos = m.start();
end_pos = m.end();
}
}
if (start_pos != end_pos) {
String substr = input.substring(start_pos, end_pos);
input = input.replace(substr, "charset=" + enc);
}
return input;
return NokogiriHelpers.getValidEncodingOrNull(context.runtime, encoding);
}

@JRubyMethod(name="file", meta=true)
Expand All @@ -199,11 +164,8 @@ public static IRubyObject parse_file(ThreadContext context,
IRubyObject encoding) {
HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
ctx.initialize(context.getRuntime());
ctx.java_encoding = NokogiriHelpers.getValidEncodingOrNull(context.runtime, encoding);
ctx.setInputSourceFile(context, data);
String javaEncoding = findEncoding(context, encoding);
if (javaEncoding != null) {
ctx.getInputSource().setEncoding(javaEncoding);
}
return ctx;
}

Expand All @@ -214,11 +176,8 @@ public static IRubyObject parse_io(ThreadContext context,
IRubyObject encoding) {
HtmlSaxParserContext ctx = (HtmlSaxParserContext) NokogiriService.HTML_SAXPARSER_CONTEXT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass)klazz);
ctx.initialize(context.getRuntime());
ctx.setInputSource(context, data, context.getRuntime().getNil());
String javaEncoding = findEncoding(context, encoding);
if (javaEncoding != null) {
ctx.getInputSource().setEncoding(javaEncoding);
}
ctx.java_encoding = findEncoding(context, encoding);
ctx.setIOInputSource(context, data, context.getRuntime().getNil());
return ctx;
}

Expand All @@ -235,18 +194,7 @@ static HtmlSaxParserContext parse_stream(final Ruby runtime, RubyClass klazz, In

@Override
protected void preParse(final Ruby runtime, IRubyObject handlerRuby, NokogiriHandler handler) {
// final String path = "Nokogiri::XML::FragmentHandler";
// final String docFrag =
// "http://cyberneko.org/html/features/balance-tags/document-fragment";
// RubyObjectAdapter adapter = JavaEmbedUtils.newObjectAdapter();
// IRubyObject doc = adapter.getInstanceVariable(handlerRuby, "@document");
// RubyModule mod = runtime.getClassFromPath(path);
// try {
// if (doc != null && !doc.isNil() && adapter.isKindOf(doc, mod))
// parser.setFeature(docFrag, true);
// } catch (Exception e) {
// // ignore
// }
// this function is meant to be empty. It overrides the one in XmlSaxParserContext
}

}
42 changes: 11 additions & 31 deletions ext/java/nokogiri/XmlDocument.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.javasupport.JavaUtil;
import org.jruby.runtime.Arity;
import org.jruby.runtime.Block;
import org.jruby.runtime.Helpers;
import org.jruby.runtime.ThreadContext;
Expand Down Expand Up @@ -310,45 +309,26 @@ public static IRubyObject load_external_subsets_set(ThreadContext context, IRuby
return context.getRuntime().getNil();
}

/**
* TODO: handle encoding?
*
* @param args[0] a Ruby IO or StringIO
* @param args[1] url or nil
* @param args[2] encoding
* @param args[3] bitset of parser options
*/
public static IRubyObject newFromData(ThreadContext context,
IRubyObject klass,
IRubyObject[] args) {
@JRubyMethod(meta = true, required = 4)
public static IRubyObject read_io(ThreadContext context,
IRubyObject klass,
IRubyObject[] args) {
Ruby ruby = context.getRuntime();
Arity.checkArgumentCount(ruby, args, 4, 4);
XmlDomParserContext ctx =
new XmlDomParserContext(ruby, args[2], args[3]);
ctx.setInputSource(context, args[0], args[1]);
ctx.setIOInputSource(context, args[0], args[1]);
return ctx.parse(context, klass, args[1]);
}

@JRubyMethod(meta = true, rest = true)
public static IRubyObject read_io(ThreadContext context,
IRubyObject klass,
IRubyObject[] args) {
return newFromData(context, klass, args);
}

@JRubyMethod(meta = true, rest = true)
@JRubyMethod(meta = true, required = 4)
public static IRubyObject read_memory(ThreadContext context,
IRubyObject klass,
IRubyObject[] args) {
return newFromData(context, klass, args);
}

/** not a JRubyMethod */
public static IRubyObject read_memory(ThreadContext context,
IRubyObject[] args) {
return read_memory(context,
getNokogiriClass(context.getRuntime(), "Nokogiri::XML::Document"),
args);
Ruby ruby = context.getRuntime();
XmlDomParserContext ctx =
new XmlDomParserContext(ruby, args[2], args[3]);
ctx.setStringInputSource(context, args[0], args[1]);
return ctx.parse(context, klass, args[1]);
}

@JRubyMethod(name="remove_namespaces!")
Expand Down
10 changes: 1 addition & 9 deletions ext/java/nokogiri/XmlNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@
import org.jruby.anno.JRubyClass;
import org.jruby.anno.JRubyMethod;
import org.jruby.exceptions.RaiseException;
import org.jruby.runtime.Helpers;
import org.jruby.runtime.Block;
import org.jruby.runtime.Helpers;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.Visibility;
import org.jruby.runtime.builtin.IRubyObject;
Expand Down Expand Up @@ -1316,14 +1316,6 @@ public IRubyObject previous_sibling(ThreadContext context) {
return getCachedNodeOrCreate(context.getRuntime(), node.getPreviousSibling());
}

@JRubyMethod(meta = true, rest = true)
public static IRubyObject new_from_str(ThreadContext context,
IRubyObject cls,
IRubyObject[] args) {
XmlDocument doc = (XmlDocument) XmlDocument.read_memory(context, args);
return doc.root(context);
}

@JRubyMethod(name = {"node_name", "name"})
public IRubyObject node_name(ThreadContext context) {
return getNodeName(context);
Expand Down
21 changes: 10 additions & 11 deletions ext/java/nokogiri/XmlReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,6 @@
import java.util.List;
import java.util.Stack;

import nokogiri.internals.NokogiriEntityResolver;
import nokogiri.internals.ParserContext;
import nokogiri.internals.ParserContext.Options;
import nokogiri.internals.ReaderNode;
import nokogiri.internals.ReaderNode.ClosingNode;
import nokogiri.internals.ReaderNode.ElementNode;
import nokogiri.internals.ReaderNode.TextNode;
import nokogiri.internals.UncloseableInputStream;

import org.apache.xerces.impl.Constants;
import org.apache.xerces.impl.xs.opti.DefaultXMLDocumentHandler;
import org.apache.xerces.parsers.StandardParserConfiguration;
Expand Down Expand Up @@ -81,6 +72,14 @@
import org.jruby.util.IOInputStream;
import org.xml.sax.InputSource;

import nokogiri.internals.NokogiriEntityResolver;
import nokogiri.internals.ParserContext;
import nokogiri.internals.ParserContext.Options;
import nokogiri.internals.ReaderNode;
import nokogiri.internals.ReaderNode.ClosingNode;
import nokogiri.internals.ReaderNode.ElementNode;
import nokogiri.internals.ReaderNode.TextNode;

/**
* Class for Nokogiri::XML::Reader
*
Expand Down Expand Up @@ -217,7 +216,7 @@ public static IRubyObject from_io(ThreadContext context, IRubyObject cls, IRubyO
options = new ParserContext.Options(2048 | 1);
}

InputStream in = new UncloseableInputStream(new IOInputStream(args[0]));
InputStream in = new IOInputStream(args[0]);
reader.setInput(context, in, url, options);
return reader;
}
Expand Down Expand Up @@ -245,7 +244,7 @@ public static IRubyObject from_memory(ThreadContext context, IRubyObject cls, IR
options = new ParserContext.Options(2048 | 1);
}
IRubyObject stringIO = runtime.getClass("StringIO").newInstance(context, args[0], Block.NULL_BLOCK);
InputStream in = new UncloseableInputStream(new IOInputStream(stringIO));
InputStream in = new IOInputStream(stringIO);
reader.setInput(context, in, url, options);
return reader;
}
Expand Down
Loading