Skip to content

Commit

Permalink
Merge pull request #2775 from sparklemotion/2773-pseudo-io-serialization
Browse files Browse the repository at this point in the history
fix: serialization with pseudo-IO objects like Zip::OutputStream
  • Loading branch information
flavorjones authored Jan 26, 2023
2 parents 1605431 + 952ff44 commit c748078
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 18 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA

---

## 1.14.1 / unreleased

### Fixed

* Serializing documents now works again with pseudo-IO objects that don't support IO's encoding API (like rubyzip's `Zip::OutputStream`). This was a regression in v1.14.0 due to the fix for [#752](https://github.com/sparklemotion/nokogiri/issues/752) in [#2434](https://github.com/sparklemotion/nokogiri/issues/2434), and was not completely fixed by [#2753](https://github.com/sparklemotion/nokogiri/issues/2753). [[#2773](https://github.com/sparklemotion/nokogiri/issues/2773)]

2e260f53e6b84b8f9c1b115b0ded85eebc8155d7


## 1.14.0 / 2023-01-12

### Notable Changes
Expand Down
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ group :development do
gem "minitest-reporters", "= 1.5.0"
gem "ruby_memcheck", "1.2.0" unless RUBY_PLATFORM == "java"
gem "simplecov", "= 0.21.2"
gem "rubyzip", "~> 2.3.2"

# rubocop
if Gem::Requirement.new("~> 3.0").satisfied_by?(Gem::Version.new(RUBY_VERSION))
Expand Down
9 changes: 7 additions & 2 deletions ext/nokogiri/nokogiri.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,13 @@ noko_io_write(void *io, char *c_buffer, int c_buffer_len)
{
VALUE rb_args[2], rb_n_bytes_written;
VALUE rb_io = (VALUE)io;
VALUE rb_enc = rb_funcall(rb_io, id_external_encoding, 0);
rb_encoding *io_encoding = RB_NIL_P(rb_enc) ? rb_ascii8bit_encoding() : rb_to_encoding(rb_enc);
VALUE rb_enc = Qnil;
rb_encoding *io_encoding;

if (rb_respond_to(rb_io, id_external_encoding)) {
rb_enc = rb_funcall(rb_io, id_external_encoding, 0);
}
io_encoding = RB_NIL_P(rb_enc) ? rb_ascii8bit_encoding() : rb_to_encoding(rb_enc);

rb_args[0] = rb_io;
rb_args[1] = rb_enc_str_new(c_buffer, (long)c_buffer_len, io_encoding);
Expand Down
92 changes: 76 additions & 16 deletions test/xml/test_document_encoding.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@ class TestDocumentEncoding < Nokogiri::TestCase
describe "Nokogiri::XML::Document encoding" do
let(:shift_jis_document) { Nokogiri::XML(File.read(SHIFT_JIS_XML), SHIFT_JIS_XML) }
let(:ascii_document) { Nokogiri::XML.parse(File.read(XML_FILE), XML_FILE) }
let(:utf16_document) do
# the document needs to be large enough to trigger a libxml2 buffer flush. the buffer size
# is determined by MINLEN in xmlIO.c, which is hardcoded to 4000 code points.
size = 8000
<<~XML.encode(Encoding::UTF_16)
<?xml version="1.0" encoding="UTF-16"?>
<root>
<bar>#{"A" * size}</bar>
</root>
XML
end

describe "#encoding" do
it "describes the document's encoding correctly" do
Expand All @@ -31,36 +42,85 @@ class TestDocumentEncoding < Nokogiri::TestCase
end

it "encodes the URL as UTF-8" do
assert_equal("UTF-8", shift_jis_document.url.encoding.name)
assert_equal(Encoding::UTF_8, shift_jis_document.url.encoding)
end

it "encodes the encoding name as UTF-8" do
assert_equal("UTF-8", shift_jis_document.encoding.encoding.name)
assert_equal(Encoding::UTF_8, shift_jis_document.encoding.encoding)
end

it "encodes the library versions as UTF-8" do
skip_unless_libxml2
assert_equal("UTF-8", Nokogiri::LIBXML_COMPILED_VERSION.encoding.name)
assert_equal("UTF-8", Nokogiri::LIBXSLT_COMPILED_VERSION.encoding.name)

assert_equal(Encoding::UTF_8, Nokogiri::LIBXML_COMPILED_VERSION.encoding)
assert_equal(Encoding::UTF_8, Nokogiri::LIBXSLT_COMPILED_VERSION.encoding)
end

it "parses and serializes UTF-16 correctly" do
xml = <<~XML.encode(Encoding::UTF_16)
<?xml version="1.0" encoding="UTF-16"?>
<root><bar>A</bar></root>
XML
output = Nokogiri::XML(xml).to_xml
output_doc = Nokogiri::XML(output)

# these sizes are descriptive, not prescriptive. the JRuby and non-JRuby values differ only in
# whitespace, and may change as the implementations change. the intention is to verify that the
# output is _roughly_ the right length: not zero, not half-width, and not double-width.
expected_bytesize = Nokogiri.jruby? ? 132 : 142

assert_equal(Encoding::UTF_16, output.encoding)
assert_equal("UTF-16", output_doc.encoding)
assert_equal(expected_bytesize, output.bytesize)
output_doc.at_xpath("/root/bar/text()").tap do |node|
assert(node, "unexpected DOM structure in #{output.inspect}")
assert_equal("A", node.content)
end
end

it "serializes UTF-16 correctly across libxml2 buffer flushes" do
# https://github.com/sparklemotion/nokogiri/issues/752
skip_unless_libxml2

# the document needs to be large enough to trigger a libxml2 buffer flush. the buffer size
# is determined by MINLEN in xmlIO.c, which is hardcoded to 4000 code points.
size = 8000
input = String.new(<<~XML, encoding: "UTF-16")
<?xml version="1.0" encoding="UTF-16"?>
<root>
<bar>#{"A" * size}</bar>
</root>
XML
expected_length = (input.bytesize * 2) + 2 # double character width, add BOM bytes 0xFEFF
output = Nokogiri::XML(utf16_document).to_xml

output = Nokogiri::XML(input).to_xml
assert_equal(expected_length, output.bytesize)
assert_equal(Encoding::UTF_16, output.encoding)
assert_equal(utf16_document.bytesize, output.bytesize)
end

describe "pseudo-IO" do
it "serializes correctly with Zip::OutputStream objects" do
# https://github.com/sparklemotion/nokogiri/issues/2773
require "zip"

xml = <<~XML
<?xml version="1.0" encoding="UTF-8"?>
<root>
<bar>A</bar>
</root>
XML

Dir.mktmpdir do |tmpdir|
zipfile_path = File.join(tmpdir, "test.zip")

Zip::OutputStream.open(zipfile_path) do |io|
io.put_next_entry("test-utf8.xml")
Nokogiri::XML(xml).write_to(io, encoding: "UTF-8")
end

Zip::InputStream.open(zipfile_path) do |io|
entry = io.get_next_entry
assert_equal("test-utf8.xml", entry.name)
output = io.read

# no final newline on jruby. descriptive, not prescriptive.
expected_length = Nokogiri.jruby? ? xml.bytesize - 1 : xml.bytesize

assert_equal(Encoding::UTF_8, output.encoding)
assert_equal(expected_length, output.bytesize)
end
end
end
end
end
end
Expand Down

0 comments on commit c748078

Please sign in to comment.