diff --git a/src/bindings/text-buffer-wrapper.cc b/src/bindings/text-buffer-wrapper.cc index 3f919ab9..995ac08b 100644 --- a/src/bindings/text-buffer-wrapper.cc +++ b/src/bindings/text-buffer-wrapper.cc @@ -1,5 +1,7 @@ #include "text-buffer-wrapper.h" -#include +#include +#include +#include #include "point-wrapper.h" #include "range-wrapper.h" #include "text-wrapper.h" @@ -8,8 +10,25 @@ #include "text-slice.h" #include "text-diff.h" #include "noop.h" -#include -#include +#include + +#ifdef WIN32 + +static size_t get_file_size(const char *name) { + struct _stat file_stats; + if (_stat(name, &file_stats) != 0) return -1; + return file_stats.st_size; +} + +#else + +static size_t get_file_size(const char *name) { + struct stat file_stats; + if (stat(name, &file_stats) != 0) return -1; + return file_stats.st_size; +} + +#endif using namespace v8; using std::move; @@ -358,12 +377,10 @@ void TextBufferWrapper::load_sync(const Nan::FunctionCallbackInfo &info) Nan::HandleScope scope; - std::ifstream file(file_path, std::ios_base::binary); - auto beginning = file.tellg(); - file.seekg(0, std::ios::end); - auto end = file.tellg(); - file.seekg(0); - size_t file_size = end - beginning; + FILE *file = fopen(file_path.c_str(), "rb"); + fseek(file, 0L, SEEK_END); + size_t file_size = ftell(file); + rewind(file); vector input_buffer(CHUNK_SIZE); Text::String loaded_string; @@ -380,6 +397,7 @@ void TextBufferWrapper::load_sync(const Nan::FunctionCallbackInfo &info) } } ); + fclose(file); Text loaded_text{move(loaded_string)}; Patch patch = text_diff(text_buffer.base_text(), loaded_text); @@ -443,20 +461,21 @@ void TextBufferWrapper::load_(const Nan::FunctionCallbackInfo &info, bool void Execute(const Nan::AsyncProgressWorkerBase::ExecutionProgress &progress) { if (!loaded_text) { - std::ifstream file(file_name, std::ios_base::binary); - auto beginning = file.tellg(); - file.seekg(0, std::ios::end); - auto end = file.tellg(); - file.seekg(0); - size_t file_size = end - beginning; - if (!file) { + auto conversion = transcoding_from(encoding_name.c_str()); + if (!conversion) { + error_number = INVALID_ENCODING; + return; + } + + size_t file_size = get_file_size(file_name.c_str()); + if (file_size == static_cast(-1)) { error_number = errno; return; } - auto conversion = transcoding_from(encoding_name.c_str()); - if (!conversion) { - error_number = INVALID_ENCODING; + FILE *file = fopen(file_name.c_str(), "rb"); + if (!file) { + error_number = errno; return; } @@ -472,10 +491,12 @@ void TextBufferWrapper::load_(const Nan::FunctionCallbackInfo &info, bool progress.Send(&percent_done, 1); } )) { + fclose(file); error_number = errno; return; } + fclose(file); loaded_text = Text{move(loaded_string)}; } @@ -613,7 +634,12 @@ void TextBufferWrapper::save_sync(const Nan::FunctionCallbackInfo &info) return; } - std::ofstream file(file_path, std::ios_base::binary); + FILE *file = fopen(file_path.c_str(), "w+b"); + if (!file) { + info.GetReturnValue().Set(Nan::False()); + return; + } + vector output_buffer(CHUNK_SIZE); for (TextSlice &chunk : text_buffer.chunks()) { if (!conversion->encode( @@ -654,7 +680,7 @@ void TextBufferWrapper::save(const Nan::FunctionCallbackInfo &info) { return; } - std::ofstream file(file_name, std::ios_base::binary); + FILE *file = fopen(file_name.c_str(), "wb+"); if (!file) { error_number = errno; return; @@ -670,9 +696,12 @@ void TextBufferWrapper::save(const Nan::FunctionCallbackInfo &info) { output_buffer )) { error_number = errno; + fclose(file); return; } } + + fclose(file); } void HandleOKCallback() { diff --git a/src/core/encoding-conversion.cc b/src/core/encoding-conversion.cc index d56602ca..5d29567f 100644 --- a/src/core/encoding-conversion.cc +++ b/src/core/encoding-conversion.cc @@ -114,7 +114,6 @@ int EncodingConversion::convert( case UTF16_TO_UTF8: { auto converter = static_cast(data); - const char *input_start = *input; const char16_t *next_input; char *next_output; int result = converter->facet.out( @@ -131,11 +130,11 @@ int EncodingConversion::convert( switch (result) { case codecvt_base::ok: // When using GCC and libstdc++, `codecvt_utf8_utf16::out` seems to - // incorrectly return `ok` when there is an incomplete multi-byte - // sequence at the end of the input chunk. But it correctly does - // not advance the input pointer, so we can distinguish this - // situation from an actual successful result. - if (*input == input_start && input_start < input_end) return InvalidTrailing; + // return `ok` when there is an incomplete multi-byte sequence at the + // end of the input chunk. But it correctly does not advance the input + // pointer, so we can distinguish this situation from an actual + // successful result. + if (*input < input_end && *output < output_end) return InvalidTrailing; return Ok; case codecvt_base::partial: @@ -171,7 +170,7 @@ int EncodingConversion::convert( return Error; } -bool EncodingConversion::decode(String &string, std::istream &stream, +bool EncodingConversion::decode(String &string, FILE *stream, vector &input_vector, function progress_callback) { char *input_buffer = input_vector.data(); @@ -179,10 +178,9 @@ bool EncodingConversion::decode(String &string, std::istream &stream, size_t total_bytes_read = 0; for (;;) { - errno = 0; - stream.read(input_buffer + bytes_left_over, input_vector.size() - bytes_left_over); - if (!stream && errno != 0) return false; - size_t bytes_read = stream.gcount(); + size_t bytes_to_read = input_vector.size() - bytes_left_over; + size_t bytes_read = fread(input_buffer + bytes_left_over, 1, bytes_to_read, stream); + if (bytes_read < bytes_to_read && ferror(stream)) return false; size_t bytes_to_append = bytes_left_over + bytes_read; if (bytes_to_append == 0) break; @@ -271,13 +269,12 @@ size_t EncodingConversion::decode(String &string, const char *input_start, } bool EncodingConversion::encode(const String &string, size_t start_offset, - size_t end_offset, std::ostream &stream, + size_t end_offset, FILE *stream, vector &output_vector) { char *output_buffer = output_vector.data(); - bool end = false; - for (;;) { - size_t output_bytes_written = encode( + while (start_offset < end_offset) { + size_t bytes_encoded = encode( string, &start_offset, end_offset, @@ -285,11 +282,9 @@ bool EncodingConversion::encode(const String &string, size_t start_offset, output_vector.size(), end ); - errno = 0; - stream.write(output_buffer, output_bytes_written); - if (end) break; - if (output_bytes_written == 0) end = true; - if (!stream && errno != 0) return false; + if (bytes_encoded == 0) end = true; + size_t bytes_written = fwrite(output_buffer, 1, bytes_encoded, stream); + if (bytes_written < bytes_encoded && ferror(stream)) return false; } return true; diff --git a/src/core/encoding-conversion.h b/src/core/encoding-conversion.h index b48f45a6..b6af05d3 100644 --- a/src/core/encoding-conversion.h +++ b/src/core/encoding-conversion.h @@ -3,6 +3,7 @@ #include "optional.h" #include "text.h" +#include class EncodingConversion { void *data; @@ -17,10 +18,10 @@ class EncodingConversion { ~EncodingConversion(); bool encode(const Text::String &, size_t start_offset, size_t end_offset, - std::ostream &stream, std::vector &buffer); + FILE *stream, std::vector &buffer); size_t encode(const Text::String &, size_t *start_offset, size_t end_offset, char *buffer, size_t buffer_size, bool is_last = false); - bool decode(Text::String &, std::istream &stream, std::vector &buffer, + bool decode(Text::String &, FILE *stream, std::vector &buffer, std::function progress_callback); size_t decode(Text::String &, const char *buffer, size_t buffer_size, bool is_last = false); diff --git a/test/js/text-buffer.test.js b/test/js/text-buffer.test.js index a41b36bd..f5953c08 100644 --- a/test/js/text-buffer.test.js +++ b/test/js/text-buffer.test.js @@ -272,7 +272,7 @@ describe('TextBuffer', () => { done(new Error('Expected an error')) }) .catch((error) => { - assert.equal(error.code, isWindows ? 'EACCES' : 'EISDIR') + if (!isWindows) assert.equal(error.code, 'EISDIR') assert.equal(error.path, filePath) done() }) @@ -291,7 +291,7 @@ describe('TextBuffer', () => { done(new Error('Expected an error')) }) .catch((error) => { - assert.equal(error.code, isWindows ? 'EINVAL' : 'ELOOP') + if (!isWindows) assert.equal(error.code, 'ELOOP') assert.equal(error.path, filePath) done() }) diff --git a/test/native/encoding-conversion-test.cc b/test/native/encoding-conversion-test.cc index 20b23b90..743fe88a 100644 --- a/test/native/encoding-conversion-test.cc +++ b/test/native/encoding-conversion-test.cc @@ -9,143 +9,144 @@ using std::vector; using std::u16string; using String = Text::String; -TEST_CASE("EncodingConversion::decode - can decode a UTF-8 stream") { +TEST_CASE("EncodingConversion::decode - basic UTF-8") { auto conversion = transcoding_from("UTF-8"); - stringstream stream("abγdefg\nhijklmnop", std::ios_base::in); + string input("abγdefg\nhijklmnop"); String string; - vector encoding_buffer(3); - vector progress_reports; - conversion->decode(string, stream, encoding_buffer, [&](size_t percent_done) { - progress_reports.push_back(percent_done); - }); - + conversion->decode(string, input.data(), input.size()); REQUIRE(string == u"abγdefg\nhijklmnop"); - REQUIRE(progress_reports == vector({2, 5, 8, 11, 14, 17, 18})); + + // This first chunk ends in the middle of the multi-byte 'γ' character, so + // decoding stops before that character. + String string2; + size_t bytes_read = conversion->decode(string2, input.data(), 3); + REQUIRE(bytes_read == 2); + + // We can pick up where we left off and decode the reset of the input. + conversion->decode(string2, input.data() + 2, input.size() - 2); + REQUIRE(string2 == u"abγdefg\nhijklmnop"); } -TEST_CASE("EncodingConversion::decode - can decode an ISO-8859-1 stream") { +TEST_CASE("EncodingConversion::decode - basic ISO-8859-1") { auto conversion = transcoding_from("ISO-8859-1"); - stringstream stream("qrst" "\xfc" "v", std::ios_base::in); // qrstüv + string input("qrst" "\xfc" "v"); // qrstüv String string; - vector encoding_buffer(3); - conversion->decode(string, stream, encoding_buffer, [&](size_t percent_done) {}); - + conversion->decode(string, input.data(), input.size()); REQUIRE(string == u"qrstüv"); } -TEST_CASE("EncodingConversion::decode - replaces invalid byte sequences in the middle of the stream with the Unicode replacement character") { +TEST_CASE("EncodingConversion::decode - invalid byte sequences in the middle of the input") { auto conversion = transcoding_from("UTF-8"); - stringstream stream("ab" "\xc0" "\xc1" "de", std::ios_base::in); + string input("ab" "\xc0" "\xc1" "de"); String string; - vector encoding_buffer(3); - vector progress_reports; - conversion->decode(string, stream, encoding_buffer, [&](size_t percent_done) { - progress_reports.push_back(percent_done); - }); - + conversion->decode(string, input.data(), input.size()); REQUIRE(string == u"ab" "\ufffd" "\ufffd" "de"); - REQUIRE(progress_reports == vector({ 3, 6 })); } -TEST_CASE("EncodingConversion::decode - replaces invalid byte sequences at the end of the stream with the Unicode replacement characters") { +TEST_CASE("EncodingConversion::decode - invalid byte sequences at the end of the input") { auto conversion = transcoding_from("UTF-8"); - stringstream stream("ab" "\xf0\x9f", std::ios_base::in); // incomplete 4-byte code point for '😁' at the end of the stream + string input("ab" "\xf0\x9f"); // incomplete 4-byte code point for '😁' at the end of the stream String string; - vector encoding_buffer(5); - conversion->decode(string, stream, encoding_buffer, [&](size_t percent_done) {}); - + size_t bytes_encoded = conversion->decode(string, input.data(), input.size()); + REQUIRE(bytes_encoded == 2); + REQUIRE(string == u"ab"); + + // Passing the `is_end` + string.clear(); + bytes_encoded = conversion->decode(string, input.data(), input.size(), true); + REQUIRE(bytes_encoded == 4); REQUIRE(string == u"ab" "\ufffd" "\ufffd"); } -TEST_CASE("EncodingConversion::decode - handles characters that require two 16-bit code units") { +TEST_CASE("EncodingConversion::decode - four-byte UTF-16 characters") { auto conversion = transcoding_from("UTF-8"); - stringstream stream("ab" "\xf0\x9f" "\x98\x81" "cd", std::ios_base::in); // 'ab😁cd' + string input("ab" "\xf0\x9f" "\x98\x81" "cd"); // 'ab😁cd' String string; - vector encoding_buffer(5); - conversion->decode(string, stream, encoding_buffer, [&](size_t percent_done) {}); - + conversion->decode(string, input.data(), input.size()); REQUIRE(string == u"ab" "\xd83d" "\xde01" "cd"); } -TEST_CASE("EncodingConversion::decode - resizes the buffer if the encoding conversion runs out of room") { - auto conversion = transcoding_from("UTF-8"); - stringstream stream("abcdef", std::ios_base::in); - - String string; - vector encoding_buffer(5); - conversion->decode(string, stream, encoding_buffer, [&](size_t percent_done) {}); +TEST_CASE("EncodingConversion::encode - basic") { + auto conversion = transcoding_to("UTF-8"); + u16string content = u"abγdefg\nhijklmnop"; + String string(content.begin(), content.end()); - REQUIRE(string == u"abcdef"); -} + vector output(3); + size_t bytes_encoded = 0, start = 0; -TEST_CASE("EncodingConversion::decode - handles CRLF newlines") { - auto conversion = transcoding_from("UTF-8"); - stringstream stream("abc\r\nde\rf\r\ng\r", std::ios_base::in); + // The 'γ' requires to UTF-8 bytes, so it doesn't fit in the output buffer + bytes_encoded = conversion->encode( + string, &start, string.size(), output.data(), output.size()); + REQUIRE(std::string(output.data(), bytes_encoded) == "ab"); - String string; - vector encoding_buffer(4); - conversion->decode(string, stream, encoding_buffer, [&](size_t percent_done) {}); + bytes_encoded = conversion->encode( + string, &start, string.size(), output.data(), output.size()); + REQUIRE(std::string(output.data(), bytes_encoded) == "γd"); - REQUIRE(string == u"abc\r\nde\rf\r\ng\r"); + bytes_encoded = conversion->encode( + string, &start, string.size(), output.data(), output.size()); + REQUIRE(std::string(output.data(), bytes_encoded) == "efg"); } -TEST_CASE("EncodingConversion::encode - basic") { +TEST_CASE("EncodingConversion::encode - four-byte UTF-16 characters") { auto conversion = transcoding_to("UTF-8"); - - u16string content = u"abγdefg\nhijklmnop"; + u16string content = u"ab" "\xd83d" "\xde01" "cd"; // 'ab😁cd' String string(content.begin(), content.end()); - vector encoding_buffer(3); - stringstream stream; - conversion->encode(string, 0, string.size(), stream, encoding_buffer); - REQUIRE(stream.str() == "abγdefg\nhijklmnop"); + vector output(10); + size_t bytes_encoded = 0, start = 0; + + bytes_encoded = conversion->encode( + string, &start, string.size(), output.data(), output.size()); + REQUIRE(std::string(output.data(), bytes_encoded) == "ab" "\xf0\x9f" "\x98\x81" "cd"); + + // The end offset, 3, is in the middle of the 4-byte character. + start = 0; + bytes_encoded = conversion->encode( + string, &start, 3, output.data(), output.size()); + REQUIRE(std::string(output.data(), bytes_encoded) == "ab"); - stringstream stream2; - conversion->encode(string, 1, string.size(), stream2, encoding_buffer); - REQUIRE(stream2.str() == "bγdefg\nhijklmnop"); + // We can pick up where we left off. + bytes_encoded += conversion->encode( + string, &start, string.size(), output.data() + bytes_encoded, output.size() - bytes_encoded); + REQUIRE(std::string(output.data(), bytes_encoded) == "ab" "\xf0\x9f" "\x98\x81" "cd"); } -TEST_CASE("EncodingConversion::encode - invalid characters") { +TEST_CASE("EncodingConversion::encode - invalid characters in the middle of the string") { auto conversion = transcoding_to("UTF-8"); - u16string content = u"abc" "\xD800" "def"; String string(content.begin(), content.end()); - vector encoding_buffer(3); - stringstream stream; - conversion->encode(string, 0, string.size(), stream, encoding_buffer); - REQUIRE(stream.str() == "abc" "\ufffd" "def"); + vector output(10); + size_t bytes_encoded = 0, start = 0; - stringstream stream2; - conversion->encode(string, 1, string.size(), stream2, encoding_buffer); - REQUIRE(stream2.str() == "bc" "\ufffd" "def"); + bytes_encoded = conversion->encode( + string, &start, string.size(), output.data(), output.size()); + REQUIRE(std::string(output.data(), bytes_encoded) == "abc" "\ufffd" "def"); - stringstream stream3; - conversion->encode(string, 2, string.size(), stream3, encoding_buffer); - REQUIRE(stream3.str() == "c" "\ufffd" "def"); + // Here, the invalid character occurs at the end of a chunk. + start = 0; + bytes_encoded = conversion->encode( + string, &start, 4, output.data(), output.size()); + bytes_encoded += conversion->encode( + string, &start, string.size(), output.data() + bytes_encoded, output.size() - bytes_encoded); + REQUIRE(std::string(output.data(), bytes_encoded) == "abc" "\ufffd" "def"); } -TEST_CASE("EncodingConversion::encode - invalid characters at the end of the slice") { +TEST_CASE("EncodingConversion::encode - invalid characters at the end of the string") { auto conversion = transcoding_to("UTF-8"); - u16string content = u"abc" "\xD800"; String string(content.begin(), content.end()); - vector encoding_buffer(3); - - stringstream stream; - conversion->encode(string, 0, string.size(), stream, encoding_buffer); - REQUIRE(stream.str() == "abc" "\ufffd"); - stringstream stream2; - conversion->encode(string, 1, string.size(), stream2, encoding_buffer); - REQUIRE(stream2.str() == "bc" "\ufffd"); + vector output(10); + size_t bytes_encoded = 0, start = 0; - stringstream stream3; - conversion->encode(string, 2, string.size(), stream3, encoding_buffer); - REQUIRE(stream3.str() == "c" "\ufffd"); + bytes_encoded = conversion->encode( + string, &start, string.size(), output.data(), output.size(), true); + REQUIRE(std::string(output.data(), bytes_encoded) == "abc" "\ufffd"); }