diff --git a/include/ogonek/encoding/encoding_scheme.h++ b/include/ogonek/encoding/encoding_scheme.h++ index d11b295..268a99f 100644 --- a/include/ogonek/encoding/encoding_scheme.h++ +++ b/include/ogonek/encoding/encoding_scheme.h++ @@ -84,21 +84,13 @@ namespace ogonek { using state = typename EncodingForm::state; using code_unit = ogonek::byte; - template ::type, - typename EncodingIterator = encoding_iterator, Iterator>> - static boost::iterator_range encode(SinglePassRange const& r) { - return boost::make_iterator_range( - EncodingIterator { boost::begin(r), boost::end(r) }, - EncodingIterator { boost::end(r), boost::end(r) }); - } - template ::type, - typename DecodingIterator = decoding_iterator, Iterator>> - static boost::iterator_range decode(SinglePassRange const& r) { + typename EncodingIterator = encoding_iterator, Iterator, ValidationCallback>> + static boost::iterator_range encode(SinglePassRange const& r, ValidationCallback&& callback) { return boost::make_iterator_range( - DecodingIterator { boost::begin(r), boost::end(r) }, - DecodingIterator { boost::end(r), boost::end(r) }); + EncodingIterator { boost::begin(r), boost::end(r), callback }, + EncodingIterator { boost::end(r), boost::end(r), callback }); } template - static boost::sub_range decode_one(SinglePassRange const& r, codepoint& out, state& s) { - using code_unit_range = encoding_scheme_detail::byte_ordered_range; - using iterator = typename boost::range_iterator::type; - code_unit_range range { - iterator { boost::begin(r) }, iterator { boost::end(r) } - }; - auto remaining = EncodingForm::decode_one(range, out, s); - return { remaining.begin().it, r.end() }; - } template static boost::sub_range decode_one(SinglePassRange const& r, codepoint& out, state& s, ValidationCallback&& callback) { using code_unit_range = encoding_scheme_detail::byte_ordered_range; diff --git a/include/ogonek/encoding/iterator.h++ b/include/ogonek/encoding/iterator.h++ index 8c772c9..7c514dc 100644 --- a/include/ogonek/encoding/iterator.h++ +++ b/include/ogonek/encoding/iterator.h++ @@ -16,6 +16,7 @@ #include "../traits.h++" #include "../types.h++" +#include "../validation.h++" #include #include @@ -54,17 +55,27 @@ namespace ogonek { std::array array; }; - template + inline void validate(codepoint&, decltype(skip_validation)) {} + inline bool is_surrogate(codepoint u) { return u >= 0xD800 && u <= 0xDFFF; } + template + void validate(codepoint& u, Callback&& callback) { + auto list = { u }; + if(u > 0x10FFFF || is_surrogate(u)) { + callback(validation_result::illegal, boost::sub_range(list), u); // TODO: how to use the result? + } + } + + template struct encoding_iterator : boost::iterator_facade< - encoding_iterator, + encoding_iterator, CodeUnit, std::input_iterator_tag, // TODO CodeUnit > { public: - encoding_iterator(Iterator first, Iterator last) - : first(first), last(last) { + encoding_iterator(Iterator first, Iterator last, ValidationCallback callback) + : first(first), last(last), callback(std::forward(callback)) { encode_next(); } @@ -86,7 +97,9 @@ namespace ogonek { private: void encode_next() { if(first != last) { - encoded = EncodingForm::encode_one(*first++, state); + auto u = *first++; + validate(u, callback); + encoded = EncodingForm::encode_one(u, state); current = 0; } else { current = -1; @@ -94,12 +107,13 @@ namespace ogonek { } Iterator first, last; + typename std::decay::type callback; typename EncodingForm::state state {}; partial_array, EncodingForm::max_width> encoded {}; int current; }; - template + template struct decoding_iterator : boost::iterator_facade< decoding_iterator, @@ -132,39 +146,6 @@ namespace ogonek { typename std::decay::type callback; typename EncodingForm::state state {}; }; - - template - struct decoding_iterator - : boost::iterator_facade< - decoding_iterator, - codepoint, - std::input_iterator_tag, // TODO - codepoint - > { - public: - using range = boost::iterator_range; - - decoding_iterator(Iterator first, Iterator last) - : first(first), last(last), state{} {} - - codepoint dereference() const { - codepoint u; - auto s = state; - EncodingForm::decode_one(boost::sub_range(first, last), u, s); - return u; - } - bool equal(decoding_iterator const& that) const { - return first == that.first || (first == last && that.first == that.last); - } - void increment() { - codepoint dummy; - first = EncodingForm::decode_one(boost::sub_range(first, last), dummy, state).begin(); - } - - private: - Iterator first, last; - typename EncodingForm::state state; - }; } // namespace ogonek #endif // OGONEK_ENCODING_ITERATOR_HPP diff --git a/include/ogonek/encoding/utf16.h++ b/include/ogonek/encoding/utf16.h++ index 16190ab..a36f154 100644 --- a/include/ogonek/encoding/utf16.h++ +++ b/include/ogonek/encoding/utf16.h++ @@ -34,21 +34,13 @@ namespace ogonek { static constexpr bool is_self_synchronizing = true; struct state {}; - template ::type, - typename EncodingIterator = encoding_iterator> - static boost::iterator_range encode(SinglePassRange const& r) { - return boost::make_iterator_range( - EncodingIterator { boost::begin(r), boost::end(r) }, - EncodingIterator { boost::end(r), boost::end(r) }); - } - template ::type, - typename DecodingIterator = decoding_iterator> - static boost::iterator_range decode(SinglePassRange const& r) { + typename EncodingIterator = encoding_iterator> + static boost::iterator_range encode(SinglePassRange const& r, ValidationCallback&& callback) { return boost::make_iterator_range( - DecodingIterator { boost::begin(r), boost::end(r) }, - DecodingIterator { boost::end(r), boost::end(r) }); + EncodingIterator { boost::begin(r), boost::end(r), callback }, + EncodingIterator { boost::end(r), boost::end(r), callback }); } template = 0xD800 && u <= 0xDFFF; }; template - static boost::sub_range decode_one(SinglePassRange const& r, codepoint& out, state&) { + static boost::sub_range decode_one(SinglePassRange const& r, codepoint& out, state&, decltype(skip_validation)) { auto first = boost::begin(r); auto lead = *first++; if(!is_surrogate(lead)) { diff --git a/include/ogonek/encoding/utf32.h++ b/include/ogonek/encoding/utf32.h++ index 58a44ab..ce21347 100644 --- a/include/ogonek/encoding/utf32.h++ +++ b/include/ogonek/encoding/utf32.h++ @@ -30,21 +30,13 @@ namespace ogonek { static constexpr bool is_self_synchronizing = true; struct state {}; - template ::type, - typename EncodingIterator = encoding_iterator> - static boost::iterator_range encode(SinglePassRange const& r) { - return boost::make_iterator_range( - EncodingIterator { boost::begin(r), boost::end(r) }, - EncodingIterator { boost::end(r), boost::end(r) }); - } - template ::type, - typename DecodingIterator = decoding_iterator> - static boost::iterator_range decode(SinglePassRange const& r) { + typename EncodingIterator = encoding_iterator> + static boost::iterator_range encode(SinglePassRange const& r, ValidationCallback&& callback) { return boost::make_iterator_range( - DecodingIterator { boost::begin(r), boost::end(r) }, - DecodingIterator { boost::end(r), boost::end(r) }); + EncodingIterator { boost::begin(r), boost::end(r), callback }, + EncodingIterator { boost::end(r), boost::end(r), callback }); } template - static boost::sub_range decode_one(SinglePassRange const& r, codepoint& out, state&) { + static boost::sub_range decode_one(SinglePassRange const& r, codepoint& out, state&, decltype(skip_validation)) { auto first = boost::begin(r); out = *first++; return { first, boost::end(r) }; diff --git a/include/ogonek/encoding/utf8.h++ b/include/ogonek/encoding/utf8.h++ index 145cf7d..8c1f0e9 100644 --- a/include/ogonek/encoding/utf8.h++ +++ b/include/ogonek/encoding/utf8.h++ @@ -35,23 +35,14 @@ namespace ogonek { static constexpr bool is_self_synchronizing = true; struct state {}; - template ::type, - typename EncodingIterator = encoding_iterator> - static boost::iterator_range encode(SinglePassRange const& r) { - return boost::make_iterator_range( - EncodingIterator { boost::begin(r), boost::end(r) }, - EncodingIterator { boost::end(r), boost::end(r) }); - } - template ::type, - typename DecodingIterator = decoding_iterator> - static boost::iterator_range decode(SinglePassRange const& r) { + typename EncodingIterator = encoding_iterator> + static boost::iterator_range encode(SinglePassRange const& r, ValidationCallback&& callback) { return boost::make_iterator_range( - DecodingIterator { boost::begin(r), boost::end(r) }, - DecodingIterator { boost::end(r), boost::end(r) }); + EncodingIterator { boost::begin(r), boost::end(r), callback }, + EncodingIterator { boost::end(r), boost::end(r), callback }); } - template ::type, typename DecodingIterator = decoding_iterator> @@ -104,7 +95,7 @@ namespace ogonek { return ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F); } template - static boost::sub_range decode_one(SinglePassRange const& r, codepoint& out, state&) { + static boost::sub_range decode_one(SinglePassRange const& r, codepoint& out, state&, decltype(skip_validation)) { auto first = boost::begin(r); byte b0 = *first++; auto length = sequence_length(b0); diff --git a/include/ogonek/text.h++ b/include/ogonek/text.h++ index 7eb3f6d..86cb5bd 100644 --- a/include/ogonek/text.h++ +++ b/include/ogonek/text.h++ @@ -100,14 +100,12 @@ namespace ogonek { //! Construct from a codepoint range, with validation callback template - basic_text(CodepointRange const& range, ValidationCallback&& /*callback*/) - : basic_text(direct{}, EncodingForm::encode(range)) { // TODO use callback! + basic_text(CodepointRange const& range, ValidationCallback&& callback) + : basic_text(direct{}, EncodingForm::encode(range, std::forward(callback))) { static_assert(std::is_same, codepoint>::value, "Can only construct text from a range of codepoints"); } - // -- code units - // -- storage //! Construct from an underlying container explicit basic_text(Container storage) @@ -116,13 +114,13 @@ namespace ogonek { //** Range ** - using iterator = decoding_iterator; - using const_iterator = decoding_iterator; + using iterator = decoding_iterator; + using const_iterator = decoding_iterator; - iterator begin() { return iterator { storage_.begin(), storage_.end() }; } - iterator end() { return iterator { storage_.end(), storage_.end() }; } - const_iterator begin() const { return const_iterator { storage_.begin(), storage_.end() }; } - const_iterator end() const { return const_iterator { storage_.end(), storage_.end() }; } + iterator begin() { return iterator { storage_.begin(), storage_.end(), skip_validation }; } + iterator end() { return iterator { storage_.end(), storage_.end(), skip_validation }; } + const_iterator begin() const { return const_iterator { storage_.begin(), storage_.end(), skip_validation }; } + const_iterator end() const { return const_iterator { storage_.end(), storage_.end(), skip_validation }; } //** Interoperation ** diff --git a/include/ogonek/validation.h++ b/include/ogonek/validation.h++ index 2d2ef3c..a2578bc 100644 --- a/include/ogonek/validation.h++ +++ b/include/ogonek/validation.h++ @@ -32,27 +32,29 @@ namespace ogonek { } }; - struct { + constexpr struct { template boost::sub_range operator()(validation_result, boost::sub_range const&, codepoint&) const { throw validation_error(); } - } constexpr throw_validation_error = {}; + } throw_validation_error = {}; - struct { + constexpr struct { template boost::sub_range operator()(validation_result, boost::sub_range const& source, codepoint& out) const { out = U'\xFFFD'; return { std::next(boost::begin(source)), boost::end(source) }; } - } constexpr use_replacement_character = {}; + } use_replacement_character = {}; - struct { + constexpr struct { template boost::sub_range operator()(validation_result, boost::sub_range const& source, codepoint&) const { return { std::next(boost::begin(source)), boost::end(source) }; } - } constexpr ignore_errors = {}; + } ignore_errors = {}; + + constexpr struct skip_validation_t {} skip_validation = {}; } // namespace ogonek #endif // OGONEK_VALIDATION_HPP diff --git a/test/encoding_scheme.c++ b/test/encoding_scheme.c++ index 54fa25e..bfd5bab 100644 --- a/test/encoding_scheme.c++ +++ b/test/encoding_scheme.c++ @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -23,7 +24,7 @@ TEST_CASE("utf16le", "UTF-16LE codec") { SECTION("encode", "Encoding UTF-16LE") { auto decoded = { 0x0041_u, 0x00C5_u, 0x1EA0_u, 0x1F4A9_u }; - auto range = ogonek::utf16le::encode(decoded); + auto range = ogonek::utf16le::encode(decoded, ogonek::skip_validation); std::vector encoded(boost::begin(range), boost::end(range)); REQUIRE(encoded.size() == 10); CHECK(encoded[0] == 0x41_b); @@ -40,7 +41,7 @@ TEST_CASE("utf16le", "UTF-16LE codec") { SECTION("decode", "Decoding UTF-16LE") { auto encoded = { 0x41_b, 0x00_b, 0xC5_b, 0x00_b, 0xA0_b, 0x1E_b, 0x3D_b, 0xD8_b, 0xA9_b, 0xDC_b }; - auto range = ogonek::utf16le::decode(encoded); + auto range = ogonek::utf16le::decode(encoded, ogonek::skip_validation); std::vector decoded(boost::begin(range), boost::end(range)); CHECK(decoded.size() == 4); CHECK(decoded[0] == 0x0041_u); @@ -55,7 +56,7 @@ TEST_CASE("utf16be", "UTF-16BE codec") { SECTION("encode", "Encoding UTF-16BE") { auto decoded = { 0x0041_u, 0x00C5_u, 0x1EA0_u, 0x1F4A9_u }; - auto range = ogonek::utf16be::encode(decoded); + auto range = ogonek::utf16be::encode(decoded, ogonek::skip_validation); std::vector encoded(boost::begin(range), boost::end(range)); REQUIRE(encoded.size() == 10); CHECK(encoded[0] == 0x00_b); @@ -72,7 +73,7 @@ TEST_CASE("utf16be", "UTF-16BE codec") { SECTION("decode", "Decoding UTF-16BE") { auto encoded = { 0x00_b, 0x41_b, 0x00_b, 0xC5_b, 0x1E_b, 0xA0_b, 0xD8_b, 0x3D_b, 0xDC_b, 0xA9_b }; - auto range = ogonek::utf16be::decode(encoded); + auto range = ogonek::utf16be::decode(encoded, ogonek::skip_validation); std::vector decoded(boost::begin(range), boost::end(range)); REQUIRE(decoded.size() == 4); CHECK(decoded[0] == 0x0041_u); @@ -87,7 +88,7 @@ TEST_CASE("utf32le", "UTF-32LE codec") { SECTION("encode", "Encoding UTF-32LE") { auto decoded = { 0x0041_u, 0x00C5_u, 0x1EA0_u, 0x1F4A9_u }; - auto range = ogonek::utf32le::encode(decoded); + auto range = ogonek::utf32le::encode(decoded, ogonek::skip_validation); std::vector encoded(boost::begin(range), boost::end(range)); REQUIRE(encoded.size() == 16); CHECK(encoded[0] == 0x41_b); @@ -112,7 +113,7 @@ TEST_CASE("utf32le", "UTF-32LE codec") { 0xC5_b, 0x00_b, 0x00_b, 0x00_b, 0xA0_b, 0x1E_b, 0x00_b, 0x00_b, 0xA9_b, 0xF4_b, 0x01_b, 0x00_b }; - auto range = ogonek::utf32le::decode(encoded); + auto range = ogonek::utf32le::decode(encoded, ogonek::skip_validation); std::vector decoded(boost::begin(range), boost::end(range)); REQUIRE(decoded.size() == 4); CHECK(decoded[0] == 0x0041_u); @@ -128,7 +129,7 @@ TEST_CASE("utf32be", "UTF-32BE codec") { SECTION("encode", "Encoding UTF-32BE") { auto decoded = { 0x0041_u, 0x00C5_u, 0x1EA0_u, 0x1F4A9_u }; - auto range = ogonek::utf32be::encode(decoded); + auto range = ogonek::utf32be::encode(decoded, ogonek::skip_validation); std::vector encoded(boost::begin(range), boost::end(range)); REQUIRE(encoded.size() == 16); CHECK(encoded[0] == 0x00_b); @@ -153,7 +154,7 @@ TEST_CASE("utf32be", "UTF-32BE codec") { 0x00_b, 0x00_b, 0x00_b, 0xC5_b, 0x00_b, 0x00_b, 0x1E_b, 0xA0_b, 0x00_b, 0x01_b, 0xF4_b, 0xA9_b }; - auto range = ogonek::utf32be::decode(encoded); + auto range = ogonek::utf32be::decode(encoded, ogonek::skip_validation); std::vector decoded(boost::begin(range), boost::end(range)); REQUIRE(decoded.size() == 4); CHECK(decoded[0] == 0x0041_u); diff --git a/test/text.c++ b/test/text.c++ index 18588ea..7a4761e 100644 --- a/test/text.c++ +++ b/test/text.c++ @@ -48,7 +48,10 @@ TEST_CASE("text", "text tests") { text16 h { text8 { U"blah\U0001F4A9" } }; REQUIRE(h.storage() == u"blah\U0001F4A9"); - //REQUIRE_THROWS_AS(text16 { U"blah\x200000" }, ogonek::validation_error); + text16 i { std::u16string { u"blah\U0001F4A9" } }; + REQUIRE(d.storage() == u"blah\U0001F4A9"); + + REQUIRE_THROWS_AS(text16 { U"blah\x200000" }, ogonek::validation_error); } } diff --git a/test/utf.c++ b/test/utf.c++ index f2d1f97..0b0e64f 100644 --- a/test/utf.c++ +++ b/test/utf.c++ @@ -27,7 +27,7 @@ TEST_CASE("utf8", "UTF-8 encoding form") { SECTION("encode", "Encoding UTF-8") { auto decoded = { 0x0041_u, 0x00C5_u, 0x1EA0_u, 0x1F4A9_u }; - auto range = ogonek::utf8::encode(decoded); + auto range = ogonek::utf8::encode(decoded, ogonek::skip_validation); std::vector encoded(boost::begin(range), boost::end(range)); REQUIRE(encoded.size() == 10); CHECK(encoded[0] == 0x41_b); @@ -44,7 +44,7 @@ TEST_CASE("utf8", "UTF-8 encoding form") { SECTION("decode", "Decoding UTF-8") { auto encoded = { 0x41_b, 0xC3_b, 0x85_b, 0xE1_b, 0xBA_b, 0xA0_b, 0xF0_b, 0x9F_b, 0x92_b, 0xA9_b }; - auto range = ogonek::utf8::decode(encoded); + auto range = ogonek::utf8::decode(encoded, ogonek::skip_validation); std::vector decoded(boost::begin(range), boost::end(range)); REQUIRE(decoded.size() == 4); CHECK(decoded[0] == U'\x0041'); @@ -59,7 +59,7 @@ TEST_CASE("utf16", "UTF-16 encoding form") { SECTION("encode", "Encoding UTF-16") { auto decoded = { 0x0041_u, 0x00C5_u, 0x1EA0_u, 0x1F4A9_u }; - auto range = ogonek::utf16::encode(decoded); + auto range = ogonek::utf16::encode(decoded, ogonek::skip_validation); std::vector encoded(boost::begin(range), boost::end(range)); REQUIRE(encoded.size() == 5); CHECK(encoded[0] == u'\x0041'); @@ -70,7 +70,7 @@ TEST_CASE("utf16", "UTF-16 encoding form") { } SECTION("decode", "Decoding UTF-16") { std::initializer_list encoded = { 0x0041, 0x00C5, 0x1EA0, 0xD83D, 0xDCA9 }; - auto range = ogonek::utf16::decode(encoded); + auto range = ogonek::utf16::decode(encoded, ogonek::skip_validation); std::vector decoded(boost::begin(range), boost::end(range)); REQUIRE(decoded.size() == 4); CHECK(decoded[0] == U'\x0041'); @@ -85,7 +85,7 @@ TEST_CASE("utf32", "UTF-32 encoding form") { SECTION("encode", "Encoding UTF-32") { auto decoded = { 0x0041_u, 0x00C5_u, 0x1EA0_u, 0x1F4A9_u }; - auto range = ogonek::utf32::encode(decoded); + auto range = ogonek::utf32::encode(decoded, ogonek::skip_validation); std::vector encoded(boost::begin(range), boost::end(range)); REQUIRE(encoded.size() == 4); CHECK(encoded[0] == 0x0041); @@ -95,7 +95,7 @@ TEST_CASE("utf32", "UTF-32 encoding form") { } SECTION("decode", "Decoding UTF-32") { std::initializer_list encoded = { 0x0041, 0x00C5, 0x1EA0, 0x1F4A9 }; - auto range = ogonek::utf32::decode(encoded); + auto range = ogonek::utf32::decode(encoded, ogonek::skip_validation); std::vector decoded(boost::begin(range), boost::end(range)); REQUIRE(decoded.size() == 4); CHECK(decoded[0] == 0x0041_u);