diff --git a/include/natalie/regexp_object.hpp b/include/natalie/regexp_object.hpp index a251895305..18d7e6dfc4 100644 --- a/include/natalie/regexp_object.hpp +++ b/include/natalie/regexp_object.hpp @@ -139,7 +139,7 @@ class RegexpObject : public Object { return m_options & RegexOpts::IgnoreCase; } - int search(Env *env, const StringObject *string_obj, int start, OnigRegion *region, OnigOptionType options); + long search(Env *env, const StringObject *string_obj, long start, OnigRegion *region, OnigOptionType options, bool reverse = false); bool eq(Env *env, Value other) const { assert_initialized(env); @@ -163,6 +163,10 @@ class RegexpObject : public Object { return this->send(env, "=~"_s, { env->global_get("$_"_s) }); } + bool is_fixed_encoding() const { + return m_options & RegexOpts::FixedEncoding; + } + bool has_match(Env *env, Value, Value); Value initialize(Env *, Value, Value); Value inspect(Env *env); diff --git a/include/natalie/string_object.hpp b/include/natalie/string_object.hpp index d7c4c494ae..338d246663 100644 --- a/include/natalie/string_object.hpp +++ b/include/natalie/string_object.hpp @@ -278,6 +278,7 @@ class StringObject : public Object { StringObject *successive_in_place(Env *); Value byteindex(Env *, Value, Value = nullptr) const; + Value byterindex(Env *, Value, Value = nullptr) const; Value index(Env *, Value, Value); Value index(Env *, Value, size_t start); diff --git a/lib/natalie/compiler/binding_gen.rb b/lib/natalie/compiler/binding_gen.rb index 43a44586eb..1b47e625b0 100644 --- a/lib/natalie/compiler/binding_gen.rb +++ b/lib/natalie/compiler/binding_gen.rb @@ -1242,6 +1242,7 @@ def generate_name gen.binding('String', 'b', 'StringObject', 'b', argc: 0, pass_env: true, pass_block: false, return_type: :Object) gen.binding('String', 'bytes', 'StringObject', 'bytes', argc: 0, pass_env: true, pass_block: true, return_type: :Object) gen.binding('String', 'byteindex', 'StringObject', 'byteindex', argc: 1..2, pass_env: true, pass_block: false, return_type: :Object) +gen.binding('String', 'byterindex', 'StringObject', 'byterindex', argc: 1..2, pass_env: true, pass_block: false, return_type: :Object) gen.binding('String', 'byteslice', 'StringObject', 'byteslice', argc: 1..2, pass_env: true, pass_block: false, return_type: :Object) gen.binding('String', 'bytesize', 'StringObject', 'bytesize', argc: 0, pass_env: false, pass_block: false, return_type: :size_t) gen.binding('String', 'capitalize', 'StringObject', 'capitalize', argc: 0..2, pass_env: true, pass_block: false, return_type: :Object) diff --git a/spec/core/string/byterindex_spec.rb b/spec/core/string/byterindex_spec.rb new file mode 100644 index 0000000000..10392b57bb --- /dev/null +++ b/spec/core/string/byterindex_spec.rb @@ -0,0 +1,366 @@ +# -*- encoding: utf-8 -*- +require_relative '../../spec_helper' +require_relative 'fixtures/classes' +require_relative 'shared/byte_index_common.rb' + +describe "String#byterindex with object" do + ruby_version_is "3.2" do + it "tries to convert obj to a string via to_str" do + obj = mock('lo') + def obj.to_str() "lo" end + "hello".byterindex(obj).should == "hello".byterindex("lo") + + obj = mock('o') + def obj.respond_to?(arg, *) true end + def obj.method_missing(*args) "o" end + "hello".byterindex(obj).should == "hello".byterindex("o") + end + + it "calls #to_int to convert the second argument" do + offset = mock("string index offset") + offset.should_receive(:to_int).and_return(3) + "abc".byterindex("c", offset).should == 2 + end + + it "does not raise IndexError when byte offset is correct or on string boundary" do + "わ".byterindex("", 0).should == 0 + "わ".byterindex("", 3).should == 3 + "わ".byterindex("").should == 3 + end + + it_behaves_like :byte_index_common, :byterindex + end +end + +describe "String#byterindex with String" do + ruby_version_is "3.2" do + it "behaves the same as String#byterindex(char) for one-character strings" do + "blablabla hello cruel world...!".split("").uniq.each do |str| + chr = str[0] + str.byterindex(str).should == str.byterindex(chr) + + 0.upto(str.size + 1) do |start| + str.byterindex(str, start).should == str.byterindex(chr, start) + end + + (-str.size - 1).upto(-1) do |start| + str.byterindex(str, start).should == str.byterindex(chr, start) + end + end + end + + it "behaves the same as String#byterindex(?char) for one-character strings" do + "blablabla hello cruel world...!".split("").uniq.each do |str| + # NATFIXME: dynamic eval + # chr = str[0] =~ / / ? str[0] : eval("?#{str[0]}") + chr = str[0] + str.byterindex(str).should == str.byterindex(chr) + + 0.upto(str.size + 1) do |start| + str.byterindex(str, start).should == str.byterindex(chr, start) + end + + (-str.size - 1).upto(-1) do |start| + str.byterindex(str, start).should == str.byterindex(chr, start) + end + end + end + + it "returns the index of the last occurrence of the given substring" do + "blablabla".byterindex("").should == 9 + "blablabla".byterindex("a").should == 8 + "blablabla".byterindex("la").should == 7 + "blablabla".byterindex("bla").should == 6 + "blablabla".byterindex("abla").should == 5 + "blablabla".byterindex("labla").should == 4 + "blablabla".byterindex("blabla").should == 3 + "blablabla".byterindex("ablabla").should == 2 + "blablabla".byterindex("lablabla").should == 1 + "blablabla".byterindex("blablabla").should == 0 + + "blablabla".byterindex("l").should == 7 + "blablabla".byterindex("bl").should == 6 + "blablabla".byterindex("abl").should == 5 + "blablabla".byterindex("labl").should == 4 + "blablabla".byterindex("blabl").should == 3 + "blablabla".byterindex("ablabl").should == 2 + "blablabla".byterindex("lablabl").should == 1 + "blablabla".byterindex("blablabl").should == 0 + + "blablabla".byterindex("b").should == 6 + "blablabla".byterindex("ab").should == 5 + "blablabla".byterindex("lab").should == 4 + "blablabla".byterindex("blab").should == 3 + "blablabla".byterindex("ablab").should == 2 + "blablabla".byterindex("lablab").should == 1 + "blablabla".byterindex("blablab").should == 0 + end + + it "ignores string subclasses" do + "blablabla".byterindex(StringSpecs::MyString.new("bla")).should == 6 + StringSpecs::MyString.new("blablabla").byterindex("bla").should == 6 + StringSpecs::MyString.new("blablabla").byterindex(StringSpecs::MyString.new("bla")).should == 6 + end + + it "starts the search at the given offset" do + "blablabla".byterindex("bl", 0).should == 0 + "blablabla".byterindex("bl", 1).should == 0 + "blablabla".byterindex("bl", 2).should == 0 + "blablabla".byterindex("bl", 3).should == 3 + + "blablabla".byterindex("bla", 0).should == 0 + "blablabla".byterindex("bla", 1).should == 0 + "blablabla".byterindex("bla", 2).should == 0 + "blablabla".byterindex("bla", 3).should == 3 + + "blablabla".byterindex("blab", 0).should == 0 + "blablabla".byterindex("blab", 1).should == 0 + "blablabla".byterindex("blab", 2).should == 0 + "blablabla".byterindex("blab", 3).should == 3 + "blablabla".byterindex("blab", 6).should == 3 + "blablablax".byterindex("blab", 6).should == 3 + + "blablabla".byterindex("la", 1).should == 1 + "blablabla".byterindex("la", 2).should == 1 + "blablabla".byterindex("la", 3).should == 1 + "blablabla".byterindex("la", 4).should == 4 + + "blablabla".byterindex("lab", 1).should == 1 + "blablabla".byterindex("lab", 2).should == 1 + "blablabla".byterindex("lab", 3).should == 1 + "blablabla".byterindex("lab", 4).should == 4 + + "blablabla".byterindex("ab", 2).should == 2 + "blablabla".byterindex("ab", 3).should == 2 + "blablabla".byterindex("ab", 4).should == 2 + "blablabla".byterindex("ab", 5).should == 5 + + "blablabla".byterindex("", 0).should == 0 + "blablabla".byterindex("", 1).should == 1 + "blablabla".byterindex("", 2).should == 2 + "blablabla".byterindex("", 7).should == 7 + "blablabla".byterindex("", 8).should == 8 + "blablabla".byterindex("", 9).should == 9 + "blablabla".byterindex("", 10).should == 9 + end + + it "starts the search at offset + self.length if offset is negative" do + str = "blablabla" + + ["bl", "bla", "blab", "la", "lab", "ab", ""].each do |needle| + (-str.length .. -1).each do |offset| + str.byterindex(needle, offset).should == + str.byterindex(needle, offset + str.length) + end + end + end + + it "returns nil if the substring isn't found" do + "blablabla".byterindex("B").should == nil + "blablabla".byterindex("z").should == nil + "blablabla".byterindex("BLA").should == nil + "blablabla".byterindex("blablablabla").should == nil + + "hello".byterindex("lo", 0).should == nil + "hello".byterindex("lo", 1).should == nil + "hello".byterindex("lo", 2).should == nil + + "hello".byterindex("llo", 0).should == nil + "hello".byterindex("llo", 1).should == nil + + "hello".byterindex("el", 0).should == nil + "hello".byterindex("ello", 0).should == nil + + "hello".byterindex("", -6).should == nil + "hello".byterindex("", -7).should == nil + + "hello".byterindex("h", -6).should == nil + end + + it "tries to convert start_offset to an integer via to_int" do + obj = mock('5') + def obj.to_int() 5 end + "str".byterindex("st", obj).should == 0 + + obj = mock('5') + def obj.respond_to?(arg, *) true end + def obj.method_missing(*args) 5 end + "str".byterindex("st", obj).should == 0 + end + + it "raises a TypeError when given offset is nil" do + -> { "str".byterindex("st", nil) }.should raise_error(TypeError) + end + + it "handles a substring in a superset encoding" do + 'abc'.dup.force_encoding(Encoding::US_ASCII).byterindex('é').should == nil + end + + it "handles a substring in a subset encoding" do + 'été'.byterindex('t'.dup.force_encoding(Encoding::US_ASCII)).should == 2 + end + end +end + +describe "String#byterindex with Regexp" do + ruby_version_is "3.2" do + it "behaves the same as String#byterindex(string) for escaped string regexps" do + NATFIXME "I don't think this will work with upstream Onigmo.", exception: SpecFailedException do + ["blablabla", "hello cruel world...!"].each do |str| + ["", "b", "bla", "lab", "o c", "d."].each do |needle| + regexp = Regexp.new(Regexp.escape(needle)) + str.byterindex(regexp).should == str.byterindex(needle) + + 0.upto(str.size + 1) do |start| + str.byterindex(regexp, start).should == str.byterindex(needle, start) + end + + (-str.size - 1).upto(-1) do |start| + str.byterindex(regexp, start).should == str.byterindex(needle, start) + end + end + end + end + end + + it "returns the index of the first match from the end of string of regexp" do + "blablabla".byterindex(/bla/).should == 6 + "blablabla".byterindex(/BLA/i).should == 6 + + "blablabla".byterindex(/.{0}/).should == 9 + "blablabla".byterindex(/.{1}/).should == 8 + "blablabla".byterindex(/.{2}/).should == 7 + "blablabla".byterindex(/.{6}/).should == 3 + "blablabla".byterindex(/.{9}/).should == 0 + + "blablabla".byterindex(/.*/).should == 9 + "blablabla".byterindex(/.+/).should == 8 + + "blablabla".byterindex(/bla|a/).should == 8 + + not_supported_on :opal do + "blablabla".byterindex(/\A/).should == 0 + "blablabla".byterindex(/\Z/).should == 9 + "blablabla".byterindex(/\z/).should == 9 + "blablabla\n".byterindex(/\Z/).should == 10 + "blablabla\n".byterindex(/\z/).should == 10 + end + + "blablabla".byterindex(/^/).should == 0 + not_supported_on :opal do + "\nblablabla".byterindex(/^/).should == 1 + "b\nlablabla".byterindex(/^/).should == 2 + end + "blablabla".byterindex(/$/).should == 9 + + "blablabla".byterindex(/.l./).should == 6 + end + + it "starts the search at the given offset" do + NATFIXME 'Onigmo cannot handle this case. CRuby has forked Onigmo and the patches have not gone upstream.', exception: SpecFailedException do + "blablabla".byterindex(/.{1}/, 5).should == 5 + "blablabla".byterindex(/.{2}/, 5).should == 5 + "blablabla".byterindex(/.{3}/, 5).should == 5 + "blablabla".byterindex(/.{4}/, 5).should == 5 + + "blablabla".byterindex(/.{0}/, 3).should == 3 + "blablabla".byterindex(/.{1}/, 3).should == 3 + "blablabla".byterindex(/.{2}/, 3).should == 3 + "blablabla".byterindex(/.{5}/, 3).should == 3 + "blablabla".byterindex(/.{6}/, 3).should == 3 + + "blablabla".byterindex(/.l./, 0).should == 0 + "blablabla".byterindex(/.l./, 1).should == 0 + "blablabla".byterindex(/.l./, 2).should == 0 + "blablabla".byterindex(/.l./, 3).should == 3 + + "blablablax".byterindex(/.x/, 10).should == 8 + "blablablax".byterindex(/.x/, 9).should == 8 + "blablablax".byterindex(/.x/, 8).should == 8 + + "blablablax".byterindex(/..x/, 10).should == 7 + "blablablax".byterindex(/..x/, 9).should == 7 + "blablablax".byterindex(/..x/, 8).should == 7 + "blablablax".byterindex(/..x/, 7).should == 7 + + not_supported_on :opal do + "blablabla\n".byterindex(/\Z/, 9).should == 9 + end + end + end + + it "starts the search at offset + self.length if offset is negative" do + str = "blablabla" + + ["bl", "bla", "blab", "la", "lab", "ab", ""].each do |needle| + (-str.length .. -1).each do |offset| + str.byterindex(needle, offset).should == + str.byterindex(needle, offset + str.length) + end + end + end + + it "returns nil if the substring isn't found" do + "blablabla".byterindex(/BLA/).should == nil + "blablabla".byterindex(/.{10}/).should == nil + "blablablax".byterindex(/.x/, 7).should == nil + "blablablax".byterindex(/..x/, 6).should == nil + + not_supported_on :opal do + "blablabla".byterindex(/\Z/, 5).should == nil + "blablabla".byterindex(/\z/, 5).should == nil + "blablabla\n".byterindex(/\z/, 9).should == nil + end + end + + not_supported_on :opal do + it "supports \\G which matches at the given start offset" do + NATFIXME 'Add \G support', exception: SpecFailedException, message: /should be ==/ do + "helloYOU.".byterindex(/YOU\G/, 8).should == 5 + "helloYOU.".byterindex(/YOU\G/).should == nil + + idx = "helloYOUall!".index("YOU") + re = /YOU.+\G.+/ + # The # marks where \G will match. + [ + ["helloYOU#all.", nil], + ["helloYOUa#ll.", idx], + ["helloYOUal#l.", idx], + ["helloYOUall#.", idx], + ["helloYOUall.#", nil] + ].each do |i| + start = i[0].index("#") + str = i[0].delete("#") + + str.byterindex(re, start).should == i[1] + end + end + end + end + + it "tries to convert start_offset to an integer" do + obj = mock('5') + def obj.to_int() 5 end + "str".byterindex(/../, obj).should == 1 + + obj = mock('5') + def obj.respond_to?(arg, *) true end + def obj.method_missing(*args); 5; end + "str".byterindex(/../, obj).should == 1 + end + + it "raises a TypeError when given offset is nil" do + -> { "str".byterindex(/../, nil) }.should raise_error(TypeError) + end + + it "returns the reverse byte index of a multibyte character" do + "ありがりがとう".byterindex("が").should == 12 + "ありがりがとう".byterindex(/が/).should == 12 + end + + it "returns the byte index before the finish" do + "ありがりがとう".byterindex("が", 9).should == 6 + "ありがりがとう".byterindex(/が/, 9).should == 6 + end + end +end diff --git a/src/regexp_object.cpp b/src/regexp_object.cpp index c3be76f6d5..c89af799ac 100644 --- a/src/regexp_object.cpp +++ b/src/regexp_object.cpp @@ -497,16 +497,13 @@ bool RegexpObject::has_match(Env *env, Value other, Value start) { start_index += str_obj->length(); } - OnigRegion *region = onig_region_new(); - int result = search(env, str_obj, start_index, region, ONIG_OPTION_NONE); + int result = search(env, str_obj, start_index, nullptr, ONIG_OPTION_NONE); if (result >= 0) { return true; } else if (result == ONIG_MISMATCH) { - onig_region_free(region, true); return false; } else { - onig_region_free(region, true); OnigUChar s[ONIG_MAX_ERROR_MESSAGE_LEN]; onig_error_code_to_str(s, result); env->raise("RuntimeError", (char *)s); @@ -561,15 +558,14 @@ bool RegexpObject::operator==(const RegexpObject &other) const { return m_pattern->string() == other.m_pattern->string() && our_options == their_options; } -int RegexpObject::search(Env *env, const StringObject *string_obj, int start, OnigRegion *region, OnigOptionType options) { +long RegexpObject::search(Env *env, const StringObject *string_obj, long start, OnigRegion *region, OnigOptionType options, bool reverse) { auto string = string_obj->string(); const unsigned char *unsigned_str = (unsigned char *)string.c_str(); const unsigned char *char_end = unsigned_str + string.size(); const unsigned char *char_start = unsigned_str + start; - const unsigned char *char_range = char_end; + const unsigned char *char_range = reverse ? unsigned_str : char_end; - // FIXME: check if it's already FIXEDENCODING - if (string_obj->encoding() != encoding()) { + if (!is_fixed_encoding() && string_obj->encoding() != encoding()) { RegexpObject temp_regexp; temp_regexp.initialize_internal(env, m_pattern, m_options | RegexOpts::FixedEncoding); return onig_search(temp_regexp.m_regex, unsigned_str, char_end, char_start, char_range, region, options); diff --git a/src/string_object.cpp b/src/string_object.cpp index 0e1b68c1e9..f8f74f2e99 100644 --- a/src/string_object.cpp +++ b/src/string_object.cpp @@ -667,7 +667,7 @@ bool StringObject::end_with(Env *env, Args args) const { return false; } -static Value byteindex_regexp_needle(Env *env, const StringObject *haystack, RegexpObject *needle, size_t offset) { +static Value byteindex_regexp_needle(Env *env, const StringObject *haystack, RegexpObject *needle, OnigPosition offset, bool reverse = false) { if (!haystack->negotiate_compatible_encoding(needle->pattern())) { auto exception_class = fetch_nested_const({ "Encoding"_s, "CompatibilityError"_s })->as_class(); auto enc1 = needle->pattern()->encoding()->name()->string(); @@ -679,8 +679,11 @@ static Value byteindex_regexp_needle(Env *env, const StringObject *haystack, Reg return Value::integer(offset); OnigRegion *region = onig_region_new(); - int result = needle->search(env, haystack, offset, region, ONIG_OPTION_NONE); + + int result = needle->search(env, haystack, offset, region, ONIG_OPTION_NONE, reverse); + if (result == ONIG_MISMATCH) { + onig_region_free(region, true); env->caller()->set_last_match(nullptr); return NilObject::the(); } @@ -691,12 +694,17 @@ static Value byteindex_regexp_needle(Env *env, const StringObject *haystack, Reg return Value::integer(byte_index); } -static Value byteindex_string_needle(Env *env, const StringObject *haystack, StringObject *needle_obj, size_t offset) { +static Value byteindex_string_needle(Env *env, const StringObject *haystack, StringObject *needle_obj, size_t offset, bool reverse = false) { haystack->assert_compatible_string(env, needle_obj); String needle = needle_obj->string(); - if ((size_t)offset + needle.size() > haystack->bytesize()) - return NilObject::the(); + if (reverse) { + if ((size_t)offset > haystack->bytesize()) + return NilObject::the(); + } else { + if ((size_t)offset + needle.size() > haystack->bytesize()) + return NilObject::the(); + } if ((size_t)offset < haystack->bytesize()) { auto character_check = new StringObject { haystack->string().substring(offset, std::min(haystack->bytesize() - offset, (size_t)4)) }; @@ -709,10 +717,23 @@ static Value byteindex_string_needle(Env *env, const StringObject *haystack, Str if (needle.is_empty()) return Value::integer(offset); - if ((size_t)offset >= haystack->bytesize()) + if (!reverse && (size_t)offset >= haystack->bytesize()) return NilObject::the(); - auto pointer = memmem(haystack->c_str() + offset, haystack->bytesize() - offset, needle.c_str(), needle.size()); + void *pointer = nullptr; + if (reverse) { + if (offset + needle.size() >= haystack->bytesize()) + offset = haystack->bytesize() - needle.size(); + for (ssize_t i = offset; i >= 0; i--) { + if (memcmp(haystack->c_str() + i, needle.c_str(), needle.size()) == 0) { + pointer = (void *)(haystack->c_str() + i); + break; + } + } + } else { + pointer = memmem(haystack->c_str() + offset, haystack->bytesize() - offset, needle.c_str(), needle.size()); + } + if (!pointer) return NilObject::the(); @@ -736,6 +757,23 @@ Value StringObject::byteindex(Env *env, Value needle_obj, Value offset_obj) cons return byteindex_string_needle(env, this, needle, offset); } +Value StringObject::byterindex(Env *env, Value needle_obj, Value offset_obj) const { + ssize_t offset = bytesize(); + if (offset_obj) + offset = IntegerObject::convert_to_native_type(env, offset_obj); + if (offset < 0) + offset += bytesize(); + if (offset < 0) + return NilObject::the(); + offset = std::min((size_t)offset, bytesize()); + + if (needle_obj->is_regexp()) + return byteindex_regexp_needle(env, this, needle_obj->as_regexp(), offset, true); + + auto needle = needle_obj->to_str2(env); + return byteindex_string_needle(env, this, needle, offset, true); +} + Value StringObject::index(Env *env, Value needle, Value offset) { int offset_i = (offset) ? IntegerObject::convert_to_int(env, offset) : 0; int len = char_count(env); @@ -775,6 +813,7 @@ nat_int_t StringObject::index_int(Env *env, Value needle, size_t byte_start) { OnigRegion *region = onig_region_new(); int result = needle->as_regexp()->search(env, this, byte_start, region, ONIG_OPTION_NONE); if (result == ONIG_MISMATCH) { + onig_region_free(region, true); env->caller()->set_last_match(nullptr); return -1; }