Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Start work on String#byteindex #2172

Merged
merged 2 commits into from
Jun 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions include/natalie/env.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,16 @@ class Env : public Cell {
[[noreturn]] void raise_name_error(StringObject *name, String);
[[noreturn]] void raise_not_comparable_error(Value lhs, Value rhs);

// Old error message style, e.g.:
// - no implicit conversion from nil to string
// - no implicit conversion of Integer into String
[[noreturn]] void raise_type_error(const Object *obj, const char *expected);

// New error message style, e.g.:
// - no implicit conversion of nil into String
// - no implicit conversion of Integer into String
[[noreturn]] void raise_type_error2(const Object *obj, const char *expected);

template <typename... Args>
[[noreturn]] void raise_name_error(SymbolObject *name, const char *format, Args... args) {
auto message = String::format(format, args...);
Expand Down
9 changes: 9 additions & 0 deletions include/natalie/object.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -376,8 +376,17 @@ class Object : public Cell {
IoObject *to_io(Env *env);
IntegerObject *to_int(Env *env);
StringObject *to_s(Env *env);

// Old error message style, e.g.:
// - no implicit conversion from nil to string
// - no implicit conversion of Integer into String
StringObject *to_str(Env *env);

// New error message style, e.g.:
// - no implicit conversion of nil into String
// - no implicit conversion of Integer into String
StringObject *to_str2(Env *env);

protected:
ClassObject *m_klass { nullptr };

Expand Down
4 changes: 3 additions & 1 deletion include/natalie/string_object.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,8 @@ class StringObject : public Object {
StringObject *successive(Env *);
StringObject *successive_in_place(Env *);

Value byteindex(Env *, Value, Value = nullptr) const;

Value index(Env *, Value, Value);
Value index(Env *, Value, size_t start);
nat_int_t index_int(Env *, Value, size_t byte_start);
Expand Down Expand Up @@ -336,7 +338,7 @@ class StringObject : public Object {
Value delete_suffix(Env *, Value);
Value delete_suffix_in_place(Env *, Value);
StringObject *downcase(Env *, Value, Value);
Value downcase_in_place(Env *, Value, Value);
Value downcase_in_place(Env *, Value = nullptr, Value = nullptr);
Value dump(Env *);
Value each_byte(Env *, Block *);
Value encode(Env *, Value = nullptr, Value = nullptr, HashObject * = nullptr);
Expand Down
1 change: 1 addition & 0 deletions lib/natalie/compiler/binding_gen.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1241,6 +1241,7 @@ def generate_name
gen.binding('String', 'ascii_only?', 'StringObject', 'is_ascii_only', argc: 0, pass_env: false, pass_block: false, return_type: :bool)
gen.binding('String', 'b', 'StringObject', 'b', argc: 0, pass_env: true, pass_block: false, return_type: :Object)
gen.binding('String', 'bytes', 'StringObject', 'bytes', argc: 0, pass_env: true, pass_block: true, return_type: :Object)
gen.binding('String', 'byteindex', 'StringObject', 'byteindex', argc: 1..2, pass_env: true, pass_block: false, return_type: :Object)
gen.binding('String', 'byteslice', 'StringObject', 'byteslice', argc: 1..2, pass_env: true, pass_block: false, return_type: :Object)
gen.binding('String', 'bytesize', 'StringObject', 'bytesize', argc: 0, pass_env: false, pass_block: false, return_type: :size_t)
gen.binding('String', 'capitalize', 'StringObject', 'capitalize', argc: 0..2, pass_env: true, pass_block: false, return_type: :Object)
Expand Down
324 changes: 324 additions & 0 deletions spec/core/string/byteindex_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,324 @@
# -*- encoding: utf-8 -*-
require_relative '../../spec_helper'
require_relative 'fixtures/classes'
require_relative 'shared/byte_index_common.rb'

# General String#byteindex examples: argument coercion, empty-needle
# behaviour at byte boundaries, and the shared :byte_index_common examples.
describe "String#byteindex" do
ruby_version_is "3.2" do
# The needle does not have to be a String; a mock responding to #to_str
# is accepted and its return value ("b") is what gets searched for.
it "calls #to_str to convert the first argument" do
char = mock("string index char")
char.should_receive(:to_str).and_return("b")
"abc".byteindex(char).should == 1
end

# The offset is likewise coerced: the mock's #to_int result (1) is used
# as the starting byte offset.
it "calls #to_int to convert the second argument" do
offset = mock("string index offset")
offset.should_receive(:to_int).and_return(1)
"abc".byteindex("c", offset).should == 2
end

# "わ" is a single multibyte character; offsets 0 and 3 are accepted here
# (start of the string and its end), and the empty needle is found at
# exactly the offset given.
it "does not raise IndexError when byte offset is correct or on string boundary" do
"わ".byteindex("").should == 0
"わ".byteindex("", 0).should == 0
"わ".byteindex("", 3).should == 3
end

# Shared examples are loaded from shared/byte_index_common.rb and run
# against the :byteindex method.
it_behaves_like :byte_index_common, :byteindex
end
end

# Examples for String#byteindex when the needle is a String (or a String
# subclass). All expected values are byte offsets, not character offsets.
describe "String#byteindex with String" do
ruby_version_is "3.2" do
it "behaves the same as String#byteindex(char) for one-character strings" do
# split("") yields one-character strings, so `str` and `chr` hold the
# same content; the results must agree for every offset tried below.
"blablabla hello cruel world...!".split("").uniq.each do |str|
chr = str[0]
str.byteindex(str).should == str.byteindex(chr)

# Non-negative offsets, deliberately running past the end of the string.
0.upto(str.size + 1) do |start|
str.byteindex(str, start).should == str.byteindex(chr, start)
end

# Negative offsets, deliberately running past the start of the string.
(-str.size - 1).upto(-1) do |start|
str.byteindex(str, start).should == str.byteindex(chr, start)
end
end
end

it "returns the byteindex of the first occurrence of the given substring" do
# Needles that first occur at byte 0 (including the empty needle).
"blablabla".byteindex("").should == 0
"blablabla".byteindex("b").should == 0
"blablabla".byteindex("bla").should == 0
"blablabla".byteindex("blabla").should == 0
"blablabla".byteindex("blablabla").should == 0

# Needles that first occur at byte 1.
"blablabla".byteindex("l").should == 1
"blablabla".byteindex("la").should == 1
"blablabla".byteindex("labla").should == 1
"blablabla".byteindex("lablabla").should == 1

# Needles that first occur at byte 2.
"blablabla".byteindex("a").should == 2
"blablabla".byteindex("abla").should == 2
"blablabla".byteindex("ablabla").should == 2
end

it "treats the offset as a byteindex" do
"aaaaa".byteindex("a", 0).should == 0
"aaaaa".byteindex("a", 2).should == 2
"aaaaa".byteindex("a", 4).should == 4
end

# A String subclass (StringSpecs::MyString, from fixtures/classes) works as
# needle, as receiver, or as both, with the same results as a plain String.
it "ignores string subclasses" do
"blablabla".byteindex(StringSpecs::MyString.new("bla")).should == 0
StringSpecs::MyString.new("blablabla").byteindex("bla").should == 0
StringSpecs::MyString.new("blablabla").byteindex(StringSpecs::MyString.new("bla")).should == 0
end

# Each group below fixes one needle and advances the start offset; the
# expected value is the first occurrence at or after that offset.
it "starts the search at the given offset" do
"blablabla".byteindex("bl", 0).should == 0
"blablabla".byteindex("bl", 1).should == 3
"blablabla".byteindex("bl", 2).should == 3
"blablabla".byteindex("bl", 3).should == 3

"blablabla".byteindex("bla", 0).should == 0
"blablabla".byteindex("bla", 1).should == 3
"blablabla".byteindex("bla", 2).should == 3
"blablabla".byteindex("bla", 3).should == 3

"blablabla".byteindex("blab", 0).should == 0
"blablabla".byteindex("blab", 1).should == 3
"blablabla".byteindex("blab", 2).should == 3
"blablabla".byteindex("blab", 3).should == 3

"blablabla".byteindex("la", 1).should == 1
"blablabla".byteindex("la", 2).should == 4
"blablabla".byteindex("la", 3).should == 4
"blablabla".byteindex("la", 4).should == 4

"blablabla".byteindex("lab", 1).should == 1
"blablabla".byteindex("lab", 2).should == 4
"blablabla".byteindex("lab", 3).should == 4
"blablabla".byteindex("lab", 4).should == 4

"blablabla".byteindex("ab", 2).should == 2
"blablabla".byteindex("ab", 3).should == 5
"blablabla".byteindex("ab", 4).should == 5
"blablabla".byteindex("ab", 5).should == 5

# The empty needle matches at the offset itself, up to and including the
# end of the 9-byte string.
"blablabla".byteindex("", 0).should == 0
"blablabla".byteindex("", 1).should == 1
"blablabla".byteindex("", 2).should == 2
"blablabla".byteindex("", 7).should == 7
"blablabla".byteindex("", 8).should == 8
"blablabla".byteindex("", 9).should == 9
end

# A negative offset must give the same result as the equivalent
# non-negative one (offset + str.length). The string here is ASCII-only,
# so its length in characters equals its length in bytes.
it "starts the search at offset + self.length if offset is negative" do
str = "blablabla"

["bl", "bla", "blab", "la", "lab", "ab", ""].each do |needle|
(-str.length .. -1).each do |offset|
str.byteindex(needle, offset).should ==
str.byteindex(needle, offset + str.length)
end
end
end

it "returns nil if the substring isn't found" do
# The search is case-sensitive, and a needle longer than the receiver
# cannot match.
"blablabla".byteindex("B").should == nil
"blablabla".byteindex("z").should == nil
"blablabla".byteindex("BLA").should == nil
"blablabla".byteindex("blablablabla").should == nil
# Even the empty needle is not found once the offset is past the end.
"blablabla".byteindex("", 10).should == nil

# The only occurrence of "he" lies before the start offset.
"hello".byteindex("he", 1).should == nil
"hello".byteindex("he", 2).should == nil
"I’ve got a multibyte character.\n".byteindex("\n\n").should == nil
end

# "が" is the third character; the expected result 6 is its byte offset,
# not its character index (2).
it "returns the character byteindex of a multibyte character" do
"ありがとう".byteindex("が").should == 6
end

it "returns the character byteindex after offset" do
"われわれ".byteindex("わ", 3).should == 6
"ありがとうありがとう".byteindex("が", 9).should == 21
end

# The needle "</h" starts to match at byte 0 ("</") but fails on the third
# byte; the real match begins at byte 2.
it "returns the character byteindex after a partial first match" do
"</</h".byteindex("</h").should == 2
end

it "raises an Encoding::CompatibilityError if the encodings are incompatible" do
char = "れ".encode Encoding::EUC_JP
-> do
"あれ".byteindex(char)
end.should raise_error(Encoding::CompatibilityError)
end

# A US-ASCII receiver searched with a non-ASCII needle yields nil rather
# than raising.
it "handles a substring in a superset encoding" do
'abc'.dup.force_encoding(Encoding::US_ASCII).byteindex('é').should == nil
end

# A US-ASCII needle is found in a non-ASCII receiver; the expected offset
# is 2 because the leading 'é' is counted in bytes.
it "handles a substring in a subset encoding" do
'été'.byteindex('t'.dup.force_encoding(Encoding::US_ASCII)).should == 2
end
end
end

# Examples for String#byteindex when the needle is a Regexp.
#
# Almost every example body is wrapped in
# `NATFIXME 'Support Regexp', exception: TypeError`, i.e. the body is
# currently expected to raise TypeError (presumably until Regexp needles
# are implemented — confirm against the NATFIXME helper's definition).
describe "String#byteindex with Regexp" do
ruby_version_is "3.2" do
it "behaves the same as String#byteindex(string) for escaped string regexps" do
NATFIXME 'Support Regexp', exception: TypeError do
["blablabla", "hello cruel world...!"].each do |str|
["", "b", "bla", "lab", "o c", "d."].each do |needle|
# Regexp.escape makes the pattern match the needle literally, so the
# Regexp and String forms must return the same offsets.
regexp = Regexp.new(Regexp.escape(needle))
str.byteindex(regexp).should == str.byteindex(needle)

# Non-negative offsets, deliberately running past the end of the string.
0.upto(str.size + 1) do |start|
str.byteindex(regexp, start).should == str.byteindex(needle, start)
end

# Negative offsets, deliberately running past the start of the string.
(-str.size - 1).upto(-1) do |start|
str.byteindex(regexp, start).should == str.byteindex(needle, start)
end
end
end
end
end

it "returns the byteindex of the first match of regexp" do
NATFIXME 'Support Regexp', exception: TypeError do
"blablabla".byteindex(/bla/).should == 0
"blablabla".byteindex(/BLA/i).should == 0

"blablabla".byteindex(/.{0}/).should == 0
"blablabla".byteindex(/.{6}/).should == 0
"blablabla".byteindex(/.{9}/).should == 0

"blablabla".byteindex(/.*/).should == 0
"blablabla".byteindex(/.+/).should == 0

"blablabla".byteindex(/lab|b/).should == 0

# String anchors: \A at the start, \Z before an optional trailing
# newline, \z at the very end.
not_supported_on :opal do
"blablabla".byteindex(/\A/).should == 0
"blablabla".byteindex(/\Z/).should == 9
"blablabla".byteindex(/\z/).should == 9
"blablabla\n".byteindex(/\Z/).should == 9
"blablabla\n".byteindex(/\z/).should == 10
end

# Line anchors: ^ and $ match around newlines inside the string.
"blablabla".byteindex(/^/).should == 0
"\nblablabla".byteindex(/^/).should == 0
"b\nablabla".byteindex(/$/).should == 1
"bl\nablabla".byteindex(/$/).should == 2

"blablabla".byteindex(/.l./).should == 0
end
end

it "starts the search at the given offset" do
NATFIXME 'Support Regexp', exception: TypeError do
# Patterns that can match at any position match at the offset itself.
"blablabla".byteindex(/.{0}/, 5).should == 5
"blablabla".byteindex(/.{1}/, 5).should == 5
"blablabla".byteindex(/.{2}/, 5).should == 5
"blablabla".byteindex(/.{3}/, 5).should == 5
"blablabla".byteindex(/.{4}/, 5).should == 5

"blablabla".byteindex(/.{0}/, 3).should == 3
"blablabla".byteindex(/.{1}/, 3).should == 3
"blablabla".byteindex(/.{2}/, 3).should == 3
"blablabla".byteindex(/.{5}/, 3).should == 3
"blablabla".byteindex(/.{6}/, 3).should == 3

# Patterns tied to specific characters: the result is the first match at
# or after the offset.
"blablabla".byteindex(/.l./, 0).should == 0
"blablabla".byteindex(/.l./, 1).should == 3
"blablabla".byteindex(/.l./, 2).should == 3
"blablabla".byteindex(/.l./, 3).should == 3

"xblaxbla".byteindex(/x./, 0).should == 0
"xblaxbla".byteindex(/x./, 1).should == 4
"xblaxbla".byteindex(/x./, 2).should == 4

not_supported_on :opal do
"blablabla\n".byteindex(/\Z/, 9).should == 9
end
end
end

# NOTE(review): although this example sits in the Regexp group, every
# needle below is a String, which is why it is the only example here not
# wrapped in NATFIXME. It duplicates the equivalent example in the
# "with String" group — confirm whether Regexp needles were intended.
it "starts the search at offset + self.length if offset is negative" do
str = "blablabla"

["bl", "bla", "blab", "la", "lab", "ab", ""].each do |needle|
(-str.length .. -1).each do |offset|
str.byteindex(needle, offset).should ==
str.byteindex(needle, offset + str.length)
end
end
end

it "returns nil if the substring isn't found" do
NATFIXME 'Support Regexp', exception: TypeError do
"blablabla".byteindex(/BLA/).should == nil

# /.{10}/ needs ten characters but the string has only nine.
"blablabla".byteindex(/.{10}/).should == nil
# The only possible matches begin before the given offsets.
"blaxbla".byteindex(/.x/, 3).should == nil
"blaxbla".byteindex(/..x/, 2).should == nil
end
end

it "returns nil if the Regexp matches the empty string and the offset is out of range" do
NATFIXME 'Support Regexp', exception: TypeError do
"ruby".byteindex(//, 12).should be_nil
end
end

it "supports \\G which matches at the given start offset" do
NATFIXME 'Support Regexp', exception: TypeError do
"helloYOU.".byteindex(/\GYOU/, 5).should == 5
# Without an explicit offset the search starts at 0, where "YOU" is absent.
"helloYOU.".byteindex(/\GYOU/).should == nil

re = /\G.+YOU/
# The # marks where \G will match.
# Each entry is [marked string, expected result]; the last entry expects
# nil because `.+` needs at least one character between \G and "YOU".
[
["#hi!YOUall.", 0],
["h#i!YOUall.", 1],
["hi#!YOUall.", 2],
["hi!#YOUall.", nil]
].each do |spec|

# The marker's byte position becomes the start offset; the marker is then
# removed from the string that is actually searched.
start = spec[0].byteindex("#")
str = spec[0].delete("#")

str.byteindex(re, start).should == spec[1]
end
end
end

it "converts start_offset to an integer via to_int" do
NATFIXME 'Support Regexp', exception: TypeError do
obj = mock('1')
obj.should_receive(:to_int).and_return(1)
# Starting at byte 1 skips the "RW" at byte 0, so the match is the "RW"
# at byte 4.
"RWOARW".byteindex(/R./, obj).should == 4
end
end

# As in the String group, expected values are byte offsets of the matched
# multibyte character, not character indexes.
it "returns the character byteindex of a multibyte character" do
NATFIXME 'Support Regexp', exception: TypeError do
"ありがとう".byteindex(/が/).should == 6
end
end

it "returns the character byteindex after offset" do
NATFIXME 'Support Regexp', exception: TypeError do
"われわれ".byteindex(/わ/, 3).should == 6
end
end

# Offset 6 is the byte position of the third character (the second "わ"),
# so the match is found at the offset itself.
it "treats the offset as a byteindex" do
NATFIXME 'Support Regexp', exception: TypeError do
"われわわれ".byteindex(/わ/, 6).should == 6
end
end
end
end
Loading
Loading