Skip to content

Commit

Permalink
ARROW-9390: [C++][Followup] Add underscores to is* string functions
Browse files Browse the repository at this point in the history
Closes #7764 from wesm/cpp-compute-more-renaming

Authored-by: Wes McKinney <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
  • Loading branch information
wesm committed Jul 14, 2020
1 parent 3fc83c2 commit f131fe6
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 93 deletions.
44 changes: 22 additions & 22 deletions cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -944,33 +944,33 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry);
MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry);

AddUnaryStringPredicate<IsAscii>("string_isascii", registry);

AddUnaryStringPredicate<IsAlphaNumericAscii>("ascii_isalnum", registry);
AddUnaryStringPredicate<IsAlphaAscii>("ascii_isalpha", registry);
AddUnaryStringPredicate<IsDecimalAscii>("ascii_isdecimal", registry);
// no isdigic for ascii, since it is the same as isdecimal
AddUnaryStringPredicate<IsLowerAscii>("ascii_islower", registry);
// no isnumeric for ascii, since it is the same as isdecimal
AddUnaryStringPredicate<IsPrintableAscii>("ascii_isprintable", registry);
AddUnaryStringPredicate<IsSpaceAscii>("ascii_isspace", registry);
AddUnaryStringPredicate<IsTitleAscii>("ascii_istitle", registry);
AddUnaryStringPredicate<IsUpperAscii>("ascii_isupper", registry);
AddUnaryStringPredicate<IsAscii>("string_is_ascii", registry);

AddUnaryStringPredicate<IsAlphaNumericAscii>("ascii_is_alnum", registry);
AddUnaryStringPredicate<IsAlphaAscii>("ascii_is_alpha", registry);
AddUnaryStringPredicate<IsDecimalAscii>("ascii_is_decimal", registry);
// no is_digit for ascii, since it is the same as is_decimal
AddUnaryStringPredicate<IsLowerAscii>("ascii_is_lower", registry);
// no is_numeric for ascii, since it is the same as is_decimal
AddUnaryStringPredicate<IsPrintableAscii>("ascii_is_printable", registry);
AddUnaryStringPredicate<IsSpaceAscii>("ascii_is_space", registry);
AddUnaryStringPredicate<IsTitleAscii>("ascii_is_title", registry);
AddUnaryStringPredicate<IsUpperAscii>("ascii_is_upper", registry);

#ifdef ARROW_WITH_UTF8PROC
MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry);
MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry);

AddUnaryStringPredicate<IsAlphaNumericUnicode>("utf8_isalnum", registry);
AddUnaryStringPredicate<IsAlphaUnicode>("utf8_isalpha", registry);
AddUnaryStringPredicate<IsDecimalUnicode>("utf8_isdecimal", registry);
AddUnaryStringPredicate<IsDigitUnicode>("utf8_isdigit", registry);
AddUnaryStringPredicate<IsLowerUnicode>("utf8_islower", registry);
AddUnaryStringPredicate<IsNumericUnicode>("utf8_isnumeric", registry);
AddUnaryStringPredicate<IsPrintableUnicode>("utf8_isprintable", registry);
AddUnaryStringPredicate<IsSpaceUnicode>("utf8_isspace", registry);
AddUnaryStringPredicate<IsTitleUnicode>("utf8_istitle", registry);
AddUnaryStringPredicate<IsUpperUnicode>("utf8_isupper", registry);
AddUnaryStringPredicate<IsAlphaNumericUnicode>("utf8_is_alnum", registry);
AddUnaryStringPredicate<IsAlphaUnicode>("utf8_is_alpha", registry);
AddUnaryStringPredicate<IsDecimalUnicode>("utf8_is_decimal", registry);
AddUnaryStringPredicate<IsDigitUnicode>("utf8_is_digit", registry);
AddUnaryStringPredicate<IsLowerUnicode>("utf8_is_lower", registry);
AddUnaryStringPredicate<IsNumericUnicode>("utf8_is_numeric", registry);
AddUnaryStringPredicate<IsPrintableUnicode>("utf8_is_printable", registry);
AddUnaryStringPredicate<IsSpaceUnicode>("utf8_is_space", registry);
AddUnaryStringPredicate<IsTitleUnicode>("utf8_is_title", registry);
AddUnaryStringPredicate<IsUpperUnicode>("utf8_is_upper", registry);
#endif

AddBinaryLength(registry);
Expand Down
77 changes: 40 additions & 37 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ TYPED_TEST(TestStringKernels, Utf8Lower) {
this->CheckUnary("utf8_lower", "[\"aAazZæÆ&\", null, \"\", \"b\"]", this->type(),
"[\"aaazzææ&\", null, \"\", \"b\"]");

// test varying encoding lenghts and thus changing indices/offsets
// test varying encoding lengths and thus changing indices/offsets
this->CheckUnary("utf8_lower", "[\"ⱭɽⱤoW\", null, \"ıI\", \"B\"]", this->type(),
"[\"ɑɽɽow\", null, \"ıi\", \"b\"]");

Expand All @@ -149,81 +149,81 @@ TYPED_TEST(TestStringKernels, Utf8Lower) {
TYPED_TEST(TestStringKernels, IsAlphaNumericUnicode) {
// U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc thinks it is
// UTF8PROC_CATEGORY_LO
this->CheckUnary("utf8_isalnum", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]", boolean(),
"[true, null, true, false, false]");
this->CheckUnary("utf8_is_alnum", "[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\"]",
boolean(), "[true, null, true, false, false]");
}

TYPED_TEST(TestStringKernels, IsAlphaUnicode) {
// U+08BE (utf8: \xE0\xA2\xBE) is undefined, but utf8proc thinks it is
// UTF8PROC_CATEGORY_LO
this->CheckUnary("utf8_isalpha", "[\"ⱭɽⱤoW\", null, \"Ɑ2\", \"!\", \"\"]", boolean(),
this->CheckUnary("utf8_is_alpha", "[\"ⱭɽⱤoW\", null, \"Ɑ2\", \"!\", \"\"]", boolean(),
"[true, null, false, false, false]");
}

TYPED_TEST(TestStringKernels, IsAscii) {
this->CheckUnary("string_isascii", "[\"azAZ~\", null, \"\", \"\"]", boolean(),
this->CheckUnary("string_is_ascii", "[\"azAZ~\", null, \"\", \"\"]", boolean(),
"[true, null, false, true]");
}

TYPED_TEST(TestStringKernels, IsDecimalUnicode) {
// ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal)
this->CheckUnary("utf8_isdecimal", "[\"12\", null, \"٣\", \"\", \"1a\", \"\"]",
this->CheckUnary("utf8_is_decimal", "[\"12\", null, \"٣\", \"\", \"1a\", \"\"]",
boolean(), "[true, null, true, false, false, false]");
}

TYPED_TEST(TestStringKernels, IsDigitUnicode) {
// These are digits according to Python, but we don't have the information in
// utf8proc for this
// this->CheckUnary("utf8_isdigit", "[\"²\", \"①\"]", boolean(), "[true,
// this->CheckUnary("utf8_is_digit", "[\"²\", \"①\"]", boolean(), "[true,
// true]");
}

TYPED_TEST(TestStringKernels, IsNumericUnicode) {
// ٣ is arabic 3 (decimal), Ⅳ roman (non-decimal)
this->CheckUnary("utf8_isnumeric", "[\"12\", null, \"٣\", \"\", \"1a\", \"\"]",
this->CheckUnary("utf8_is_numeric", "[\"12\", null, \"٣\", \"\", \"1a\", \"\"]",
boolean(), "[true, null, true, true, false, false]");
// These are numerical according to Python, but we don't have the information in
// utf8proc for this
// this->CheckUnary("utf8_isnumeric", "[\"㐅\", \"卌\"]", boolean(),
// this->CheckUnary("utf8_is_numeric", "[\"㐅\", \"卌\"]", boolean(),
// "[true, null, true, true, false, false]");
}

TYPED_TEST(TestStringKernels, IsLowerUnicode) {
// ٣ is arabic 3 (decimal), Φ capital
this->CheckUnary("utf8_islower",
this->CheckUnary("utf8_is_lower",
"[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"Φ\", \"\", \"with space\", "
"\"With space\"]",
boolean(),
"[false, null, true, false, true, false, false, true, false]");
// lower case character utf8proc does not know about
// this->CheckUnary("utf8_islower", "[\"ª\", \"ₕ\"]", boolean(), "[true,
// this->CheckUnary("utf8_is_lower", "[\"ª\", \"ₕ\"]", boolean(), "[true,
// true]");
}

TYPED_TEST(TestStringKernels, IsPrintableUnicode) {
// U+2008 (utf8: \xe2\x80\x88) is punctuaction space, it is NOT printable
// U+2008 (utf8: \xe2\x80\x88) is punctuation space, it is NOT printable
// U+0378 (utf8: \xCD\xB8) is an undefined char, it has no category
this->CheckUnary(
"utf8_isprintable",
"utf8_is_printable",
"[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\", \"\xCD\xB8\"]", boolean(),
"[true, null, false, true, false, false]");
}

TYPED_TEST(TestStringKernels, IsSpaceUnicode) {
// U+2008 (utf8: \xe2\x80\x88) is punctuaction space
this->CheckUnary("utf8_isspace", "[\" \", null, \" \", \"\\t\\r\"]", boolean(),
// U+2008 (utf8: \xe2\x80\x88) is punctuation space
this->CheckUnary("utf8_is_space", "[\" \", null, \" \", \"\\t\\r\"]", boolean(),
"[true, null, true, true]");
this->CheckUnary("utf8_isspace", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
this->CheckUnary("utf8_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
boolean(), "[false, null, false, false, true]");
}

TYPED_TEST(TestStringKernels, IsTitleUnicode) {
// ٣ is arabic 3 (decimal), Φ capital
this->CheckUnary("utf8_istitle",
this->CheckUnary("utf8_is_title",
"[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_DŽ\", \"Φ\", \"DŽ\"]",
boolean(), "[true, null, true, true, true, true, true]");
this->CheckUnary(
"utf8_istitle",
"utf8_is_title",
"[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsDŽ\", \"ΦΦ\", \"dž\", \"_\"]",
boolean(), "[false, null, false, false, false, false, false, false]");
}
Expand All @@ -233,9 +233,10 @@ TYPED_TEST(TestStringKernels, IsTitleUnicode) {

TYPED_TEST(TestStringKernels, IsUpperUnicode) {
// ٣ is arabic 3 (decimal), Φ capital
this->CheckUnary(
"utf8_isupper", "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"\", \"\"]",
boolean(), "[false, null, false, true, true, true, false, true, true]");
this->CheckUnary("utf8_is_upper",
"[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\", \"\", \"\"]",
boolean(),
"[false, null, false, true, true, true, false, true, true]");
// * Ⅰ to Ⅿ is a special case (roman capital), as well as Ⓐ to Ⓩ
// * ϒ - \xCF\x92 - Greek Upsilon with Hook Symbol - upper case, but has no direct lower
// case
Expand All @@ -245,7 +246,7 @@ TYPED_TEST(TestStringKernels, IsUpperUnicode) {
// * U+A7BA - Ꞻ - \xEA\x9E\xBA - Latin Capital Letter Glottal A - new in unicode 13
// (not tested since it depends on the version of libutf8proc)
// * U+A7BB - ꞻ - \xEA\x9E\xBB - Latin Small Letter Glottal A - new in unicode 13
this->CheckUnary("utf8_isupper",
this->CheckUnary("utf8_is_upper",
"[\"\", \"\", \"ϒ\", \"\", \"\xEA\x9E\xBA\", \"xF0x90x90x80\"]",
boolean(), "[true, true, true, false, true, false]");
}
Expand All @@ -255,61 +256,63 @@ TYPED_TEST(TestStringKernels, IsUpperUnicode) {
#endif // ARROW_WITH_UTF8PROC

TYPED_TEST(TestStringKernels, IsAlphaNumericAscii) {
this->CheckUnary("ascii_isalnum",
this->CheckUnary("ascii_is_alnum",
"[\"ⱭɽⱤoW123\", null, \"Ɑ2\", \"!\", \"\", \"a space\", \"1 space\"]",
boolean(), "[false, null, false, false, false, false, false]");
this->CheckUnary("ascii_isalnum", "[\"aRoW123\", null, \"a2\", \"a\", \"2\", \"\"]",
this->CheckUnary("ascii_is_alnum", "[\"aRoW123\", null, \"a2\", \"a\", \"2\", \"\"]",
boolean(), "[true, null, true, true, true, false]");
}

TYPED_TEST(TestStringKernels, IsAlphaAscii) {
this->CheckUnary("ascii_isalpha", "[\"ⱭɽⱤoW\", \"arrow\", null, \"a2\", \"!\", \"\"]",
this->CheckUnary("ascii_is_alpha", "[\"ⱭɽⱤoW\", \"arrow\", null, \"a2\", \"!\", \"\"]",
boolean(), "[false, true, null, false, false, false]");
}

TYPED_TEST(TestStringKernels, IsDecimalAscii) {
// ٣ is arabic 3
this->CheckUnary("ascii_isdecimal", "[\"12\", null, \"٣\", \"\", \"1a\", \"\"]",
this->CheckUnary("ascii_is_decimal", "[\"12\", null, \"٣\", \"\", \"1a\", \"\"]",
boolean(), "[true, null, false, false, false, false]");
}

TYPED_TEST(TestStringKernels, IsLowerAscii) {
// ٣ is arabic 3 (decimal), φ lower greek
this->CheckUnary("ascii_islower", "[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"φ\", \"\"]",
boolean(), "[false, null, true, false, true, false, false]");
this->CheckUnary("ascii_is_lower",
"[\"12\", null, \"٣a\", \"٣A\", \"1a\", \"φ\", \"\"]", boolean(),
"[false, null, true, false, true, false, false]");
}
TYPED_TEST(TestStringKernels, IsPrintableAscii) {
// \xe2\x80\x88 is punctuaction space
this->CheckUnary("ascii_isprintable",
// \xe2\x80\x88 is punctuation space
this->CheckUnary("ascii_is_printable",
"[\" 123azAZ!~\", null, \"\xe2\x80\x88\", \"\", \"\\r\"]", boolean(),
"[true, null, false, true, false]");
}

TYPED_TEST(TestStringKernels, IsSpaceAscii) {
// \xe2\x80\x88 is punctuaction space
// \xe2\x80\x88 is punctuation space
// Note: for ascii version, the non-ascii chars are seen as non-cased
this->CheckUnary("ascii_isspace", "[\" \", null, \" \", \"\\t\\r\"]", boolean(),
this->CheckUnary("ascii_is_space", "[\" \", null, \" \", \"\\t\\r\"]", boolean(),
"[true, null, true, true]");
this->CheckUnary("ascii_isspace", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
this->CheckUnary("ascii_is_space", "[\" a\", null, \"a \", \"~\", \"\xe2\x80\x88\"]",
boolean(), "[false, null, false, false, false]");
}

TYPED_TEST(TestStringKernels, IsTitleAscii) {
// ٣ is arabic 3 (decimal), Φ capital
// Note: for ascii version, the non-ascii chars are seen as non-cased
this->CheckUnary("ascii_istitle",
this->CheckUnary("ascii_is_title",
"[\"Is\", null, \"Is Title\", \"Is٣Title\", \"Is_DŽ\", \"Φ\", \"DŽ\"]",
boolean(), "[true, null, true, true, true, false, false]");
this->CheckUnary(
"ascii_istitle",
"ascii_is_title",
"[\"IsN\", null, \"IsNoTitle\", \"Is No T٣tle\", \"IsDŽ\", \"ΦΦ\", \"dž\", \"_\"]",
boolean(), "[false, null, false, false, true, false, false, false]");
}

TYPED_TEST(TestStringKernels, IsUpperAscii) {
// ٣ is arabic 3 (decimal), Φ capital greek
this->CheckUnary("ascii_isupper", "[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\"]",
boolean(), "[false, null, false, true, true, false, false]");
this->CheckUnary("ascii_is_upper",
"[\"12\", null, \"٣a\", \"٣A\", \"1A\", \"Φ\", \"\"]", boolean(),
"[false, null, false, true, true, false, false]");
}

TYPED_TEST(TestStringKernels, MatchSubstring) {
Expand Down
40 changes: 20 additions & 20 deletions python/pyarrow/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,26 +110,26 @@ def func(left, right):
utf8_upper = _simple_unary_function('utf8_upper')
utf8_lower = _simple_unary_function('utf8_lower')

string_isascii = _simple_unary_function('string_isascii')

ascii_isalnum = _simple_unary_function('ascii_isalnum')
utf8_isalnum = _simple_unary_function('utf8_isalnum')
ascii_isalpha = _simple_unary_function('ascii_isalpha')
utf8_isalpha = _simple_unary_function('utf8_isalpha')
ascii_isdecimal = _simple_unary_function('ascii_isdecimal')
utf8_isdecimal = _simple_unary_function('utf8_isdecimal')
ascii_isdigit = ascii_isdecimal # alias
utf8_isdigit = _simple_unary_function('utf8_isdigit')
ascii_islower = _simple_unary_function('ascii_islower')
utf8_islower = _simple_unary_function('utf8_islower')
ascii_isnumeric = ascii_isdecimal # alias
utf8_isnumeric = _simple_unary_function('utf8_isnumeric')
ascii_isprintable = _simple_unary_function('ascii_isprintable')
utf8_isprintable = _simple_unary_function('utf8_isprintable')
ascii_istitle = _simple_unary_function('ascii_istitle')
utf8_istitle = _simple_unary_function('utf8_istitle')
ascii_isupper = _simple_unary_function('ascii_isupper')
utf8_isupper = _simple_unary_function('utf8_isupper')
string_is_ascii = _simple_unary_function('string_is_ascii')

ascii_is_alnum = _simple_unary_function('ascii_is_alnum')
utf8_is_alnum = _simple_unary_function('utf8_is_alnum')
ascii_is_alpha = _simple_unary_function('ascii_is_alpha')
utf8_is_alpha = _simple_unary_function('utf8_is_alpha')
ascii_is_decimal = _simple_unary_function('ascii_is_decimal')
utf8_is_decimal = _simple_unary_function('utf8_is_decimal')
ascii_is_digit = ascii_is_decimal # alias
utf8_is_digit = _simple_unary_function('utf8_is_digit')
ascii_is_lower = _simple_unary_function('ascii_is_lower')
utf8_is_lower = _simple_unary_function('utf8_is_lower')
ascii_is_numeric = ascii_is_decimal # alias
utf8_is_numeric = _simple_unary_function('utf8_is_numeric')
ascii_is_printable = _simple_unary_function('ascii_is_printable')
utf8_is_printable = _simple_unary_function('utf8_is_printable')
ascii_is_title = _simple_unary_function('ascii_is_title')
utf8_is_title = _simple_unary_function('utf8_is_title')
ascii_is_upper = _simple_unary_function('ascii_is_upper')
utf8_is_upper = _simple_unary_function('utf8_is_upper')

is_valid = _simple_unary_function('is_valid')
is_null = _simple_unary_function('is_null')
Expand Down
29 changes: 15 additions & 14 deletions python/pyarrow/tests/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def find_new_unicode_codepoints():
new = set()
characters = [chr(c) for c in range(0x80, 0x11000)
if not (0xD800 <= c < 0xE000)]
is_printable = pc.utf8_isprintable(pa.array(characters)).to_pylist()
is_printable = pc.utf8_is_printable(pa.array(characters)).to_pylist()
for i, c in enumerate(characters):
if is_printable[i] != c.isprintable():
new.add(ord(c))
Expand All @@ -134,9 +134,9 @@ def find_new_unicode_codepoints():

# Python claims these are not alpha, not sure why, they are in
# gc='Other Letter': https://graphemica.com/%E1%B3%B2
unknown_issue_isalpha = {0x1cf2, 0x1cf3}
unknown_issue_is_alpha = {0x1cf2, 0x1cf3}
# utf8proc does not know if codepoints are lower case
utf8proc_issue_islower = {
utf8proc_issue_is_lower = {
0xaa, 0xba, 0x2b0, 0x2b1, 0x2b2, 0x2b3, 0x2b4,
0x2b5, 0x2b6, 0x2b7, 0x2b8, 0x2c0, 0x2c1, 0x2e0,
0x2e1, 0x2e2, 0x2e3, 0x2e4, 0x37a, 0x1d2c, 0x1d2d,
Expand Down Expand Up @@ -208,23 +208,24 @@ def find_new_unicode_codepoints():
0xf978, 0xf9b2, 0xf9d1, 0xf9d3, 0xf9fd, }

codepoints_ignore = {
'isalnum': numeric_info_missing | digit_info_missing |
unknown_issue_isalpha,
'isalpha': unknown_issue_isalpha,
'isdigit': digit_info_missing,
'isnumeric': numeric_info_missing,
'islower': utf8proc_issue_islower
'is_alnum': numeric_info_missing | digit_info_missing |
unknown_issue_is_alpha,
'is_alpha': unknown_issue_is_alpha,
'is_digit': digit_info_missing,
'is_numeric': numeric_info_missing,
'is_lower': utf8proc_issue_is_lower
}


@pytest.mark.parametrize('function_name', ['isalnum', 'isalpha', 'isascii',
'isdecimal', 'isdigit', 'islower',
'isnumeric', 'isprintable',
'isspace', 'isupper', ])
@pytest.mark.parametrize('function_name', ['is_alnum', 'is_alpha',
'is_ascii', 'is_decimal',
'is_digit', 'is_lower',
'is_numeric', 'is_printable',
'is_space', 'is_upper', ])
@pytest.mark.parametrize('variant', ['ascii', 'utf8'])
def test_string_py_compat_boolean(function_name, variant):
arrow_name = variant + "_" + function_name
py_name = function_name
py_name = function_name.replace('_', '')
ignore = codepoints_ignore.get(function_name, set()) |\
find_new_unicode_codepoints()
for i in range(128 if ascii else 0x11000):
Expand Down

0 comments on commit f131fe6

Please sign in to comment.