Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding strrpos Presto function #2903

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions velox/docs/functions/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,17 @@ String Functions
``instance`` must be a positive number.
Positions start with ``1``. If not found, ``0`` is returned.

.. function:: strrpos(string, substring) -> bigint

Returns the starting position of the last instance of ``substring`` in
``string``. Positions start with ``1``. If not found, ``0`` is returned.

.. function:: strrpos(string, substring, instance) -> bigint

Returns the position of the N-th ``instance`` of ``substring`` in ``string`` starting from the end of the string.
``instance`` must be a positive number.
Positions start with ``1``. If not found, ``0`` is returned.

.. function:: substr(string, start) -> varchar

Returns the rest of ``string`` from the starting position ``start``.
Expand Down
36 changes: 33 additions & 3 deletions velox/functions/lib/string/StringCore.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ lengthUnicode(const char* inputBuffer, size_t bufferLength) {
/// substring and then computing the length of substring[0, byteIndex). This is
/// safe because in UTF8 a char can not be a subset of another char (in bytes
/// representation).
static int64_t findNthInstanceByteIndex(
static int64_t findNthInstanceByteIndexFromStart(
const std::string_view& string,
const std::string_view subString,
const size_t instance = 1,
Expand All @@ -254,10 +254,39 @@ static int64_t findNthInstanceByteIndex(
}

// Find next occurrence
return findNthInstanceByteIndex(
return findNthInstanceByteIndexFromStart(
string, subString, instance - 1, byteIndex + subString.size());
}

/// Returns the start byte index of the Nth instance of subString in string,
/// counting instances from the end of string. Byte indices start with 0.
///
/// Successive matches may overlap: each search resumes one byte before the
/// start of the previously found match.
///
/// @param string The text to search in.
/// @param subString The text to search for. If empty, 0 is returned.
/// @param instance 1-based ordinal of the match, counted from the end. Must be
/// positive.
/// @return The byte index of the match, or -1 if string contains fewer than
/// `instance` matches.
inline int64_t findNthInstanceByteIndexFromEnd(
    const std::string_view string,
    const std::string_view subString,
    const size_t instance = 1) {
  // NOTE(review): a reviewer asked for VELOX_CHECK_GT() here instead of
  // assert. Kept as assert in this block because the header providing that
  // macro is not visibly included; switch once it is in scope.
  assert(instance > 0);

  if (subString.empty()) {
    return 0;
  }

  size_t foundCnt = 0;
  size_t index = string.size();
  do {
    // Nothing is left to the left of byte 0 (this also covers an empty
    // string). The check keeps `index - 1` below from wrapping around.
    if (index == 0) {
      return -1;
    }

    // rfind returns the last match that starts at or before `index - 1`.
    index = string.rfind(subString, index - 1);
    if (index == std::string_view::npos) {
      return -1;
    }
    ++foundCnt;
  } while (foundCnt < instance);

  return static_cast<int64_t>(index);
}

/// Replace replaced with replacement in inputString and write results in
/// outputString. If inPlace=true inputString and outputString are assumed to
/// to be the same. When replaced is empty, replacement is added before and after
Expand All @@ -281,7 +310,8 @@ inline static size_t replace(
bool doCopyUnreplaced = !inPlace || (replaced.size() != replacement.size());

auto findNextReplaced = [&]() {
return findNthInstanceByteIndex(inputString, replaced, 1, readPosition);
return findNthInstanceByteIndexFromStart(
inputString, replaced, 1, readPosition);
};

auto writeUnchanged = [&](ssize_t size) {
Expand Down
29 changes: 19 additions & 10 deletions velox/functions/lib/string/StringImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,22 +148,30 @@ FOLLY_ALWAYS_INLINE int32_t charToCodePoint(const T& inputString) {
return codePoint;
}

/// Returns the starting position in characters of the Nth instance of the
/// substring in string. Positions start with 1. If not found, 0 is returned. If
/// subString is empty result is 1.
template <bool isAscii, typename T>
/// Returns the starting position in characters of the Nth instance (counting
/// from the left if lpos==true and from the end otherwise) of the substring in
/// string. Positions start with 1. If not found, 0 is returned. If subString is
/// empty result is 1.
template <bool isAscii, bool lpos = true, typename T>
FOLLY_ALWAYS_INLINE int64_t
stringPosition(const T& string, const T& subString, int64_t instance = 0) {
VELOX_USER_CHECK_GT(instance, 0, "'instance' must be a positive number");
if (subString.size() == 0) {
return 1;
}

VELOX_USER_CHECK_GT(instance, 0, "'instance' must be a positive number");

auto byteIndex = findNthInstanceByteIndex(
std::string_view(string.data(), string.size()),
std::string_view(subString.data(), subString.size()),
instance);
int64_t byteIndex = -1;
if constexpr (lpos) {
byteIndex = findNthInstanceByteIndexFromStart(
std::string_view(string.data(), string.size()),
std::string_view(subString.data(), subString.size()),
instance);
} else {
byteIndex = findNthInstanceByteIndexFromEnd(
std::string_view(string.data(), string.size()),
std::string_view(subString.data(), subString.size()),
instance);
}

if (byteIndex == -1) {
return 0;
Expand Down Expand Up @@ -700,4 +708,5 @@ FOLLY_ALWAYS_INLINE void pad(
padString.data(),
padPrefixByteLength);
}

} // namespace facebook::velox::functions::stringImpl
130 changes: 0 additions & 130 deletions velox/functions/prestosql/StringFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -278,131 +278,6 @@ class ConcatFunction : public exec::VectorFunction {
std::vector<StringView> constantStringViews_;
};

/**
 * Vector function behind strpos.
 *
 * strpos(string, substring) → bigint
 *     Starting position of the first instance of substring in string.
 *     Positions start with 1. If not found, 0 is returned.
 *
 * strpos(string, substring, instance) → bigint
 *     Position of the N-th instance of substring in string. instance must be
 *     a positive number. Positions start with 1. If not found, 0 is returned.
 **/
class StringPosition : public exec::VectorFunction {
 private:
  /// Per-row kernel. Templated on the ascii flag so that it can be selected
  /// through StringEncodingTemplateWrapper.
  template <bool isAscii>
  struct ApplyInternal {
    template <
        typename StringReader,
        typename SubStringReader,
        typename InstanceReader>
    static void apply(
        StringReader stringReader,
        SubStringReader subStringReader,
        InstanceReader instanceReader,
        const SelectivityVector& rows,
        FlatVector<int64_t>* resultFlatVector) {
      rows.applyToSelected([&](int row) {
        resultFlatVector->set(
            row,
            stringImpl::stringPosition<isAscii>(
                stringReader(row), subStringReader(row), instanceReader(row)));
      });
    }
  };

 public:
  void apply(
      const SelectivityVector& rows,
      std::vector<VectorPtr>& args,
      const TypePtr& /* outputType */,
      exec::EvalCtx& context,
      VectorPtr& result) const override {
    exec::DecodedArgs decoded(rows, args, context);
    auto haystack = decoded.at(0);
    auto needle = decoded.at(1);

    auto asciiMode = isAscii(args.at(0).get(), rows);
    context.ensureWritable(rows, BIGINT(), result);

    auto* flatResult = result->as<FlatVector<int64_t>>();

    auto readString = [&](const vector_size_t row) {
      return haystack->valueAt<StringView>(row);
    };

    auto readSubString = [&](const vector_size_t row) {
      return needle->valueAt<StringView>(row);
    };

    // Two-argument form: every row looks for the first instance.
    if (args.size() <= 2) {
      StringEncodingTemplateWrapper<ApplyInternal>::apply(
          asciiMode,
          readString,
          readSubString,
          [](const vector_size_t) { return 1L; },
          rows,
          flatResult);
      return;
    }

    // Three-argument form: "instance" is either BIGINT or INTEGER.
    auto instances = decoded.at(2);
    switch (args[2]->typeKind()) {
      case TypeKind::BIGINT: {
        auto readInstance = [&](const vector_size_t row) {
          return instances->valueAt<int64_t>(row);
        };
        StringEncodingTemplateWrapper<ApplyInternal>::apply(
            asciiMode,
            readString,
            readSubString,
            readInstance,
            rows,
            flatResult);
        break;
      }
      case TypeKind::INTEGER: {
        auto readInstance = [&](const vector_size_t row) {
          return instances->valueAt<int32_t>(row);
        };
        StringEncodingTemplateWrapper<ApplyInternal>::apply(
            asciiMode,
            readString,
            readSubString,
            readInstance,
            rows,
            flatResult);
        break;
      }
      default:
        VELOX_UNREACHABLE();
    }
  }

  /// Signatures accepted by this function, all returning bigint.
  static std::vector<std::shared_ptr<exec::FunctionSignature>> signatures() {
    // varchar, varchar -> bigint
    auto withoutInstance = exec::FunctionSignatureBuilder()
                               .returnType("bigint")
                               .argumentType("varchar")
                               .argumentType("varchar")
                               .build();
    // varchar, varchar, integer -> bigint
    auto withIntegerInstance = exec::FunctionSignatureBuilder()
                                   .returnType("bigint")
                                   .argumentType("varchar")
                                   .argumentType("varchar")
                                   .argumentType("integer")
                                   .build();
    // varchar, varchar, bigint -> bigint
    auto withBigintInstance = exec::FunctionSignatureBuilder()
                                  .returnType("bigint")
                                  .argumentType("varchar")
                                  .argumentType("varchar")
                                  .argumentType("bigint")
                                  .build();
    return {withoutInstance, withIntegerInstance, withBigintInstance};
  }
};

/**
* replace(string, search) → varchar
* Removes all instances of search from string.
Expand Down Expand Up @@ -577,11 +452,6 @@ VELOX_DECLARE_STATEFUL_VECTOR_FUNCTION_WITH_METADATA(
return std::make_unique<ConcatFunction>(name, inputs);
});

VELOX_DECLARE_VECTOR_FUNCTION(
udf_strpos,
StringPosition::signatures(),
std::make_unique<StringPosition>());

VELOX_DECLARE_VECTOR_FUNCTION(
udf_replace,
Replace::signatures(),
Expand Down
48 changes: 48 additions & 0 deletions velox/functions/prestosql/StringFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/
#pragma once

#include <cstdint>
#define XXH_INLINE_ALL
#include <xxhash.h>

Expand Down Expand Up @@ -349,4 +350,51 @@ struct LPadFunction : public PadFunctionBase<T, true> {};
template <typename T>
struct RPadFunction : public PadFunctionBase<T, false> {};

/// Common implementation of the strpos and strrpos functions. The `lpos`
/// template flag is forwarded to stringImpl::stringPosition and selects the
/// search direction (true: strpos, false: strrpos).
///
/// strpos(string, substring) → bigint
///     Starting position of the first instance of substring in string.
///     Positions start with 1. If not found, 0 is returned.
/// strpos(string, substring, instance) → bigint
///     Position of the N-th instance of substring in string. instance must
///     be a positive number. Positions start with 1. If not found, 0 is
///     returned.
/// strrpos(string, substring) → bigint
///     Starting position of the first instance of substring in string,
///     counting from the end. Positions start with 1. If not found, 0 is
///     returned.
/// strrpos(string, substring, instance) → bigint
///     Position of the N-th instance of substring in string, counting from
///     the end. instance must be a positive number. Positions start with 1.
///     If not found, 0 is returned.
template <typename T, bool lpos>
struct StrPosFunctionBase {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  /// General entry point: the input is not known to be ASCII.
  FOLLY_ALWAYS_INLINE bool call(
      out_type<int64_t>& result,
      const arg_type<Varchar>& string,
      const arg_type<Varchar>& subString,
      const arg_type<int64_t>& instance = 1) {
    return locate<false /*isAscii*/>(result, string, subString, instance);
  }

  /// Entry point for ASCII input.
  FOLLY_ALWAYS_INLINE bool callAscii(
      out_type<int64_t>& result,
      const arg_type<Varchar>& string,
      const arg_type<Varchar>& subString,
      const arg_type<int64_t>& instance = 1) {
    return locate<true /*isAscii*/>(result, string, subString, instance);
  }

 private:
  /// Writes the position into `result` and always reports a non-null value.
  template <bool ascii>
  FOLLY_ALWAYS_INLINE static bool locate(
      out_type<int64_t>& result,
      const arg_type<Varchar>& string,
      const arg_type<Varchar>& subString,
      const arg_type<int64_t>& instance) {
    result =
        stringImpl::stringPosition<ascii, lpos>(string, subString, instance);
    return true;
  }
};

/// StrPosFunctionBase with lpos = true; registered under the name "strpos".
template <typename T>
struct StrLPosFunction : public StrPosFunctionBase<T, true> {};

/// StrPosFunctionBase with lpos = false; registered under the name "strrpos".
template <typename T>
struct StrRPosFunction : public StrPosFunctionBase<T, false> {};

} // namespace facebook::velox::functions
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,6 @@ void registerStringFunctions() {
VELOX_REGISTER_VECTOR_FUNCTION(udf_upper, "upper");
VELOX_REGISTER_VECTOR_FUNCTION(udf_split, "split");
VELOX_REGISTER_VECTOR_FUNCTION(udf_concat, "concat");
VELOX_REGISTER_VECTOR_FUNCTION(udf_strpos, "strpos");
VELOX_REGISTER_VECTOR_FUNCTION(udf_replace, "replace");
VELOX_REGISTER_VECTOR_FUNCTION(udf_reverse, "reverse");
VELOX_REGISTER_VECTOR_FUNCTION(udf_to_utf8, "to_utf8");
Expand All @@ -94,5 +93,12 @@ void registerStringFunctions() {
"regexp_extract_all", re2ExtractAllSignatures(), makeRe2ExtractAll);
exec::registerStatefulVectorFunction(
"regexp_like", re2SearchSignatures(), makeRe2Search);

registerFunction<StrLPosFunction, int64_t, Varchar, Varchar>({"strpos"});
registerFunction<StrLPosFunction, int64_t, Varchar, Varchar, int64_t>(
{"strpos"});
registerFunction<StrRPosFunction, int64_t, Varchar, Varchar>({"strrpos"});
registerFunction<StrRPosFunction, int64_t, Varchar, Varchar, int64_t>(
{"strrpos"});
}
} // namespace facebook::velox::functions
Loading