Skip to content

Commit

Permalink
Add sha2 SparkSQL function (facebookincubator#143)
Browse files Browse the repository at this point in the history
  • Loading branch information
izchen authored Mar 6, 2023
1 parent 2e42fc7 commit b85f80c
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 0 deletions.
2 changes: 2 additions & 0 deletions velox/functions/sparksql/Register.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ void registerFunctions(const std::string& prefix) {
registerFunction<Md5Function, Varchar, Varbinary>({prefix + "md5"});
registerFunction<Sha1HexStringFunction, Varchar, Varbinary>(
{prefix + "sha1"});
registerFunction<Sha2HexStringFunction, Varchar, Varbinary, int32_t>(
{prefix + "sha2"});

exec::registerStatefulVectorFunction(
prefix + "regexp_extract", re2ExtractSignatures(), makeRegexExtract);
Expand Down
48 changes: 48 additions & 0 deletions velox/functions/sparksql/String.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,54 @@ struct Sha1HexStringFunction {
}
};

/// sha2 function
/// sha2(varbinary, bitLength) -> string
/// Computes a SHA-2 family digest (SHA-224, SHA-256, SHA-384 or SHA-512) of
/// the input and returns it as a hex string.
/// bitLength selects the digest size and must be one of 224, 256, 384, 512,
/// or 0 (which is treated as 256). For any other bitLength the result is NULL.
template <typename T>
struct Sha2HexStringFunction {
  VELOX_DEFINE_FUNCTION_TYPES(T);

  FOLLY_ALWAYS_INLINE
  bool call(
      out_type<Varchar>& result,
      const arg_type<Varbinary>& input,
      const int32_t& bitLength) {
    // A requested length of 0 is an alias for SHA-256.
    const int32_t effectiveBits = (bitLength != 0) ? bitLength : 256;

    const EVP_MD* md;
    if (effectiveBits == 224) {
      md = EVP_sha224();
    } else if (effectiveBits == 256) {
      md = EVP_sha256();
    } else if (effectiveBits == 384) {
      md = EVP_sha384();
    } else if (effectiveBits == 512) {
      md = EVP_sha512();
    } else {
      // For an unsupported bitLength, the return value is NULL.
      return false;
    }

    // Digest size in bytes; the output holds two characters per digest byte.
    const int32_t digestBytes = effectiveBits / 8;
    result.resize(digestBytes * 2);

    // The raw digest is written into the first half of the output buffer.
    auto* const outputBytes = reinterpret_cast<uint8_t*>(result.data());
    const folly::ByteRange source(
        reinterpret_cast<const uint8_t*>(input.data()), input.size());
    const folly::MutableByteRange destination(outputBytes, digestBytes);
    folly::ssl::OpenSSLHash::hash(destination, md, source);

    // NOTE(review): encodeDigestToBase16 is defined elsewhere; presumably it
    // expands the leading digestBytes bytes into hex within the same buffer.
    encodeDigestToBase16(outputBytes, digestBytes);
    return true;
  }
};

/// contains function
/// contains(string, string) -> bool
/// Searches the second argument in the first one.
Expand Down
63 changes: 63 additions & 0 deletions velox/functions/sparksql/tests/StringTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,13 @@ class StringTest : public SparkFunctionBaseTest {
"sha1(c0)", {arg}, {VARBINARY()});
}

// Evaluates the sha2 expression on a single row. The string argument is cast
// to varbinary inside the expression before it is passed to sha2.
std::optional<std::string> sha2(
    std::optional<std::string> str,
    std::optional<int32_t> bitLength) {
  const std::string expression{"sha2(cast(c0 as varbinary), c1)"};
  return evaluateOnce<std::string, std::string, int32_t>(
      expression, str, bitLength);
}

bool compareFunction(
const std::string& function,
const std::optional<std::string>& str,
Expand Down Expand Up @@ -155,6 +162,62 @@ TEST_F(StringTest, sha1) {
"a26704c04fc5f10db5aab58468035531cc542485");
}

TEST_F(StringTest, sha2) {
  // Inputs shared by every digest size.
  const std::string emptyInput;
  const std::string shortInput = "Spark";
  const std::string alnumInput = "0123456789abcdefghijklmnopqrstuvwxyz";

  // SHA-256 expectations, shared by bitLength 0 and bitLength 256.
  const std::string sha256OfEmpty =
      "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";
  const std::string sha256OfShort =
      "529bc3b07127ecb7e53a4dcf1991d9152c24537d919178022b2c42657f79a26b";
  const std::string sha256OfAlnum =
      "74e7e5bb9d22d6db26bf76946d40fff3ea9f0346b884fd0694920fccfad15e33";

  // Unsupported bit lengths produce NULL.
  EXPECT_EQ(sha2(shortInput, -1), std::nullopt);
  EXPECT_EQ(sha2(shortInput, 1), std::nullopt);

  // bitLength 0 is expected to give the same digests as 256.
  EXPECT_EQ(sha2(emptyInput, 0), sha256OfEmpty);
  EXPECT_EQ(sha2(shortInput, 0), sha256OfShort);
  EXPECT_EQ(sha2(alnumInput, 0), sha256OfAlnum);

  // SHA-224.
  EXPECT_EQ(
      sha2(emptyInput, 224),
      "d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f");
  EXPECT_EQ(
      sha2(shortInput, 224),
      "dbeab94971678d36af2195851c0f7485775a2a7c60073d62fc04549c");
  EXPECT_EQ(
      sha2(alnumInput, 224),
      "e6e4a6be069cc9bead8b6050856d2b26da6b3f7efa0951e5fb3a54dd");

  // SHA-256.
  EXPECT_EQ(sha2(emptyInput, 256), sha256OfEmpty);
  EXPECT_EQ(sha2(shortInput, 256), sha256OfShort);
  EXPECT_EQ(sha2(alnumInput, 256), sha256OfAlnum);

  // SHA-384.
  EXPECT_EQ(
      sha2(emptyInput, 384),
      "38b060a751ac96384cd9327eb1b1e36a21fdb71114be0743"
      "4c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b95b");
  EXPECT_EQ(
      sha2(shortInput, 384),
      "1e40b8d06c248a1cc32428c22582b6219d072283078fa140"
      "d9ad297ecadf2cabefc341b857ad36226aa8d6d79f2ab67d");
  EXPECT_EQ(
      sha2(alnumInput, 384),
      "ce6d4ea5442bc6c830bea1942d4860db9f7b96f0e9d2c307"
      "3ffe47a0e1166d95612d840ff15e5efdd23c1f273096da32");

  // SHA-512.
  EXPECT_EQ(
      sha2(emptyInput, 512),
      "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce"
      "47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e");
  EXPECT_EQ(
      sha2(shortInput, 512),
      "44844a586c54c9a212da1dbfe05c5f1705de1af5fda1f0d36297623249b279fd"
      "8f0ccec03f888f4fb13bf7cd83fdad58591c797f81121a23cfdd5e0897795238");
  EXPECT_EQ(
      sha2(alnumInput, 512),
      "95cadc34aa46b9fdef432f62fe5bad8d9f475bfbecf797d5802bb5f2937a85d9"
      "3ce4857a6262b03834c01c610d74cd1215f9a466dc6ad3dd15078e3309a03a6d");
}

TEST_F(StringTest, startsWith) {
EXPECT_EQ(startsWith("hello", "ello"), false);
EXPECT_EQ(startsWith("hello", "hell"), true);
Expand Down

0 comments on commit b85f80c

Please sign in to comment.