From 961c922f938975dce645f18b44b91f50cb110590 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 12 Aug 2024 19:23:33 +0000 Subject: [PATCH 1/7] redact sensitive confs --- cpp/core/config/GlutenConfig.cc | 39 ++++++++++++++----- cpp/core/config/GlutenConfig.h | 2 + .../org/apache/gluten/GlutenConfig.scala | 7 +++- 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/cpp/core/config/GlutenConfig.cc b/cpp/core/config/GlutenConfig.cc index fa04ecfa4e5c..caa17f92b4ab 100644 --- a/cpp/core/config/GlutenConfig.cc +++ b/cpp/core/config/GlutenConfig.cc @@ -20,26 +20,47 @@ #include "compute/ProtobufUtils.h" #include "config.pb.h" #include "jni/JniError.h" +#include +#include namespace gluten { + +const std::string REDACTED_VALUE = "*********(redacted)"; +const std::string REGEX_REDACT_KEY = "spark.gluten.redaction.regex"; + std::unordered_map -parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength) { - std::unordered_map sparkConfs; - ConfigMap pConfigMap; - gluten::parseProtobuf(planData, planDataLength, &pConfigMap); - for (const auto& pair : pConfigMap.configs()) { - sparkConfs.emplace(pair.first, pair.second); + parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength) { + std::unordered_map sparkConfs; + ConfigMap pConfigMap; + gluten::parseProtobuf(planData, planDataLength, &pConfigMap); + for (const auto& pair : pConfigMap.configs()) { + sparkConfs.emplace(pair.first, pair.second); } - return sparkConfs; } +std::optional getRedactionRegex(const std::unordered_map& conf) { + auto it = conf.find(REGEX_REDACT_KEY); + if (it != conf.end()) { + return std::regex(it->second); + } + return std::nullopt; +} + std::string printConfig(const std::unordered_map& conf) { std::ostringstream oss; oss << std::endl; - for (auto& [k, v] : conf) { - oss << " [" << k << ", " << v << "]\n"; + + auto redactionRegex = getRedactionRegex(conf); + + for (const auto& [k, v] : conf) { + if (redactionRegex && std::regex_match(k, *redactionRegex)) { + oss << " [" << k << ", " << REDACTED_VALUE << "]\n"; + } else { + oss << " [" << k << ", " << v << "]\n"; + } } return oss.str(); } + } // namespace gluten diff --git a/cpp/core/config/GlutenConfig.h b/cpp/core/config/GlutenConfig.h index 060bbe111265..db93c0cc0946 100644 --- a/cpp/core/config/GlutenConfig.h +++ b/cpp/core/config/GlutenConfig.h @@ -64,6 +64,8 @@ const std::string kShuffleCompressionCodecBackend = "spark.gluten.sql.columnar.s const std::string kQatBackendName = "qat"; const std::string kIaaBackendName = "iaa"; +const std::string kRedactionRegex = "spark.gluten.redaction.regex"; + std::unordered_map parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength); diff --git a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala index e3f6f1d984ed..435fa04ca714 100644 --- a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala +++ b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala @@ -622,6 +622,7 @@ object GlutenConfig { val GLUTEN_COST_EVALUATOR_ENABLED = "spark.gluten.sql.adaptive.costEvaluator.enabled" + val GLUTEN_REGEX_LOG_REDACTION = "spark.gluten.redaction.regex" var ins: GlutenConfig = _ def getConf: GlutenConfig = { @@ -673,7 +674,8 @@ object GlutenConfig { // gcs config SPARK_GCS_STORAGE_ROOT_URL, SPARK_GCS_AUTH_TYPE, - SPARK_GCS_AUTH_SERVICE_ACCOUNT_JSON_KEYFILE + SPARK_GCS_AUTH_SERVICE_ACCOUNT_JSON_KEYFILE, + GLUTEN_REGEX_LOG_REDACTION ) nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava) @@ -757,7 +759,8 @@ object GlutenConfig { GLUTEN_TASK_OFFHEAP_SIZE_IN_BYTES_KEY, GLUTEN_OFFHEAP_ENABLED, SESSION_LOCAL_TIMEZONE.key, - DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key + DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key, + GLUTEN_REGEX_LOG_REDACTION ) nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava) From 2bad963bcbd2824ce812a2492ac270f50f74bca3 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 12 Aug 2024 19:39:58 +0000 Subject: [PATCH 2/7] format --- cpp/core/config/GlutenConfig.cc | 34 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/cpp/core/config/GlutenConfig.cc b/cpp/core/config/GlutenConfig.cc index caa17f92b4ab..0adf0bf060b4 100644 --- a/cpp/core/config/GlutenConfig.cc +++ b/cpp/core/config/GlutenConfig.cc @@ -16,12 +16,11 @@ */ #include - +#include +#include #include "compute/ProtobufUtils.h" #include "config.pb.h" #include "jni/JniError.h" -#include -#include namespace gluten { @@ -29,35 +28,36 @@ const std::string REDACTED_VALUE = "*********(redacted)"; const std::string REGEX_REDACT_KEY = "spark.gluten.redaction.regex"; std::unordered_map - parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength) { - std::unordered_map sparkConfs; - ConfigMap pConfigMap; - gluten::parseProtobuf(planData, planDataLength, &pConfigMap); - for (const auto& pair : pConfigMap.configs()) { - sparkConfs.emplace(pair.first, pair.second); +parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength) { + std::unordered_map sparkConfs; + ConfigMap pConfigMap; + gluten::parseProtobuf(planData, planDataLength, &pConfigMap); + for (const auto& pair : pConfigMap.configs()) { + sparkConfs.emplace(pair.first, pair.second); } + return sparkConfs; } std::optional getRedactionRegex(const std::unordered_map& conf) { - auto it = conf.find(REGEX_REDACT_KEY); - if (it != conf.end()) { - return std::regex(it->second); - } - return std::nullopt; + auto it = conf.find(REGEX_REDACT_KEY); + if (it != conf.end()) { + return std::regex(it->second); + } + return std::nullopt; } std::string printConfig(const std::unordered_map& conf) { std::ostringstream oss; oss << std::endl; - auto redactionRegex = getRedactionRegex(conf); + auto redactionRegex = getRedactionRegex(conf); for (const auto& [k, v] : conf) { if (redactionRegex && std::regex_match(k, *redactionRegex)) { - oss << " [" << k << ", " << REDACTED_VALUE << "]\n"; + oss << " [" << k << ", " << REDACTED_VALUE << "]\n"; } else { - oss << " [" << k << ", " << v << "]\n"; + oss << " [" << k << ", " << v << "]\n"; } } return oss.str(); From e938ff7c633eb5db57430530372afce0f1f9a909 Mon Sep 17 00:00:00 2001 From: arnavb Date: Tue, 13 Aug 2024 10:11:17 +0000 Subject: [PATCH 3/7] comments --- cpp/core/config/GlutenConfig.cc | 21 ++++++++++++--------- cpp/core/config/GlutenConfig.h | 2 -- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/cpp/core/config/GlutenConfig.cc b/cpp/core/config/GlutenConfig.cc index 0adf0bf060b4..36288bcd6577 100644 --- a/cpp/core/config/GlutenConfig.cc +++ b/cpp/core/config/GlutenConfig.cc @@ -22,10 +22,21 @@ #include "config.pb.h" #include "jni/JniError.h" +namespace { + +const std::string REGEX_REDACT_KEY = "spark.gluten.redaction.regex"; +std::optional getRedactionRegex(const std::unordered_map& conf) { + auto it = conf.find(REGEX_REDACT_KEY); + if (it != conf.end()) { + return std::regex(it->second); + } + return std::nullopt; +} +} // namespace anonymous + namespace gluten { const std::string REDACTED_VALUE = "*********(redacted)"; -const std::string REGEX_REDACT_KEY = "spark.gluten.redaction.regex"; std::unordered_map parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength) { @@ -39,14 +50,6 @@ parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength) return sparkConfs; } -std::optional getRedactionRegex(const std::unordered_map& conf) { - auto it = conf.find(REGEX_REDACT_KEY); - if (it != conf.end()) { - return std::regex(it->second); - } - return std::nullopt; -} - std::string printConfig(const std::unordered_map& conf) { std::ostringstream oss; oss << std::endl; diff --git a/cpp/core/config/GlutenConfig.h b/cpp/core/config/GlutenConfig.h index db93c0cc0946..060bbe111265 100644 --- a/cpp/core/config/GlutenConfig.h +++ b/cpp/core/config/GlutenConfig.h @@ -64,8 +64,6 @@ const std::string kShuffleCompressionCodecBackend = "spark.gluten.sql.columnar.s const std::string kQatBackendName = "qat"; const std::string kIaaBackendName = "iaa"; -const std::string kRedactionRegex = "spark.gluten.redaction.regex"; - std::unordered_map parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength); From 497d6dc32fc9bc8bffd015003d5447ce34330a1b Mon Sep 17 00:00:00 2001 From: arnavb Date: Tue, 13 Aug 2024 10:18:23 +0000 Subject: [PATCH 4/7] format --- cpp/core/config/GlutenConfig.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/core/config/GlutenConfig.cc b/cpp/core/config/GlutenConfig.cc index 36288bcd6577..824e88b9e554 100644 --- a/cpp/core/config/GlutenConfig.cc +++ b/cpp/core/config/GlutenConfig.cc @@ -32,7 +32,7 @@ std::optional getRedactionRegex(const std::unordered_map Date: Sun, 18 Aug 2024 08:19:23 +0000 Subject: [PATCH 5/7] address comments --- cpp/core/config/GlutenConfig.cc | 7 ++----- cpp/core/config/GlutenConfig.h | 3 +++ .../src/main/scala/org/apache/gluten/GlutenConfig.scala | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/core/config/GlutenConfig.cc b/cpp/core/config/GlutenConfig.cc index 824e88b9e554..5674c7e7f0c4 100644 --- a/cpp/core/config/GlutenConfig.cc +++ b/cpp/core/config/GlutenConfig.cc @@ -24,9 +24,8 @@ namespace { -const std::string REGEX_REDACT_KEY = "spark.gluten.redaction.regex"; std::optional getRedactionRegex(const std::unordered_map& conf) { - auto it = conf.find(REGEX_REDACT_KEY); + auto it = conf.find(gluten::kSparkRedactionRegex); if (it != conf.end()) { return std::regex(it->second); } @@ -36,8 +35,6 @@ std::optional getRedactionRegex(const std::unordered_map parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength) { std::unordered_map sparkConfs; @@ -58,7 +55,7 @@ std::string printConfig(const std::unordered_map& conf for (const auto& [k, v] : conf) { if (redactionRegex && std::regex_match(k, *redactionRegex)) { - oss << " [" << k << ", " << REDACTED_VALUE << "]\n"; + oss << " [" << k << ", " << kSparkRedactionString << "]\n"; } else { oss << " [" << k << ", " << v << "]\n"; } diff --git a/cpp/core/config/GlutenConfig.h b/cpp/core/config/GlutenConfig.h index 060bbe111265..e8eb65861e83 100644 --- a/cpp/core/config/GlutenConfig.h +++ b/cpp/core/config/GlutenConfig.h @@ -64,6 +64,9 @@ const std::string kShuffleCompressionCodecBackend = "spark.gluten.sql.columnar.s const std::string kQatBackendName = "qat"; const std::string kIaaBackendName = "iaa"; +const std::string kSparkRedactionRegex = "spark.redaction.regex"; +const std::string kSparkRedactionString = "*********(redacted)"; + std::unordered_map parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength); diff --git a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala index 435fa04ca714..9c3e24233259 100644 --- a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala +++ b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala @@ -534,6 +534,7 @@ object GlutenConfig { val GLUTEN_ONHEAP_SIZE_KEY = "spark.executor.memory" val GLUTEN_OFFHEAP_SIZE_KEY = "spark.memory.offHeap.size" val GLUTEN_OFFHEAP_ENABLED = "spark.memory.offHeap.enabled" + val SPARK_REDACTION_REGEX = "spark.redaction.regex" // For Soft Affinity Scheduling // Enable Soft Affinity Scheduling, defalut value is false @@ -622,7 +623,6 @@ object GlutenConfig { val GLUTEN_COST_EVALUATOR_ENABLED = "spark.gluten.sql.adaptive.costEvaluator.enabled" - val GLUTEN_REGEX_LOG_REDACTION = "spark.gluten.redaction.regex" var ins: GlutenConfig = _ def getConf: GlutenConfig = { @@ -675,7 +675,7 @@ object GlutenConfig { SPARK_GCS_STORAGE_ROOT_URL, SPARK_GCS_AUTH_TYPE, SPARK_GCS_AUTH_SERVICE_ACCOUNT_JSON_KEYFILE, - GLUTEN_REGEX_LOG_REDACTION + SPARK_REDACTION_REGEX ) nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava) @@ -760,7 +760,7 @@ object GlutenConfig { GLUTEN_OFFHEAP_ENABLED, SESSION_LOCAL_TIMEZONE.key, DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key, - GLUTEN_REGEX_LOG_REDACTION + SPARK_REDACTION_REGEX ) nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava) From 85e181879ee2e75866e72b0cd4f009e68d856b1d Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 19 Aug 2024 02:37:27 +0000 Subject: [PATCH 6/7] boost regex --- cpp/core/config/GlutenConfig.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/core/config/GlutenConfig.cc b/cpp/core/config/GlutenConfig.cc index 5674c7e7f0c4..357c6c76feb2 100644 --- a/cpp/core/config/GlutenConfig.cc +++ b/cpp/core/config/GlutenConfig.cc @@ -17,17 +17,17 @@ #include #include -#include +#include #include "compute/ProtobufUtils.h" #include "config.pb.h" #include "jni/JniError.h" namespace { -std::optional getRedactionRegex(const std::unordered_map& conf) { +std::optional getRedactionRegex(const std::unordered_map& conf) { auto it = conf.find(gluten::kSparkRedactionRegex); if (it != conf.end()) { - return std::regex(it->second); + return boost::regex(it->second); } return std::nullopt; } @@ -54,7 +54,7 @@ std::string printConfig(const std::unordered_map& conf auto redactionRegex = getRedactionRegex(conf); for (const auto& [k, v] : conf) { - if (redactionRegex && std::regex_match(k, *redactionRegex)) { + if (redactionRegex && boost::regex_match(k, *redactionRegex)) { oss << " [" << k << ", " << kSparkRedactionString << "]\n"; } else { oss << " [" << k << ", " << v << "]\n"; From 2dd1db8c659ac53c5e8c8df216e553838d4bec07 Mon Sep 17 00:00:00 2001 From: arnavb Date: Mon, 19 Aug 2024 02:41:00 +0000 Subject: [PATCH 7/7] clang-format --- cpp/core/config/GlutenConfig.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/core/config/GlutenConfig.cc b/cpp/core/config/GlutenConfig.cc index 357c6c76feb2..bc6ad1cbe859 100644 --- a/cpp/core/config/GlutenConfig.cc +++ b/cpp/core/config/GlutenConfig.cc @@ -15,9 +15,9 @@ * limitations under the License. */ +#include #include #include -#include #include "compute/ProtobufUtils.h" #include "config.pb.h" #include "jni/JniError.h"