From 0c509be13af5e798ecdc986a82750d81fa65a04f Mon Sep 17 00:00:00 2001
From: Degant Puri
Date: Mon, 5 Jun 2023 17:14:36 -0700
Subject: [PATCH] [SPARK-41958][CORE][3.3] Disallow arbitrary custom classpath
 with proxy user in cluster mode

Backporting the fix for SPARK-41958 to the 3.3 branch from #39474.
The description below is from the original PR.
--------------------------

### What changes were proposed in this pull request?

This PR proposes to disallow an arbitrary custom classpath with a proxy user in cluster mode by default.

### Why are the changes needed?

To avoid loading arbitrary classpaths in a Spark cluster.

### Does this PR introduce _any_ user-facing change?

Yes. Users can re-enable this feature via `spark.submit.proxyUser.allowCustomClasspathInClusterMode`.
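For illustration, a submission that opts back into custom classpaths might look like the sketch below. The `--conf` key is the one introduced by this change; the master URL, proxy user, classpath, and application names are placeholders, not taken from this patch:

```sh
# Hypothetical invocation: let a proxy-user submission in cluster mode
# keep its custom driver classpath (all names below are placeholders).
spark-submit \
  --master yarn \
  --deploy-mode cluster \
  --proxy-user alice \
  --conf spark.submit.proxyUser.allowCustomClasspathInClusterMode=true \
  --driver-class-path /opt/extra/libs/custom.jar \
  --class com.example.Main \
  app.jar
```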
### How was this patch tested?

Manually tested.

Closes #39474 from Ngone51/dev.

Lead-authored-by: Peter Toth
Co-authored-by: Yi Wu
Signed-off-by: Hyukjin Kwon
(cherry picked from commit 909da96e1471886a01a9e1def93630c4fd40e74a)

Closes #41428 from degant/spark-41958-3.3.

Lead-authored-by: Degant Puri
Co-authored-by: Peter Toth
Signed-off-by: Dongjoon Hyun
---
 .../org/apache/spark/deploy/SparkSubmit.scala   | 15 +++++++++++++++
 .../apache/spark/internal/config/package.scala  |  7 +++++++
 docs/core-migration-guide.md                    |  2 ++
 3 files changed, 24 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index cf840879dcf9c..d0f7805efea66 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -310,6 +310,10 @@ private[spark] class SparkSubmit extends Logging {
     val isKubernetesClient = clusterManager == KUBERNETES && deployMode == CLIENT
     val isKubernetesClusterModeDriver = isKubernetesClient &&
       sparkConf.getBoolean("spark.kubernetes.submitInDriver", false)
+    val isCustomClasspathInClusterModeDisallowed =
+      !sparkConf.get(ALLOW_CUSTOM_CLASSPATH_BY_PROXY_USER_IN_CLUSTER_MODE) &&
+      args.proxyUser != null &&
+      (isYarnCluster || isMesosCluster || isStandAloneCluster || isKubernetesCluster)
 
     if (!isMesosCluster && !isStandAloneCluster) {
       // Resolve maven dependencies if there are any and add classpath to jars. Add them to py-files
@@ -870,6 +874,13 @@ private[spark] class SparkSubmit extends Logging {
 
     sparkConf.set("spark.app.submitTime", System.currentTimeMillis().toString)
 
+    if (childClasspath.nonEmpty && isCustomClasspathInClusterModeDisallowed) {
+      logWarning(s"Ignoring classpath ${childClasspath.mkString(", ")} with proxy user specified " +
+        s"in cluster mode when ${ALLOW_CUSTOM_CLASSPATH_BY_PROXY_USER_IN_CLUSTER_MODE.key} is " +
+        s"disabled")
+      childClasspath.clear()
+    }
+
     (childArgs.toSeq, childClasspath.toSeq, sparkConf, childMainClass)
   }
 
@@ -923,6 +934,10 @@ private[spark] class SparkSubmit extends Logging {
       logInfo(s"Classpath elements:\n${childClasspath.mkString("\n")}")
       logInfo("\n")
     }
+    assert(!(args.deployMode == "cluster" && args.proxyUser != null && childClasspath.nonEmpty) ||
+      sparkConf.get(ALLOW_CUSTOM_CLASSPATH_BY_PROXY_USER_IN_CLUSTER_MODE),
+      s"Classpath of spark-submit should not change in cluster mode if a proxy user is specified " +
+        s"when ${ALLOW_CUSTOM_CLASSPATH_BY_PROXY_USER_IN_CLUSTER_MODE.key} is disabled")
     val loader = getSubmitClassLoader(sparkConf)
     for (jar <- childClasspath) {
       addJarToClasspath(jar, loader)
diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala
index aa8f63e14efc7..74247a1a2ad92 100644
--- a/core/src/main/scala/org/apache/spark/internal/config/package.scala
+++ b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -2355,4 +2355,11 @@ package object config {
       .version("3.3.0")
       .intConf
       .createWithDefault(5)
+
+  private[spark] val ALLOW_CUSTOM_CLASSPATH_BY_PROXY_USER_IN_CLUSTER_MODE =
+    ConfigBuilder("spark.submit.proxyUser.allowCustomClasspathInClusterMode")
+      .internal()
+      .version("3.3.3")
+      .booleanConf
+      .createWithDefault(true)
 }
diff --git a/docs/core-migration-guide.md b/docs/core-migration-guide.md
index 745b80d6eecb2..50c91b7f156cf 100644
--- a/docs/core-migration-guide.md
+++ b/docs/core-migration-guide.md
@@ -26,6 +26,8 @@ license: |
 
 - Since Spark 3.3, Spark migrates its log4j dependency from 1.x to 2.x because log4j 1.x has reached end of life and is no longer supported by the community. Vulnerabilities reported after August 2015 against log4j 1.x were not checked and will not be fixed. Users should rewrite original log4j properties files using log4j2 syntax (XML, JSON, YAML, or properties format). Spark rewrites the `conf/log4j.properties.template` which is included in Spark distribution, to `conf/log4j2.properties.template` with log4j2 properties format.
 
+- Since Spark 3.3.3, `spark.submit.proxyUser.allowCustomClasspathInClusterMode` allows users to disable custom classpaths set by proxy users in cluster mode. It still defaults to `true` to maintain backward compatibility.
+
 ## Upgrading from Core 3.1 to 3.2
 
 - Since Spark 3.2, `spark.scheduler.allocation.file` supports read remote file using hadoop filesystem which means if the path has no scheme Spark will respect hadoop configuration to read it. To restore the behavior before Spark 3.2, you can specify the local scheme for `spark.scheduler.allocation.file` e.g. `file:///path/to/file`.
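As a companion to the migration-guide entry above: operators who prefer the stricter behavior everywhere can turn the flag off globally. A minimal sketch, assuming the standard `conf/spark-defaults.conf` layout (only the config key itself comes from this patch):

```sh
# Hypothetical one-liner: make cluster-mode proxy-user submissions ignore
# any custom classpath (spark-submit warns and clears it at submit time).
echo "spark.submit.proxyUser.allowCustomClasspathInClusterMode false" >> conf/spark-defaults.conf
```

With the flag set to `false`, spark-submit drops any custom classpath whenever a proxy user is combined with a cluster deploy mode, logging a warning instead of honoring it.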