Merge remote-tracking branch 'upstream/master' into SPARK-32444
# Conflicts:
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q10.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q10/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q14a.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q14a/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q14b.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q14b/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q16/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23a/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23b.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q23b/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q33.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q33/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q35.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q35/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q5.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q5/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q56.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q56/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q58.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q58/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q60.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q60/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q69.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q69/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q70.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q70/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q83/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q93.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q93/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q94/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v1_4/q95/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q10a.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q10a/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q14.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q14/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q14a.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q14a/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q35.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q35/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q35a.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q35a/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q5a.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q5a/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q70a.sf100/explain.txt
#	sql/core/src/test/resources/tpcds-plan-stability/approved-plans-v2_7/q70a/explain.txt
#	sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala
wangyum committed Aug 28, 2020
2 parents c436bc4 + 5775073 commit b5bb9a2
Showing 461 changed files with 6,918 additions and 12,828 deletions.
26 changes: 20 additions & 6 deletions .github/workflows/build_and_test.yml
@@ -7,6 +7,11 @@ on:
pull_request:
branches:
- master
workflow_dispatch:
inputs:
target:
description: 'Target branch to run'
required: true

jobs:
# Build: build Spark and run the tests for specified modules.
@@ -82,18 +87,26 @@ jobs:
# GitHub Actions' default miniconda to use in pip packaging test.
CONDA_PREFIX: /usr/share/miniconda
GITHUB_PREV_SHA: ${{ github.event.before }}
GITHUB_INPUT_BRANCH: ${{ github.event.inputs.target }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
# In order to fetch changed files
with:
fetch-depth: 0
- name: Merge dispatched input branch
if: ${{ github.event.inputs.target != '' }}
run: git merge --progress --ff-only origin/${{ github.event.inputs.target }}
# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
- name: Cache Scala, SBT, Maven and Zinc
uses: actions/cache@v1
uses: actions/cache@v2
with:
path: build
key: build-${{ hashFiles('**/pom.xml') }}
path: |
build/apache-maven-*
build/zinc-*
build/scala-*
build/*.jar
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Maven local repository
@@ -107,7 +120,7 @@ jobs:
uses: actions/cache@v2
with:
path: ~/.ivy2/cache
key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml') }}-${{ hashFiles('**/plugins.sbt') }}
key: ${{ matrix.java }}-${{ matrix.hadoop }}-ivy-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
${{ matrix.java }}-${{ matrix.hadoop }}-ivy-
- name: Install JDK ${{ matrix.java }}
@@ -217,7 +230,7 @@ jobs:
run: |
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme
pip3 install flake8 'sphinx<3.1.0' numpy pydata_sphinx_theme ipython nbsphinx
- name: Install R 4.0
run: |
sudo sh -c "echo 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' >> /etc/apt/sources.list"
@@ -236,10 +249,11 @@ jobs:
ruby-version: 2.7
- name: Install dependencies for documentation generation
run: |
# pandoc is required to generate PySpark APIs as well in nbsphinx.
sudo apt-get install -y libcurl4-openssl-dev pandoc
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme
pip install 'sphinx<3.1.0' mkdocs numpy pydata_sphinx_theme ipython nbsphinx
gem install jekyll jekyll-redirect-from rouge
sudo Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')"
- name: Scala linter
1 change: 1 addition & 0 deletions binder/apt.txt
@@ -0,0 +1 @@
openjdk-8-jre
24 changes: 24 additions & 0 deletions binder/postBuild
@@ -0,0 +1,24 @@
#!/bin/bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This file is used for Binder integration to install PySpark available in
# Jupyter notebook.

VERSION=$(python -c "exec(open('python/pyspark/version.py').read()); print(__version__)")
pip install "pyspark[sql,ml,mllib]<=$VERSION"
@@ -290,7 +290,7 @@ public boolean sharedByteBufAllocators() {
}

/**
* If enabled then off-heap byte buffers will be prefered for the shared ByteBuf allocators.
* If enabled then off-heap byte buffers will be preferred for the shared ByteBuf allocators.
*/
public boolean preferDirectBufsForSharedByteBufAllocators() {
return conf.getBoolean("spark.network.io.preferDirectBufs", true);
@@ -41,7 +41,7 @@ public interface DriverPlugin {
* initialization.
* <p>
* It's recommended that plugins be careful about what operations are performed in this call,
* preferrably performing expensive operations in a separate thread, or postponing them until
* preferably performing expensive operations in a separate thread, or postponing them until
* the application has fully started.
*
* @param sc The SparkContext loading the plugin.
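
The recommendation above (keep the plugin's init call cheap and push expensive work to a separate thread) can be illustrated with a minimal sketch. It is not part of this commit: the class name WarmupDriverPlugin and the thread name are hypothetical, and it assumes the org.apache.spark.api.plugin.DriverPlugin and PluginContext API as shipped in Spark 3.x.

import java.util.{Collections, Map => JMap}

import org.apache.spark.SparkContext
import org.apache.spark.api.plugin.{DriverPlugin, PluginContext}

// Hypothetical plugin: init() returns immediately and defers expensive setup
// to a daemon thread, as the documentation above recommends.
class WarmupDriverPlugin extends DriverPlugin {
  override def init(sc: SparkContext, pluginContext: PluginContext): JMap[String, String] = {
    val warmup = new Thread(new Runnable {
      override def run(): Unit = {
        // Expensive or blocking initialization (cache warm-up, remote lookups, ...)
        // runs here instead of delaying application startup.
      }
    }, "warmup-driver-plugin")
    warmup.setDaemon(true)
    warmup.start()
    Collections.emptyMap() // no extra configuration to send to executors
  }
}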
@@ -88,7 +88,7 @@ private[spark] trait ExecutorAllocationClient {
* Default implementation delegates to kill, scheduler must override
* if it supports graceful decommissioning.
*
* @param executorsAndDecominfo identifiers of executors & decom info.
* @param executorsAndDecomInfo identifiers of executors & decom info.
* @param adjustTargetNumExecutors whether the target number of executors will be adjusted down
* after these executors have been decommissioned.
* @return the ids of the executors acknowledged by the cluster manager to be removed.
@@ -279,6 +279,9 @@ private[spark] class ExecutorAllocationManager(
numExecutorsTargetPerResourceProfileId.keys.foreach { rpId =>
numExecutorsTargetPerResourceProfileId(rpId) = initialNumExecutors
}
numExecutorsToAddPerResourceProfileId.keys.foreach { rpId =>
numExecutorsToAddPerResourceProfileId(rpId) = 1
}
executorMonitor.reset()
}

@@ -595,7 +598,7 @@ private[spark] class ExecutorAllocationManager(
// reset the newExecutorTotal to the existing number of executors
if (testing || executorsRemoved.nonEmpty) {
if (decommissionEnabled) {
executorMonitor.executorsDecommissioned(executorsRemoved)
executorMonitor.executorsDecommissioned(executorsRemoved.toSeq)
} else {
executorMonitor.executorsKilled(executorsRemoved.toSeq)
}
@@ -33,8 +33,9 @@ private class HistoryServerMemoryManager(
conf: SparkConf) extends Logging {

private val maxUsage = conf.get(MAX_IN_MEMORY_STORE_USAGE)
private val currentUsage = new AtomicLong(0L)
private val active = new HashMap[(String, Option[String]), Long]()
// Visible for testing.
private[history] val currentUsage = new AtomicLong(0L)
private[history] val active = new HashMap[(String, Option[String]), Long]()

def initialize(): Unit = {
logInfo("Initialized memory manager: " +
@@ -54,7 +54,8 @@ private[history] class HybridStore extends KVStore {
private var backgroundThread: Thread = null

// A hash map that stores all classes that had been writen to inMemoryStore
private val klassMap = new ConcurrentHashMap[Class[_], Boolean]
// Visible for testing
private[history] val klassMap = new ConcurrentHashMap[Class[_], Boolean]

override def getMetadata[T](klass: Class[T]): T = {
getStore().getMetadata(klass)
@@ -165,8 +166,9 @@ private[history] class HybridStore extends KVStore {

/**
* This method return the store that we should use.
* Visible for testing.
*/
private def getStore(): KVStore = {
private[history] def getStore(): KVStore = {
if (shouldUseInMemoryStore.get) {
inMemoryStore
} else {
@@ -29,7 +29,7 @@ import org.apache.spark.util.Utils.executeAndGetOutput
/**
* The default plugin that is loaded into a Spark application to control how custom
* resources are discovered. This executes the discovery script specified by the user
* and gets the json output back and contructs ResourceInformation objects from that.
* and gets the json output back and constructs ResourceInformation objects from that.
* If the user specifies custom plugins, this is the last one to be executed and
* throws if the resource isn't discovered.
*
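
As a small illustration (not part of this diff), the JSON printed by a discovery script maps directly onto a ResourceInformation object; the resource name and addresses below are made up.

import org.apache.spark.resource.ResourceInformation

object ResourceInfoExample {
  // A discovery script is expected to print JSON of the form:
  //   {"name": "gpu", "addresses": ["0", "1"]}
  // which the plugin turns into the equivalent of:
  val gpus = new ResourceInformation("gpu", Array("0", "1"))
}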
@@ -1825,7 +1825,7 @@ private[spark] class DAGScheduler(
if (bmAddress != null) {
val externalShuffleServiceEnabled = env.blockManager.externalShuffleServiceEnabled
val isHostDecommissioned = taskScheduler
.getExecutorDecommissionInfo(bmAddress.executorId)
.getExecutorDecommissionState(bmAddress.executorId)
.exists(_.isHostDecommissioned)

// Shuffle output of all executors on host `bmAddress.host` may be lost if:
@@ -18,11 +18,23 @@
package org.apache.spark.scheduler

/**
* Provides more detail when an executor is being decommissioned.
* Message providing more detail when an executor is being decommissioned.
* @param message Human readable reason for why the decommissioning is happening.
* @param isHostDecommissioned Whether the host (aka the `node` or `worker` in other places) is
* being decommissioned too. Used to infer if the shuffle data might
* be lost even if the external shuffle service is enabled.
*/
private[spark]
case class ExecutorDecommissionInfo(message: String, isHostDecommissioned: Boolean)

/**
* State related to decommissioning that is kept by the TaskSchedulerImpl. This state is derived
* from the info message above but it is kept distinct to allow the state to evolve independently
* from the message.
*/
case class ExecutorDecommissionState(
// Timestamp the decommissioning commenced as per the Driver's clock,
// to estimate when the executor might eventually be lost if EXECUTOR_DECOMMISSION_KILL_INTERVAL
// is configured.
startTime: Long,
isHostDecommissioned: Boolean)
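
A brief usage sketch (not part of the commit) of how the scheduler-side state above can be derived from an incoming ExecutorDecommissionInfo by stamping it with the driver's clock; it mirrors the TaskSchedulerImpl change later in this diff, and the package placement, object name, and method name here are hypothetical.

package org.apache.spark.scheduler

import org.apache.spark.util.{Clock, SystemClock}

// Hypothetical helper: record when decommissioning was first observed and keep
// the host-level flag from the incoming message.
private[spark] object DecommissionStateExample {
  def fromInfo(
      info: ExecutorDecommissionInfo,
      clock: Clock = new SystemClock): ExecutorDecommissionState = {
    ExecutorDecommissionState(
      startTime = clock.getTimeMillis(),
      isHostDecommissioned = info.isHostDecommissioned)
  }
}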
@@ -52,7 +52,7 @@ private[spark] class ShuffleMapStage(
* Partitions that either haven't yet been computed, or that were computed on an executor
* that has since been lost, so should be re-computed. This variable is used by the
* DAGScheduler to determine when a stage has completed. Task successes in both the active
* attempt for the stage or in earlier attempts for this stage can cause paritition ids to get
* attempt for the stage or in earlier attempts for this stage can cause partition ids to get
* removed from pendingPartitions. As a result, this variable may be inconsistent with the pending
* tasks in the TaskSetManager for the active attempt for the stage (the partitions stored here
* will always be a subset of the partitions that the TaskSetManager thinks are pending).
@@ -106,7 +106,7 @@ private[spark] trait TaskScheduler {
/**
* If an executor is decommissioned, return its corresponding decommission info
*/
def getExecutorDecommissionInfo(executorId: String): Option[ExecutorDecommissionInfo]
def getExecutorDecommissionState(executorId: String): Option[ExecutorDecommissionState]

/**
* Process a lost executor
@@ -141,7 +141,7 @@ private[spark] class TaskSchedulerImpl(

// We add executors here when we first get decommission notification for them. Executors can
// continue to run even after being asked to decommission, but they will eventually exit.
val executorsPendingDecommission = new HashMap[String, ExecutorDecommissionInfo]
val executorsPendingDecommission = new HashMap[String, ExecutorDecommissionState]

// When they exit and we know of that via heartbeat failure, we will add them to this cache.
// This cache is consulted to know if a fetch failure is because a source executor was
@@ -152,7 +152,7 @@
.ticker(new Ticker{
override def read(): Long = TimeUnit.MILLISECONDS.toNanos(clock.getTimeMillis())
})
.build[String, ExecutorDecommissionInfo]()
.build[String, ExecutorDecommissionState]()
.asMap()

def runningTasksByExecutors: Map[String, Int] = synchronized {
@@ -293,7 +293,7 @@ private[spark] class TaskSchedulerImpl(
private[scheduler] def createTaskSetManager(
taskSet: TaskSet,
maxTaskFailures: Int): TaskSetManager = {
new TaskSetManager(this, taskSet, maxTaskFailures, blacklistTrackerOpt)
new TaskSetManager(this, taskSet, maxTaskFailures, blacklistTrackerOpt, clock)
}

override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = synchronized {
Expand Down Expand Up @@ -922,22 +922,36 @@ private[spark] class TaskSchedulerImpl(
synchronized {
// Don't bother noting decommissioning for executors that we don't know about
if (executorIdToHost.contains(executorId)) {
// The scheduler can get multiple decommission updates from multiple sources,
// and some of those can have isHostDecommissioned false. We merge them such that
// if we heard isHostDecommissioned ever true, then we keep that one since it is
// most likely coming from the cluster manager and thus authoritative
val oldDecomInfo = executorsPendingDecommission.get(executorId)
if (!oldDecomInfo.exists(_.isHostDecommissioned)) {
executorsPendingDecommission(executorId) = decommissionInfo
val oldDecomStateOpt = executorsPendingDecommission.get(executorId)
val newDecomState = if (oldDecomStateOpt.isEmpty) {
// This is the first time we are hearing of decommissioning this executor,
// so create a brand new state.
ExecutorDecommissionState(
clock.getTimeMillis(),
decommissionInfo.isHostDecommissioned)
} else {
val oldDecomState = oldDecomStateOpt.get
if (!oldDecomState.isHostDecommissioned && decommissionInfo.isHostDecommissioned) {
// Only the cluster manager is allowed to send decommission messages with
// isHostDecommissioned set. So the new decommissionInfo is from the cluster
// manager and is thus authoritative. Flip isHostDecommissioned to true but keep the old
// decommission start time.
ExecutorDecommissionState(
oldDecomState.startTime,
isHostDecommissioned = true)
} else {
oldDecomState
}
}
executorsPendingDecommission(executorId) = newDecomState
}
}
rootPool.executorDecommission(executorId)
backend.reviveOffers()
}

override def getExecutorDecommissionInfo(executorId: String)
: Option[ExecutorDecommissionInfo] = synchronized {
override def getExecutorDecommissionState(executorId: String)
: Option[ExecutorDecommissionState] = synchronized {
executorsPendingDecommission
.get(executorId)
.orElse(Option(decommissionedExecutorsRemoved.get(executorId)))
@@ -948,14 +962,14 @@
val reason = givenReason match {
// Handle executor process loss due to decommissioning
case ExecutorProcessLost(message, origWorkerLost, origCausedByApp) =>
val executorDecommissionInfo = getExecutorDecommissionInfo(executorId)
val executorDecommissionState = getExecutorDecommissionState(executorId)
ExecutorProcessLost(
message,
// Also mark the worker lost if we know that the host was decommissioned
origWorkerLost || executorDecommissionInfo.exists(_.isHostDecommissioned),
origWorkerLost || executorDecommissionState.exists(_.isHostDecommissioned),
// Executor loss is certainly not caused by app if we knew that this executor is being
// decommissioned
causedByApp = executorDecommissionInfo.isEmpty && origCausedByApp)
causedByApp = executorDecommissionState.isEmpty && origCausedByApp)
case e => e
}

@@ -1047,8 +1061,8 @@
}


val decomInfo = executorsPendingDecommission.remove(executorId)
decomInfo.foreach(decommissionedExecutorsRemoved.put(executorId, _))
val decomState = executorsPendingDecommission.remove(executorId)
decomState.foreach(decommissionedExecutorsRemoved.put(executorId, _))

if (reason != LossReasonPending) {
executorIdToHost -= executorId
@@ -1085,12 +1099,12 @@

// exposed for test
protected final def isExecutorDecommissioned(execId: String): Boolean =
getExecutorDecommissionInfo(execId).nonEmpty
getExecutorDecommissionState(execId).isDefined

// exposed for test
protected final def isHostDecommissioned(host: String): Boolean = {
hostToExecutors.get(host).exists { executors =>
executors.exists(e => getExecutorDecommissionInfo(e).exists(_.isHostDecommissioned))
executors.exists(e => getExecutorDecommissionState(e).exists(_.isHostDecommissioned))
}
}
