[SPARK-16630][YARN] Blacklist a node if executors won't launch on it #21068

Closed · wants to merge 16 commits
@@ -77,7 +77,7 @@ private[scheduler] class BlacklistTracker (
* An immutable copy of the set of nodes that are currently blacklisted. Kept in an
* AtomicReference to make [[nodeBlacklist()]] thread-safe.
*/
private val _nodeBlacklist = new AtomicReference[Set[String]](Set())
private val _nodeBlacklist = new AtomicReference[Map[String, Long]](Map())
/**
* Time when the next blacklist will expire. Used as a
* shortcut to avoid iterating over all entries in the blacklist when none will have expired.
@@ -126,7 +126,7 @@ private[scheduler] class BlacklistTracker (
nodeIdToBlacklistExpiryTime.remove(node)
listenerBus.post(SparkListenerNodeUnblacklisted(now, node))
}
_nodeBlacklist.set(nodeIdToBlacklistExpiryTime.keySet.toSet)
updateNodeBlacklist()
}
updateNextExpiryTime()
}
@@ -196,7 +196,7 @@ private[scheduler] class BlacklistTracker (

nodeIdToBlacklistExpiryTime.put(host, expiryTimeForNewBlacklists)
listenerBus.post(SparkListenerNodeBlacklisted(now, host, 1))
_nodeBlacklist.set(nodeIdToBlacklistExpiryTime.keySet.toSet)
updateNodeBlacklist()
killExecutorsOnBlacklistedNode(host)
updateNextExpiryTime()
}
@@ -216,6 +216,10 @@ private[scheduler] class BlacklistTracker (
}
}

private def updateNodeBlacklist(): Unit = {
Contributor:
This function seems unnecessary to me; I don't see it adding any value vs. doing it inline.

Contributor Author:
ok

_nodeBlacklist.set(collection.immutable.Map(nodeIdToBlacklistExpiryTime.toSeq: _*))
}
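For illustration, a minimal sketch of the inlined form being discussed above (hypothetical, not taken from the PR; toMap on the mutable map already yields an immutable snapshot, equivalent to the immutable.Map(... toSeq: _*) construction):

// Sketch: drop the helper and set the snapshot directly at each call site.
_nodeBlacklist.set(nodeIdToBlacklistExpiryTime.toMap)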

def updateBlacklistForSuccessfulTaskSet(
stageId: Int,
stageAttemptId: Int,
@@ -258,7 +262,7 @@ private[scheduler] class BlacklistTracker (
s"executors blacklisted: ${blacklistedExecsOnNode}")
nodeIdToBlacklistExpiryTime.put(node, expiryTimeForNewBlacklists)
listenerBus.post(SparkListenerNodeBlacklisted(now, node, blacklistedExecsOnNode.size))
_nodeBlacklist.set(nodeIdToBlacklistExpiryTime.keySet.toSet)
updateNodeBlacklist()
killExecutorsOnBlacklistedNode(node)
}
}
@@ -273,7 +277,7 @@ private[scheduler] class BlacklistTracker (
* Get the full set of nodes that are blacklisted. Unlike other methods in this class, this *IS*
* thread-safe -- no lock required on a taskScheduler.
*/
def nodeBlacklist(): Set[String] = {
def nodeBlacklist(): Map[String, Long] = {
Contributor:
update the comment here and on TaskSchedulerImpl.nodeBlacklist to explain the value.

_nodeBlacklist.get()
}
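One possible wording for the doc update the reviewer asks for (a sketch only; the same idea would apply to TaskSchedulerImpl.nodeBlacklistWithExpiryTimes):

/**
 * Get all nodes that are currently blacklisted, mapped to the time (in millis) at which
 * the blacklisting of each node expires. Unlike other methods in this class, this *IS*
 * thread-safe -- no lock required on a taskScheduler.
 */
def nodeBlacklist(): Map[String, Long] = {
  _nodeBlacklist.get()
}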

@@ -651,8 +651,8 @@ private[spark] class TaskSchedulerImpl(
* Get a snapshot of the currently blacklisted nodes for the entire application. This is
* thread-safe -- it can be called without a lock on the TaskScheduler.
*/
def nodeBlacklist(): scala.collection.immutable.Set[String] = {
blacklistTrackerOpt.map(_.nodeBlacklist()).getOrElse(scala.collection.immutable.Set())
def nodeBlacklistWithExpiryTimes(): scala.collection.immutable.Map[String, Long] = {
Contributor:
Why not just Map[String, Long]?

I kinda find it odd when I see these types used this way, so unless there's a good reason...

blacklistTrackerOpt.map(_.nodeBlacklist()).getOrElse(scala.collection.immutable.Map())
}
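What the reviewer's simplification would roughly look like (a sketch; it assumes Map is not shadowed by a mutable import in TaskSchedulerImpl, so the plain name already refers to the immutable type):

def nodeBlacklistWithExpiryTimes(): Map[String, Long] = {
  blacklistTrackerOpt.map(_.nodeBlacklist()).getOrElse(Map.empty)
}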

// By default, rack is unknown
@@ -110,7 +110,7 @@ private[spark] object CoarseGrainedClusterMessages {
requestedTotal: Int,
localityAwareTasks: Int,
hostToLocalTaskCount: Map[String, Int],
nodeBlacklist: Set[String])
nodeBlacklistWithExpiryTimes: Map[String, Long])
extends CoarseGrainedClusterMessage

// Check if an executor was force-killed but for a reason unrelated to the running tasks.
@@ -170,8 +170,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
if (executorDataMap.contains(executorId)) {
executorRef.send(RegisterExecutorFailed("Duplicate executor ID: " + executorId))
context.reply(true)
} else if (scheduler.nodeBlacklist != null &&
scheduler.nodeBlacklist.contains(hostname)) {
} else if (scheduler.nodeBlacklistWithExpiryTimes != null &&
Contributor:
nodeBlacklistWithExpiryTimes is never null right?

Also calling that method twice causes unnecessary computation...

scheduler.nodeBlacklistWithExpiryTimes.contains(hostname)) {
// If the cluster manager gives us an executor on a blacklisted node (because it
// already started allocating those resources before we informed it of our blacklist,
// or if it ignored our blacklist), then we reject that executor immediately.
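A minimal sketch of the change being asked for here (hypothetical; it caches the snapshot in a local val so the method is evaluated once, and drops the null check since the tracker always returns a map):

// Sketch: evaluate the blacklist snapshot once per RegisterExecutor message.
val blacklistedNodes = scheduler.nodeBlacklistWithExpiryTimes()
if (executorDataMap.contains(executorId)) {
  executorRef.send(RegisterExecutorFailed("Duplicate executor ID: " + executorId))
  context.reply(true)
} else if (blacklistedNodes.contains(hostname)) {
  // reject the executor immediately, as in the existing branch
}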
@@ -272,7 +272,7 @@ private class FakeSchedulerBackend(

protected override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = {
clusterManagerEndpoint.ask[Boolean](
RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount, Set.empty[String]))
RequestExecutors(requestedTotal, localityAwareTasks, hostToLocalTaskCount, Map.empty))
}

protected override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = {
@@ -501,7 +501,8 @@ class StandaloneDynamicAllocationSuite

// Get "localhost" on a blacklist.
val taskScheduler = mock(classOf[TaskSchedulerImpl])
when(taskScheduler.nodeBlacklist()).thenReturn(Set("blacklisted-host"))
when(taskScheduler.nodeBlacklistWithExpiryTimes())
.thenReturn(Map("blacklisted-host" -> Long.MaxValue))
when(taskScheduler.sc).thenReturn(sc)
sc.taskScheduler = taskScheduler

@@ -177,7 +177,7 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M
"hostA", exec = "1", index = partition, failureReason = "testing")
}
blacklist.updateBlacklistForSuccessfulTaskSet(0, 0, taskSetBlacklist0.execToFailures)
assert(blacklist.nodeBlacklist() === Set())
assert(blacklist.nodeBlacklist().keySet === Set())
assertEquivalentToSet(blacklist.isNodeBlacklisted(_), Set())
assertEquivalentToSet(blacklist.isExecutorBlacklisted(_), Set("1"))
verify(listenerBusMock).post(SparkListenerExecutorBlacklisted(0, "1", 4))
@@ -191,7 +191,7 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M
"hostA", exec = "2", index = partition, failureReason = "testing")
}
blacklist.updateBlacklistForSuccessfulTaskSet(0, 0, taskSetBlacklist1.execToFailures)
assert(blacklist.nodeBlacklist() === Set("hostA"))
assert(blacklist.nodeBlacklist().keySet === Set("hostA"))
assertEquivalentToSet(blacklist.isNodeBlacklisted(_), Set("hostA"))
verify(listenerBusMock).post(SparkListenerNodeBlacklisted(0, "hostA", 2))
assertEquivalentToSet(blacklist.isExecutorBlacklisted(_), Set("1", "2"))
@@ -202,7 +202,7 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M
val timeout = blacklist.BLACKLIST_TIMEOUT_MILLIS + 1
clock.advance(timeout)
blacklist.applyBlacklistTimeout()
assert(blacklist.nodeBlacklist() === Set())
assert(blacklist.nodeBlacklist().keySet === Set())
assertEquivalentToSet(blacklist.isNodeBlacklisted(_), Set())
assertEquivalentToSet(blacklist.isExecutorBlacklisted(_), Set())
verify(listenerBusMock).post(SparkListenerExecutorUnblacklisted(timeout, "2"))
@@ -215,7 +215,7 @@ class BlacklistTrackerSuite extends SparkFunSuite with BeforeAndAfterEach with M
taskSetBlacklist2.updateBlacklistForFailedTask(
"hostA", exec = "1", index = 0, failureReason = "testing")
blacklist.updateBlacklistForSuccessfulTaskSet(2, 0, taskSetBlacklist2.execToFailures)
assert(blacklist.nodeBlacklist() === Set())
assert(blacklist.nodeBlacklist().keySet === Set())
assertEquivalentToSet(blacklist.isNodeBlacklisted(_), Set())
assertEquivalentToSet(blacklist.isExecutorBlacklisted(_), Set())
}
@@ -729,7 +729,7 @@ private[spark] class ApplicationMaster(args: ApplicationMasterArguments) extends
Option(allocator) match {
case Some(a) =>
if (a.requestTotalExecutorsWithPreferredLocalities(r.requestedTotal,
r.localityAwareTasks, r.hostToLocalTaskCount, r.nodeBlacklist)) {
r.localityAwareTasks, r.hostToLocalTaskCount, r.nodeBlacklistWithExpiryTimes)) {
resetAllocatorInterval()
}
context.reply(true)
@@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.deploy.yarn

import scala.collection.mutable

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.util.{Clock, SystemClock}

private[spark] class FailureWithinTimeIntervalTracker(sparkConf: SparkConf) extends Logging {
Contributor:
Add scaladoc explaining what this does?
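A possible scaladoc along the lines being requested (the wording is a sketch, not from the PR):

/**
 * Tracks executor failures, both globally and per host, and reports how many of them
 * happened within the configured validity interval. Failures older than the interval
 * are dropped and no longer counted.
 */
private[spark] class FailureWithinTimeIntervalTracker(sparkConf: SparkConf) extends Logging {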


private var clock: Clock = new SystemClock
Contributor:
This should be a constructor argument.
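The constructor-argument variant could look roughly like this (a sketch; the default value is just one way to keep existing call sites unchanged, and it would make the setClock method below redundant):

private[spark] class FailureWithinTimeIntervalTracker(
    sparkConf: SparkConf,
    clock: Clock = new SystemClock) extends Logging {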


private val executorFailuresValidityInterval =
sparkConf.get(config.EXECUTOR_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS).getOrElse(-1L)

// Queue to store the timestamp of failed executors for each host
private val failedExecutorsTimeStampsPerHost = mutable.Map[String, mutable.Queue[Long]]()

private val failedExecutorsTimeStamps = new mutable.Queue[Long]()

private def getNumFailuresWithinValidityInterval(
Contributor:
It's not really clear what a 'validity interval' is. I think it means that only failures that have happened recently are considered valid? I think it would be clearer to call this getNumFailuresSince(), or getRecentFailureCount() or similar, and explicitly pass in the timestamp the caller wants to consider failures since.

If you do the latter, and drop the endTime argument, then you partly address the issue I raise below about how this mutates state, because getRecentFailureCount() suggests more clearly that it's expecting to take into account the current time.

Contributor Author:
Thanks, then getRecentFailureCount will be the method name, without the endTime argument.

failedExecutorsTimeStampsForHost: mutable.Queue[Long],
endTime: Long): Int = {
while (executorFailuresValidityInterval > 0
&& failedExecutorsTimeStampsForHost.nonEmpty
Contributor:
nit: && goes on the previous line; double indent the following lines to separate them from the body below.

&& failedExecutorsTimeStampsForHost.head < endTime - executorFailuresValidityInterval) {
Contributor:
This relies on the fact that the clock is monotonic, but if it's a SystemClock it's based on System.currentTimeMillis(), which is not monotonic and can time-travel.

Contributor Author:
This code is coming from YarnAllocator.scala#L175.

As I see it, the common solution in Spark is to use Clock, SystemClock and ManualClock. And here the validity interval is much longer than the adjustment NTP can apply.

failedExecutorsTimeStampsForHost.dequeue()
Contributor:
It's counter-intuitive that this get* method mutates state. If I called

getNumFailuresWithinValidityInterval(foo, 0)
getNumFailuresWithinValidityInterval(foo, 10)
getNumFailuresWithinValidityInterval(foo, 0)

The last call can return something different from the first because all the failures that weren't within 10 - executorFailuresValidityInterval will have been dropped.

Contributor Author:
Ok, I will take your recommendation: drop endTime and rename the method to getRecentFailureCount.

}
failedExecutorsTimeStampsForHost.size
}
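Based on the agreement above, a rough sketch of the renamed method (illustrative only; it reads the current time from the clock instead of taking an endTime parameter):

private def getRecentFailureCount(failureTimestamps: mutable.Queue[Long]): Int = {
  val now = clock.getTimeMillis()
  // Drop failures that fall outside the validity interval, then count the rest.
  while (executorFailuresValidityInterval > 0 &&
      failureTimestamps.nonEmpty &&
      failureTimestamps.head < now - executorFailuresValidityInterval) {
    failureTimestamps.dequeue()
  }
  failureTimestamps.size
}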

/**
* Use a different clock. This is mainly used for testing.
*/
def setClock(newClock: Clock): Unit = {
clock = newClock
}

def getNumExecutorsFailed: Int = synchronized {
getNumFailuresWithinValidityInterval(failedExecutorsTimeStamps, clock.getTimeMillis())
}

def registerFailureOnHost(hostname: String): Unit = synchronized {
val timeMillis = clock.getTimeMillis()
failedExecutorsTimeStamps.enqueue(timeMillis)
val failedExecutorsOnHost =
failedExecutorsTimeStampsPerHost.getOrElse(hostname, {
val failureOnHost = mutable.Queue[Long]()
failedExecutorsTimeStampsPerHost.put(hostname, failureOnHost)
failureOnHost
})
failedExecutorsOnHost.enqueue(timeMillis)
}

def registerExecutorFailure(): Unit = synchronized {
val timeMillis = clock.getTimeMillis()
failedExecutorsTimeStamps.enqueue(timeMillis)
}

def getNumExecutorFailuresOnHost(hostname: String): Int =
Contributor:
nit: add braces in multi-line methods.

Also, I know the YARN module is generally this way, but this seems like a good chance to start to use the more usual Spark convention of not naming methods with "get" (and also take the chance to make the name shorter). e.g., numFailuresOnHost or some variation of that.

failedExecutorsTimeStampsPerHost.get(hostname).map { failedExecutorsOnHost =>
getNumFailuresWithinValidityInterval(failedExecutorsOnHost, clock.getTimeMillis())
}.getOrElse(0)

}
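Following the naming nit, a hypothetical version with braces and without the get prefix (same behaviour as the method above):

def numFailuresOnHost(hostname: String): Int = {
  failedExecutorsTimeStampsPerHost.get(hostname).map { failedExecutorsOnHost =>
    getNumFailuresWithinValidityInterval(failedExecutorsOnHost, clock.getTimeMillis())
  }.getOrElse(0)
}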
@@ -42,7 +42,7 @@ import org.apache.spark.scheduler.{ExecutorExited, ExecutorLossReason}
import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RemoveExecutor
import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages.RetrieveLastAllocatedExecutorId
import org.apache.spark.scheduler.cluster.SchedulerBackendUtils
import org.apache.spark.util.{Clock, SystemClock, ThreadUtils}
import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils}

/**
* YarnAllocator is charged with requesting containers from the YARN ResourceManager and deciding
@@ -102,18 +102,14 @@ private[yarn] class YarnAllocator(
private var executorIdCounter: Int =
driverRef.askSync[Int](RetrieveLastAllocatedExecutorId)

// Queue to store the timestamp of failed executors
private val failedExecutorsTimeStamps = new Queue[Long]()
private val failureWithinTimeIntervalTracker = new FailureWithinTimeIntervalTracker(sparkConf)

private var clock: Clock = new SystemClock

private val executorFailuresValidityInterval =
sparkConf.get(EXECUTOR_ATTEMPT_FAILURE_VALIDITY_INTERVAL_MS).getOrElse(-1L)
private val allocatorBlacklistTracker =
new YarnAllocatorBlacklistTracker(sparkConf, amClient, failureWithinTimeIntervalTracker)

@volatile private var targetNumExecutors =
SchedulerBackendUtils.getInitialTargetExecutorNumber(sparkConf)

private var currentNodeBlacklist = Set.empty[String]

// Executor loss reason requests that are pending - maps from executor ID for inquiry to a
// list of requesters that should be responded to once we find out why the given executor
@@ -149,7 +145,6 @@ private[yarn] class YarnAllocator(

private val labelExpression = sparkConf.get(EXECUTOR_NODE_LABEL_EXPRESSION)


// A map to store preferred hostname and possible task numbers running on it.
private var hostToLocalTaskCounts: Map[String, Int] = Map.empty

@@ -160,26 +155,12 @@ private[yarn] class YarnAllocator(
private[yarn] val containerPlacementStrategy =
new LocalityPreferredContainerPlacementStrategy(sparkConf, conf, resource, resolver)

/**
* Use a different clock for YarnAllocator. This is mainly used for testing.
*/
def setClock(newClock: Clock): Unit = {
clock = newClock
}

def getNumExecutorsRunning: Int = runningExecutors.size()

def getNumExecutorsFailed: Int = synchronized {
val endTime = clock.getTimeMillis()
def getNumExecutorsFailed: Int = failureWithinTimeIntervalTracker.getNumExecutorsFailed

while (executorFailuresValidityInterval > 0
&& failedExecutorsTimeStamps.nonEmpty
&& failedExecutorsTimeStamps.head < endTime - executorFailuresValidityInterval) {
failedExecutorsTimeStamps.dequeue()
}

failedExecutorsTimeStamps.size
}
def getFailureWithinTimeIntervalTracker: FailureWithinTimeIntervalTracker =
Contributor:
This name is kinda awkward. How about just getFailureTracker or something simple along those lines? That could be applied to the class name also.

(e.g. if you add any logic that is not time-based to that class, then its name becomes inaccurate.)

Also instead of a getter you could just make the field public (it's already a val).

failureWithinTimeIntervalTracker
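A sketch of the reviewer's alternatives (hypothetical names, assuming the class were also renamed to FailureTracker as suggested):

// Option 1: make the field itself accessible instead of adding a getter.
val failureTracker = new FailureTracker(sparkConf)

// Option 2: keep a getter, but with a shorter name.
def getFailureTracker: FailureTracker = failureTracker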

/**
* A sequence of pending container requests that have not yet been fulfilled.
@@ -204,35 +185,23 @@ private[yarn] class YarnAllocator(
* @param localityAwareTasks number of locality aware tasks to be used as container placement hint
* @param hostToLocalTaskCount a map of preferred hostname to possible task counts to be used as
* container placement hint.
* @param nodeBlacklist a set of blacklisted nodes, which is passed in to avoid allocating new
* containers on them. It will be used to update the application master's
* blacklist.
* @param schedulerBlacklistedNodesWithExpiry blacklisted nodes with expiry times, which is passed
Contributor:
Another name that is a mouthful... the previous name was fine. Or maybe schedulerBlacklist if you want to emphasize where it is supposed to come from.

* in to avoid allocating new containers on them. It will be used to update
Contributor:
nit: align with start of comment on previous line

* the application master's blacklist.
* @return Whether the new requested total is different than the old value.
*/
def requestTotalExecutorsWithPreferredLocalities(
requestedTotal: Int,
localityAwareTasks: Int,
hostToLocalTaskCount: Map[String, Int],
nodeBlacklist: Set[String]): Boolean = synchronized {
requestedTotal: Int,
Contributor:
double indent

localityAwareTasks: Int,
hostToLocalTaskCount: Map[String, Int],
schedulerBlacklistedNodesWithExpiry: Map[String, Long]): Boolean = synchronized {
this.numLocalityAwareTasks = localityAwareTasks
this.hostToLocalTaskCounts = hostToLocalTaskCount

if (requestedTotal != targetNumExecutors) {
logInfo(s"Driver requested a total number of $requestedTotal executor(s).")
targetNumExecutors = requestedTotal

// Update blacklist infomation to YARN ResouceManager for this application,
// in order to avoid allocating new Containers on the problematic nodes.
val blacklistAdditions = nodeBlacklist -- currentNodeBlacklist
val blacklistRemovals = currentNodeBlacklist -- nodeBlacklist
if (blacklistAdditions.nonEmpty) {
logInfo(s"adding nodes to YARN application master's blacklist: $blacklistAdditions")
}
if (blacklistRemovals.nonEmpty) {
logInfo(s"removing nodes from YARN application master's blacklist: $blacklistRemovals")
}
amClient.updateBlacklist(blacklistAdditions.toList.asJava, blacklistRemovals.toList.asJava)
currentNodeBlacklist = nodeBlacklist
allocatorBlacklistTracker.setSchedulerBlacklistedNodes(schedulerBlacklistedNodesWithExpiry)
true
} else {
false
@@ -268,6 +237,7 @@ private[yarn] class YarnAllocator(
val allocateResponse = amClient.allocate(progressIndicator)

val allocatedContainers = allocateResponse.getAllocatedContainers()
allocatorBlacklistTracker.setNumClusterNodes(allocateResponse.getNumClusterNodes)

if (allocatedContainers.size > 0) {
logDebug(("Allocated containers: %d. Current executor count: %d. " +
@@ -602,8 +572,7 @@ private[yarn] class YarnAllocator(
completedContainer.getDiagnostics,
PMEM_EXCEEDED_PATTERN))
case _ =>
// Enqueue the timestamp of failed executor
failedExecutorsTimeStamps.enqueue(clock.getTimeMillis())
allocatorBlacklistTracker.handleResourceAllocationFailure(hostOpt)
Contributor:
Are we actually totally sure this is an allocation failure? It's really any failure which isn't covered by the cases above, right? For example, it could be somebody logging in to the box and running kill -9 on the executor (people do weird things), or sudden hardware failure, etc. I think the handling is still fine, but I'd add a comment here explaining a bit.

If we really want to separate out allocation failures, we'd need the handling to be a little more complex; we'd have to keep track of executors that have registered with the driver.

@tgravescs sound ok?

(true, "Container marked as failed: " + containerId + onHostStr +
". Exit status: " + completedContainer.getExitStatus +
". Diagnostics: " + completedContainer.getDiagnostics)