apache · HeartSaVioR · Feb 7, 2022 · Feb 7, 2022 · Feb 7, 2022 · Feb 8, 2022
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
@@ -90,6 +90,35 @@ case class ClusteredDistribution(
   }
 }
 
+/**
+ * Represents the distribution requirement of a stateful operator.
+ *
+ * Each partition of a stateful operator initializes its own state store(s), which are
+ * independent of the state store(s) in other partitions. Since the data in a state store cannot
+ * be repartitioned, Spark must keep the physical partitioning of the stateful operator unchanged
+ * across Spark versions. Violating this requirement may cause silent correctness issues.
+ *
+ * Since this distribution relies on [[HashPartitioning]] for the physical partitioning of the
+ * stateful operator, only [[HashPartitioning]] (and HashPartitioning in
+ * [[PartitioningCollection]]) can satisfy this distribution.
+ */
+case class StatefulOpClusteredDistribution(
+    expressions: Seq[Expression],
+    requiredNumPartitions: Option[Int] = None) extends Distribution {
+  require(expressions != Nil,
+    "The expressions for hash of a StatefulOpClusteredDistribution should not be Nil. " +
+      "An AllTuples should be used to represent a distribution that only has " +
+      "a single partition.")
+
+  override def createPartitioning(numPartitions: Int): Partitioning = {
+    requiredNumPartitions.foreach { required =>
+      assert(required == numPartitions, s"This StatefulOpClusteredDistribution requires " +
+        s"$required partitions, but the actual number of partitions is $numPartitions.")
+    }
+    HashPartitioning(expressions, numPartitions)
+  }
+}
+
 /**
  * Represents data where tuples have been ordered according to the `ordering`
  * [[Expression Expressions]]. Its requirement is defined as the following:
@@ -200,6 +229,11 @@ case object SinglePartition extends Partitioning {
  * Represents a partitioning where rows are split up across partitions based on the hash
  * of `expressions`.  All rows where `expressions` evaluate to the same values are guaranteed to be
  * in the same partition.
+ *
+ * Since [[StatefulOpClusteredDistribution]] relies on this partitioning and Spark requires
+ * stateful operators to retain the same physical partitioning during the lifetime of the query
+ * (including restarts), the result of evaluating `partitionIdExpression` must remain unchanged
+ * across Spark versions. Violating this requirement may cause silent correctness issues.
 test("streaming join should require HashClusteredDistribution from children") {
   // Two in-memory sources; the right side is additionally repartitioned on 'b alone.
   val leftInput = MemoryStream[Int]
   val rightInput = MemoryStream[Int]
   val leftDf = leftInput.toDF.select('value as 'a, 'value * 2 as 'b)
   val rightDf = rightInput.toDF.select('value as 'a, 'value * 2 as 'b).repartition('b)
   val joined = leftDf.join(rightDf, Seq("a", "b")).select('a)

   testStream(joined)(
     AddData(leftInput, (1 to 1000): _*),
     AddData(rightInput, (1 to 1000): _*),
     CheckAnswer((1 to 1000): _*),
     Execute { query =>
       // Names of the partitioning expressions; anything that is not an
       // AttributeReference fails the partial match with a MatchError.
       def partitionExpressionsColumns(expressions: Seq[Expression]): Seq[String] =
         expressions.map { case ref: AttributeReference => ref.name }

       val expectedColumns = Seq("a", "b")
       val numPartitions = spark.sqlContext.conf.getConf(SQLConf.SHUFFLE_PARTITIONS)

       // Verify the query plan: both join children must be shuffle exchanges that are
       // hash partitioned on ("a", "b") with the configured number of shuffle partitions.
       val matchingJoins = query.lastExecution.executedPlan.collect {
         case j @ StreamingSymmetricHashJoinExec(_, _, _, _, _, _, _, _,
           ShuffleExchangeExec(opA: HashPartitioning, _, _),
           ShuffleExchangeExec(opB: HashPartitioning, _, _))
             if Seq(opA, opB).forall(op =>
                 partitionExpressionsColumns(op.expressions) === expectedColumns)
               && Seq(opA, opB).forall(_.numPartitions == numPartitions) => j
       }
       assert(matchingJoins.size == 1)
     })
 }
 test("streaming join should require HashClusteredDistribution from children") {
   // Two in-memory sources; the right side is additionally repartitioned on 'b alone.
   val leftInput = MemoryStream[Int]
   val rightInput = MemoryStream[Int]

   val leftDf = leftInput.toDF.select('value as 'a, 'value * 2 as 'b)
   val rightDf = rightInput.toDF.select('value as 'a, 'value * 2 as 'b).repartition('b)
   val joined = leftDf.join(rightDf, Seq("a", "b")).select('a)

   testStream(joined)(
     AddData(leftInput, (1 to 1000): _*),
     AddData(rightInput, (1 to 1000): _*),
     CheckAnswer((1 to 1000): _*),
     Execute { query =>
       // Names of the partitioning expressions; anything that is not an
       // AttributeReference fails the partial match with a MatchError.
       def partitionExpressionsColumns(expressions: Seq[Expression]): Seq[String] =
         expressions.map { case ref: AttributeReference => ref.name }

       val expectedColumns = Seq("a", "b")
       val numPartitions = spark.sqlContext.conf.getConf(SQLConf.SHUFFLE_PARTITIONS)

       // Verify the query plan: both join children must be shuffle exchanges that are
       // hash partitioned on ("a", "b") with the configured number of shuffle partitions.
       val matchingJoins = query.lastExecution.executedPlan.collect {
         case j @ StreamingSymmetricHashJoinExec(_, _, _, _, _, _, _, _,
           ShuffleExchangeExec(opA: HashPartitioning, _, _),
           ShuffleExchangeExec(opB: HashPartitioning, _, _))
             if Seq(opA, opB).forall(op =>
                 partitionExpressionsColumns(op.expressions) === expectedColumns)
               && Seq(opA, opB).forall(_.numPartitions == numPartitions) => j
       }
       assert(matchingJoins.size == 1)
     })
 }
  */
 case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)
   extends Expression with Partitioning with Unevaluable {
@@ -211,6 +245,10 @@ case class HashPartitioning(expressions: Seq[Expression], numPartitions: Int)
   override def satisfies0(required: Distribution): Boolean = {
     super.satisfies0(required) || {
       required match {
+        case h: StatefulOpClusteredDistribution =>
+          expressions.length == h.expressions.length && expressions.zip(h.expressions).forall {
+            case (l, r) => l.semanticEquals(r)
+          }
         case ClusteredDistribution(requiredClustering, _) =>
           expressions.forall(x => requiredClustering.exists(_.semanticEquals(x)))
         case _ => false

diff --git a/.../main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala b/.../main/scala/org/apache/spark/sql/execution/streaming/StreamingSymmetricHashJoinExec.scala
@@ -185,8 +185,8 @@ case class StreamingSymmetricHashJoinExec(
   val nullRight = new GenericInternalRow(right.output.map(_.withNullability(true)).length)
 
   override def requiredChildDistribution: Seq[Distribution] =
-    ClusteredDistribution(leftKeys, stateInfo.map(_.numPartitions)) ::
-      ClusteredDistribution(rightKeys, stateInfo.map(_.numPartitions)) :: Nil
+    Seq(leftKeys, rightKeys) // one distribution requirement per join side, left first
+      .map(keys => StatefulOpClusteredDistribution(keys, stateInfo.map(_.numPartitions)))
 
   override def output: Seq[Attribute] = joinType match {
     case _: InnerLike => left.output ++ right.output

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamingJoinSuite.scala
@@ -571,7 +571,7 @@ class StreamingInnerJoinSuite extends StreamingJoinSuite {
       CheckNewAnswer((5, 10, 5, 15, 5, 25)))
   }
 
-  test("streaming join should require HashClusteredDistribution from children") {
+  test("streaming join should require StatefulOpClusteredDistribution from children") {
     val input1 = MemoryStream[Int]
     val input2 = MemoryStream[Int]