[SPARK-48772][SS][SQL] State Data Source Change Feed Reader Mode #47188

Closed
Changes from 12 commits
Commits
22 commits
1ade442
Squashed commit of the following:
eason-yuchen-liu Jul 2, 2024
98bf8ec
revert unnecessary changes
eason-yuchen-liu Jul 2, 2024
fb890ae
Merge branch 'master' into readStateChange
eason-yuchen-liu Jul 2, 2024
db45c6f
Add comments
eason-yuchen-liu Jul 2, 2024
1926e5e
Merge branch 'readStateChange' of https://github.com/eason-yuchen-liu…
eason-yuchen-liu Jul 2, 2024
24c0351
minor
eason-yuchen-liu Jul 2, 2024
d4a4b80
group options & make options in changeFeed mode isolate from some opt…
eason-yuchen-liu Jul 8, 2024
42552ac
reorder the columns in the result
eason-yuchen-liu Jul 8, 2024
24db837
address comments from Jungtaek
eason-yuchen-liu Jul 8, 2024
adde991
minor
eason-yuchen-liu Jul 8, 2024
d3ca86c
refactor StatePartitionReader for both modes
eason-yuchen-liu Jul 8, 2024
5199c56
minor
eason-yuchen-liu Jul 8, 2024
ce75133
Use NextIterator as the interface rather than StateStoreChangeDataRea…
eason-yuchen-liu Jul 9, 2024
84dcf15
more doc
eason-yuchen-liu Jul 9, 2024
22a086b
Merge branch 'master' into readStateChange
eason-yuchen-liu Jul 9, 2024
c797d0b
use Jungtaek's advice in checking schema validity
eason-yuchen-liu Jul 9, 2024
5921479
Merge branch 'readStateChange' of https://github.com/eason-yuchen-liu…
eason-yuchen-liu Jul 9, 2024
e5674cf
solve column family merge conflict
eason-yuchen-liu Jul 9, 2024
c012e1a
pass tests
eason-yuchen-liu Jul 9, 2024
ff0cd43
continue
eason-yuchen-liu Jul 9, 2024
2ad7590
make the doc consistent
eason-yuchen-liu Jul 10, 2024
43420f6
continue
eason-yuchen-liu Jul 10, 2024
@@ -3812,7 +3812,7 @@
"STATE_STORE_PROVIDER_DOES_NOT_SUPPORT_FINE_GRAINED_STATE_REPLAY" : {
"message" : [
"The given State Store Provider <inputClass> does not extend org.apache.spark.sql.execution.streaming.state.SupportsFineGrainedReplay.",
"Therefore, it does not support option snapshotStartBatchId in state data source."
"Therefore, it does not support option snapshotStartBatchId or readChangeFeed in state data source."
],
"sqlState" : "42K06"
},
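For context, a hedged sketch of how this error can surface: configure a state store provider that does not extend SupportsFineGrainedReplay, then request a snapshot or change feed read. The provider class name and checkpoint path below are illustrative and not part of this PR; the "statestore" format name and the providerClass conf key are assumed from the existing data source registration.

// Illustrative only: a hypothetical provider that does not mix in SupportsFineGrainedReplay.
spark.conf.set(
  "spark.sql.streaming.stateStore.providerClass",
  "com.example.MyStateStoreProvider")

spark.read
  .format("statestore")
  .option("path", "/tmp/streaming-query/checkpoint")  // illustrative checkpoint location
  .option("readChangeFeed", "true")
  .option("changeStartBatchId", "0")
  .load()
  .show()  // the error above surfaces once the scan is actually executed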
@@ -36,7 +36,7 @@ import org.apache.spark.sql.execution.streaming.StreamingCheckpointConstants.{DI
import org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinHelper.{LeftSide, RightSide}
import org.apache.spark.sql.execution.streaming.state.{StateSchemaCompatibilityChecker, StateStore, StateStoreConf, StateStoreId, StateStoreProviderId}
import org.apache.spark.sql.sources.DataSourceRegister
- import org.apache.spark.sql.types.{IntegerType, StructType}
+ import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.util.SerializableConfiguration

@@ -94,10 +94,20 @@ class StateDataSource extends TableProvider with DataSourceRegister {
manager.readSchemaFile()
}

- new StructType()
- .add("key", keySchema)
- .add("value", valueSchema)
- .add("partition_id", IntegerType)
+ if (sourceOptions.readChangeFeed) {
+ new StructType()

Contributor:
I'd expect change_type and batch_id to begin with, and even batch ID to be placed earlier (batch_id, change_type).

Given the characteristic of a change feed, the output is expected to be ordered by batch ID (across partition IDs, which may be tricky); even if the data source does not do so, users should be able to order it easily themselves, because they will very likely want to.

Contributor Author:

Makes sense.
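
For illustration (a hedged sketch, not part of this PR), a consumer could impose that ordering after loading the change feed; the column names follow the schema assembled just below, and changeFeedDf stands for a hypothetical DataFrame loaded with readChangeFeed enabled:

changeFeedDf
  .orderBy("batch_id", "partition_id")  // batch ordering first, then partition
  .show(truncate = false)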

.add("batch_id", LongType)
.add("change_type", StringType)
.add("key", keySchema)
.add("value", valueSchema)
.add("partition_id", IntegerType)
} else {
new StructType()
.add("key", keySchema)
.add("value", valueSchema)
.add("partition_id", IntegerType)
}

} catch {
case NonFatal(e) =>
throw StateDataSourceErrors.failedToReadStateSchema(sourceOptions, e)
@@ -125,21 +135,38 @@ class StateDataSource extends TableProvider with DataSourceRegister {
override def supportsExternalMetadata(): Boolean = false
}

+ case class FromSnapshotOptions(
+ snapshotStartBatchId: Long,
+ snapshotPartitionId: Int)
+
+ case class ReadChangeFeedOptions(
+ changeStartBatchId: Long,
+ changeEndBatchId: Long
+ )

case class StateSourceOptions(
resolvedCpLocation: String,
batchId: Long,
operatorId: Int,
storeName: String,
joinSide: JoinSideValues,
- snapshotStartBatchId: Option[Long],
- snapshotPartitionId: Option[Int]) {
+ readChangeFeed: Boolean,
+ fromSnapshotOptions: Option[FromSnapshotOptions],
+ readChangeFeedOptions: Option[ReadChangeFeedOptions]) {
def stateCheckpointLocation: Path = new Path(resolvedCpLocation, DIR_NAME_STATE)

override def toString: String = {
s"StateSourceOptions(checkpointLocation=$resolvedCpLocation, batchId=$batchId, " +
s"operatorId=$operatorId, storeName=$storeName, joinSide=$joinSide, " +
s"snapshotStartBatchId=${snapshotStartBatchId.getOrElse("None")}, " +
s"snapshotPartitionId=${snapshotPartitionId.getOrElse("None")})"
var desc = s"StateSourceOptions(checkpointLocation=$resolvedCpLocation, batchId=$batchId, " +
s"operatorId=$operatorId, storeName=$storeName, joinSide=$joinSide"
if (fromSnapshotOptions.isDefined) {
desc += s", snapshotStartBatchId=${fromSnapshotOptions.get.snapshotStartBatchId}"
desc += s", snapshotPartitionId=${fromSnapshotOptions.get.snapshotPartitionId}"
}
if (readChangeFeedOptions.isDefined) {
desc += s", changeStartBatchId=${readChangeFeedOptions.get.changeStartBatchId}"
desc += s", changeEndBatchId=${readChangeFeedOptions.get.changeEndBatchId}"
}
desc + ")"
}
}

@@ -151,6 +178,9 @@ object StateSourceOptions extends DataSourceOptions {
val JOIN_SIDE = newOption("joinSide")
val SNAPSHOT_START_BATCH_ID = newOption("snapshotStartBatchId")
val SNAPSHOT_PARTITION_ID = newOption("snapshotPartitionId")
+ val READ_CHANGE_FEED = newOption("readChangeFeed")
+ val CHANGE_START_BATCH_ID = newOption("changeStartBatchId")
+ val CHANGE_END_BATCH_ID = newOption("changeEndBatchId")

object JoinSideValues extends Enumeration {
type JoinSideValues = Value
@@ -172,16 +202,6 @@ object StateSourceOptions extends DataSourceOptions {
throw StateDataSourceErrors.requiredOptionUnspecified(PATH)
}.get

- val resolvedCpLocation = resolvedCheckpointLocation(hadoopConf, checkpointLocation)
-
- val batchId = Option(options.get(BATCH_ID)).map(_.toLong).orElse {
- Some(getLastCommittedBatch(sparkSession, resolvedCpLocation))
- }.get
-
- if (batchId < 0) {
- throw StateDataSourceErrors.invalidOptionValueIsNegative(BATCH_ID)
- }
-
val operatorId = Option(options.get(OPERATOR_ID)).map(_.toInt)
.orElse(Some(0)).get

@@ -210,30 +230,95 @@
throw StateDataSourceErrors.conflictOptions(Seq(JOIN_SIDE, STORE_NAME))
}

- val snapshotStartBatchId = Option(options.get(SNAPSHOT_START_BATCH_ID)).map(_.toLong)
- if (snapshotStartBatchId.exists(_ < 0)) {
- throw StateDataSourceErrors.invalidOptionValueIsNegative(SNAPSHOT_START_BATCH_ID)
- } else if (snapshotStartBatchId.exists(_ > batchId)) {
- throw StateDataSourceErrors.invalidOptionValue(
- SNAPSHOT_START_BATCH_ID, s"value should be less than or equal to $batchId")
- }
+ val resolvedCpLocation = resolvedCheckpointLocation(hadoopConf, checkpointLocation)
+
+ var batchId = Option(options.get(BATCH_ID)).map(_.toLong)
+
+ val snapshotStartBatchId = Option(options.get(SNAPSHOT_START_BATCH_ID)).map(_.toLong)
val snapshotPartitionId = Option(options.get(SNAPSHOT_PARTITION_ID)).map(_.toInt)
- if (snapshotPartitionId.exists(_ < 0)) {
- throw StateDataSourceErrors.invalidOptionValueIsNegative(SNAPSHOT_PARTITION_ID)
- }
-
- // both snapshotPartitionId and snapshotStartBatchId are required at the same time, because
- // each partition may have different checkpoint status
- if (snapshotPartitionId.isDefined && snapshotStartBatchId.isEmpty) {
- throw StateDataSourceErrors.requiredOptionUnspecified(SNAPSHOT_START_BATCH_ID)
- } else if (snapshotPartitionId.isEmpty && snapshotStartBatchId.isDefined) {
- throw StateDataSourceErrors.requiredOptionUnspecified(SNAPSHOT_PARTITION_ID)
+ val readChangeFeed = Option(options.get(READ_CHANGE_FEED)).exists(_.toBoolean)
+
+ val changeStartBatchId = Option(options.get(CHANGE_START_BATCH_ID)).map(_.toLong)
+ var changeEndBatchId = Option(options.get(CHANGE_END_BATCH_ID)).map(_.toLong)
+
+ var fromSnapshotOptions: Option[FromSnapshotOptions] = None
+ var readChangeFeedOptions: Option[ReadChangeFeedOptions] = None
+
+ if (readChangeFeed) {
+ if (joinSide != JoinSideValues.none) {
+ throw StateDataSourceErrors.conflictOptions(Seq(JOIN_SIDE, READ_CHANGE_FEED))
+ }
+ if (batchId.isDefined) {
+ throw StateDataSourceErrors.conflictOptions(Seq(BATCH_ID, READ_CHANGE_FEED))
+ }
+ if (snapshotStartBatchId.isDefined) {
+ throw StateDataSourceErrors.conflictOptions(Seq(SNAPSHOT_START_BATCH_ID, READ_CHANGE_FEED))
+ }
+ if (snapshotPartitionId.isDefined) {
+ throw StateDataSourceErrors.conflictOptions(Seq(SNAPSHOT_PARTITION_ID, READ_CHANGE_FEED))
+ }
+
+ if (changeStartBatchId.isEmpty) {
+ throw StateDataSourceErrors.requiredOptionUnspecified(CHANGE_START_BATCH_ID)
+ }
+ changeEndBatchId = Option(
+ changeEndBatchId.getOrElse(getLastCommittedBatch(sparkSession, resolvedCpLocation)))
+
+ // changeStartBatchId and changeEndBatchId must all be defined at this point
+ if (changeStartBatchId.get < 0) {
+ throw StateDataSourceErrors.invalidOptionValueIsNegative(CHANGE_START_BATCH_ID)
+ }
+ if (changeEndBatchId.get < changeStartBatchId.get) {
+ throw StateDataSourceErrors.invalidOptionValue(CHANGE_END_BATCH_ID,
+ s"$CHANGE_END_BATCH_ID cannot be smaller than $CHANGE_START_BATCH_ID. " +
+ s"Please check the input to $CHANGE_END_BATCH_ID, or if you are using its default " +
+ s"value, make sure that $CHANGE_START_BATCH_ID is less than ${changeEndBatchId.get}.")
+ }
+
+ batchId = Option(changeEndBatchId.get)
+
+ readChangeFeedOptions = Option(
+ ReadChangeFeedOptions(changeStartBatchId.get, changeEndBatchId.get))
+ } else {
+ if (changeStartBatchId.isDefined) {
+ throw StateDataSourceErrors.invalidOptionValue(CHANGE_START_BATCH_ID,
+ s"Only specify this option when $READ_CHANGE_FEED is set to true.")
+ }
+ if (changeEndBatchId.isDefined) {
+ throw StateDataSourceErrors.invalidOptionValue(CHANGE_END_BATCH_ID,
+ s"Only specify this option when $READ_CHANGE_FEED is set to true.")
+ }
+
+ batchId = Option(batchId.getOrElse(getLastCommittedBatch(sparkSession, resolvedCpLocation)))
+
+ if (batchId.get < 0) {
+ throw StateDataSourceErrors.invalidOptionValueIsNegative(BATCH_ID)
+ }
+ if (snapshotStartBatchId.exists(_ < 0)) {
+ throw StateDataSourceErrors.invalidOptionValueIsNegative(SNAPSHOT_START_BATCH_ID)
+ } else if (snapshotStartBatchId.exists(_ > batchId.get)) {
+ throw StateDataSourceErrors.invalidOptionValue(
+ SNAPSHOT_START_BATCH_ID, s"value should be less than or equal to $batchId")
+ }
+ if (snapshotPartitionId.exists(_ < 0)) {
+ throw StateDataSourceErrors.invalidOptionValueIsNegative(SNAPSHOT_PARTITION_ID)
+ }
+ // both snapshotPartitionId and snapshotStartBatchId are required at the same time, because
+ // each partition may have different checkpoint status
+ if (snapshotPartitionId.isDefined && snapshotStartBatchId.isEmpty) {
+ throw StateDataSourceErrors.requiredOptionUnspecified(SNAPSHOT_START_BATCH_ID)
+ } else if (snapshotPartitionId.isEmpty && snapshotStartBatchId.isDefined) {
+ throw StateDataSourceErrors.requiredOptionUnspecified(SNAPSHOT_PARTITION_ID)
+ }
+
+ fromSnapshotOptions = Option(
+ FromSnapshotOptions(snapshotStartBatchId.get, snapshotPartitionId.get))
+ }

StateSourceOptions(
- resolvedCpLocation, batchId, operatorId, storeName,
- joinSide, snapshotStartBatchId, snapshotPartitionId)
+ resolvedCpLocation, batchId.get, operatorId, storeName, joinSide,
+ readChangeFeed, fromSnapshotOptions, readChangeFeedOptions)
}

private def resolvedCheckpointLocation(
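To make the option handling above concrete, here is a hedged usage sketch; the checkpoint path is illustrative, and the "statestore" format name is assumed from the DataSourceRegister implementation:

// Change feed mode: changeStartBatchId is required,
// changeEndBatchId defaults to the last committed batch.
val changes = spark.read
  .format("statestore")
  .option("path", "/tmp/streaming-query/checkpoint")
  .option("readChangeFeed", "true")
  .option("changeStartBatchId", "5")
  .load()

// Combining readChangeFeed with batchId, joinSide, snapshotStartBatchId or
// snapshotPartitionId fails the validation above with a conflicting-options error.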
@@ -23,7 +23,9 @@ import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, Par
import org.apache.spark.sql.execution.datasources.v2.state.metadata.StateMetadataTableEntry
import org.apache.spark.sql.execution.datasources.v2.state.utils.SchemaUtil
import org.apache.spark.sql.execution.streaming.state._
+ import org.apache.spark.sql.execution.streaming.state.RecordType.{getRecordTypeAsString, RecordType}
import org.apache.spark.sql.types.StructType
+ import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.util.SerializableConfiguration

/**
@@ -37,27 +39,32 @@ class StatePartitionReaderFactory(
stateStoreMetadata: Array[StateMetadataTableEntry]) extends PartitionReaderFactory {

override def createReader(partition: InputPartition): PartitionReader[InternalRow] = {
- new StatePartitionReader(storeConf, hadoopConf,
- partition.asInstanceOf[StateStoreInputPartition], schema, stateStoreMetadata)
+ val stateStoreInputPartition = partition.asInstanceOf[StateStoreInputPartition]
+ if (stateStoreInputPartition.sourceOptions.readChangeFeed) {
+ new StateStoreChangeDataPartitionReader(storeConf, hadoopConf,
+ stateStoreInputPartition, schema, stateStoreMetadata)
+ } else {
+ new StatePartitionReader(storeConf, hadoopConf,
+ stateStoreInputPartition, schema, stateStoreMetadata)
+ }
}
}

/**
* An implementation of [[PartitionReader]] for State data source. This is used to support
* general read from a state store instance, rather than specific to the operator.
*/
- class StatePartitionReader(
+ abstract class StatePartitionReaderBase(
storeConf: StateStoreConf,
hadoopConf: SerializableConfiguration,
partition: StateStoreInputPartition,
schema: StructType,
stateStoreMetadata: Array[StateMetadataTableEntry])
extends PartitionReader[InternalRow] with Logging {

private val keySchema = SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
private val valueSchema = SchemaUtil.getSchemaAsDataType(schema, "value").asInstanceOf[StructType]

- private lazy val provider: StateStoreProvider = {
+ protected lazy val provider: StateStoreProvider = {
val stateStoreId = StateStoreId(partition.sourceOptions.stateCheckpointLocation.toString,
partition.sourceOptions.operatorId, partition.partition, partition.sourceOptions.storeName)
val stateStoreProviderId = StateStoreProviderId(stateStoreId, partition.queryId)
@@ -88,25 +95,7 @@ class StatePartitionReader(
useMultipleValuesPerKey = false)
}

- private lazy val store: ReadStateStore = {
- partition.sourceOptions.snapshotStartBatchId match {
- case None => provider.getReadStore(partition.sourceOptions.batchId + 1)
-
- case Some(snapshotStartBatchId) =>
- if (!provider.isInstanceOf[SupportsFineGrainedReplay]) {
- throw StateStoreErrors.stateStoreProviderDoesNotSupportFineGrainedReplay(
- provider.getClass.toString)
- }
- provider.asInstanceOf[SupportsFineGrainedReplay]
- .replayReadStateFromSnapshot(
- snapshotStartBatchId + 1,
- partition.sourceOptions.batchId + 1)
- }
- }
-
- private lazy val iter: Iterator[InternalRow] = {
- store.iterator().map(pair => unifyStateRowPair((pair.key, pair.value)))
- }
+ protected val iter: Iterator[InternalRow]

private var current: InternalRow = _

@@ -124,9 +113,46 @@

override def close(): Unit = {
current = null
- store.abort()
provider.close()
}
}

+ /**
+ * An implementation of [[StatePartitionReaderBase]] for the normal mode of State Data
+ * Source. It reads the state at a particular batchId.
+ */
+ class StatePartitionReader(
+ storeConf: StateStoreConf,
+ hadoopConf: SerializableConfiguration,
+ partition: StateStoreInputPartition,
+ schema: StructType,
+ stateStoreMetadata: Array[StateMetadataTableEntry])
+ extends StatePartitionReaderBase(storeConf, hadoopConf, partition, schema, stateStoreMetadata) {
+
+ private lazy val store: ReadStateStore = {
+ partition.sourceOptions.fromSnapshotOptions match {
+ case None => provider.getReadStore(partition.sourceOptions.batchId + 1)
+
+ case Some(fromSnapshotOptions) =>
+ if (!provider.isInstanceOf[SupportsFineGrainedReplay]) {
+ throw StateStoreErrors.stateStoreProviderDoesNotSupportFineGrainedReplay(
+ provider.getClass.toString)
+ }
+ provider.asInstanceOf[SupportsFineGrainedReplay]
+ .replayReadStateFromSnapshot(
+ fromSnapshotOptions.snapshotStartBatchId + 1,
+ partition.sourceOptions.batchId + 1)
+ }
+ }
+
+ override lazy val iter: Iterator[InternalRow] = {
+ store.iterator().map(pair => unifyStateRowPair((pair.key, pair.value)))
+ }
+
+ override def close(): Unit = {
+ store.abort()
+ super.close()
+ }

private def unifyStateRowPair(pair: (UnsafeRow, UnsafeRow)): InternalRow = {
val row = new GenericInternalRow(3)
@@ -136,3 +162,47 @@
row
}
}

+ /**
+ * An implementation of [[StatePartitionReaderBase]] for the readChangeFeed mode of State Data
+ * Source. It reads the change of state over batches of a particular partition.
+ */
+ class StateStoreChangeDataPartitionReader(
+ storeConf: StateStoreConf,
+ hadoopConf: SerializableConfiguration,
+ partition: StateStoreInputPartition,
+ schema: StructType,
+ stateStoreMetadata: Array[StateMetadataTableEntry])
+ extends StatePartitionReaderBase(storeConf, hadoopConf, partition, schema, stateStoreMetadata) {
+
+ private lazy val changeDataReader: StateStoreChangeDataReader = {
+ if (!provider.isInstanceOf[SupportsFineGrainedReplay]) {
+ throw StateStoreErrors.stateStoreProviderDoesNotSupportFineGrainedReplay(
+ provider.getClass.toString)
+ }
+ provider.asInstanceOf[SupportsFineGrainedReplay]
+ .getStateStoreChangeDataReader(
+ partition.sourceOptions.readChangeFeedOptions.get.changeStartBatchId + 1,
+ partition.sourceOptions.readChangeFeedOptions.get.changeEndBatchId + 1)
+ }
+
+ override lazy val iter: Iterator[InternalRow] = {
+ changeDataReader.iterator.map(unifyStateChangeDataRow)
+ }
+
+ override def close(): Unit = {
+ changeDataReader.close()
+ super.close()
+ }
+
+ private def unifyStateChangeDataRow(row: (RecordType, UnsafeRow, UnsafeRow, Long)):
+ InternalRow = {
+ val result = new GenericInternalRow(5)
+ result.update(0, row._4)
+ result.update(1, UTF8String.fromString(getRecordTypeAsString(row._1)))
+ result.update(2, row._2)
+ result.update(3, row._3)
+ result.update(4, partition.partition)
+ result
+ }
+ }
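As a reading aid (hedged, derived from unifyStateChangeDataRow and the change feed schema defined in StateDataSource above), the five ordinals written per row line up with the columns batch_id, change_type, key, value and partition_id. Reusing the hypothetical changes DataFrame from the earlier sketch:

// Abbreviated, illustrative output; the key and value struct fields depend on the
// stateful operator whose state is being read.
changes.printSchema()
// root
//  |-- batch_id: long
//  |-- change_type: string
//  |-- key: struct
//  |-- value: struct
//  |-- partition_id: integer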