[SPARK-48772][SS][SQL] State Data Source Change Feed Reader Mode #47188

Closed

Changes from 6 commits

Commits (22)
1ade442  Squashed commit of the following:  (eason-yuchen-liu, Jul 2, 2024)
98bf8ec  revert unnecessary changes  (eason-yuchen-liu, Jul 2, 2024)
fb890ae  Merge branch 'master' into readStateChange  (eason-yuchen-liu, Jul 2, 2024)
db45c6f  Add comments  (eason-yuchen-liu, Jul 2, 2024)
1926e5e  Merge branch 'readStateChange' of https://github.com/eason-yuchen-liu…  (eason-yuchen-liu, Jul 2, 2024)
24c0351  minor  (eason-yuchen-liu, Jul 2, 2024)
d4a4b80  group options & make options in changeFeed mode isolate from some opt…  (eason-yuchen-liu, Jul 8, 2024)
42552ac  reorder the columns in the result  (eason-yuchen-liu, Jul 8, 2024)
24db837  address comments from Jungtaek  (eason-yuchen-liu, Jul 8, 2024)
adde991  minor  (eason-yuchen-liu, Jul 8, 2024)
d3ca86c  refactor StatePartitionReader for both modes  (eason-yuchen-liu, Jul 8, 2024)
5199c56  minor  (eason-yuchen-liu, Jul 8, 2024)
ce75133  Use NextIterator as the interface rather than StateStoreChangeDataRea…  (eason-yuchen-liu, Jul 9, 2024)
84dcf15  more doc  (eason-yuchen-liu, Jul 9, 2024)
22a086b  Merge branch 'master' into readStateChange  (eason-yuchen-liu, Jul 9, 2024)
c797d0b  use Jungtaek's advice in checking schema validity  (eason-yuchen-liu, Jul 9, 2024)
5921479  Merge branch 'readStateChange' of https://github.com/eason-yuchen-liu…  (eason-yuchen-liu, Jul 9, 2024)
e5674cf  solve column family merge conflict  (eason-yuchen-liu, Jul 9, 2024)
c012e1a  pass tests  (eason-yuchen-liu, Jul 9, 2024)
ff0cd43  continue  (eason-yuchen-liu, Jul 9, 2024)
2ad7590  make the doc consistent  (eason-yuchen-liu, Jul 10, 2024)
43420f6  continue  (eason-yuchen-liu, Jul 10, 2024)

@@ -3812,7 +3812,7 @@
"STATE_STORE_PROVIDER_DOES_NOT_SUPPORT_FINE_GRAINED_STATE_REPLAY" : {
"message" : [
"The given State Store Provider <inputClass> does not extend org.apache.spark.sql.execution.streaming.state.SupportsFineGrainedReplay.",
"Therefore, it does not support option snapshotStartBatchId in state data source."
"Therefore, it does not support option snapshotStartBatchId or readChangeFeed in state data source."
],
"sqlState" : "42K06"
},

@@ -36,7 +36,7 @@ import org.apache.spark.sql.execution.streaming.StreamingCheckpointConstants.{DI
import org.apache.spark.sql.execution.streaming.StreamingSymmetricHashJoinHelper.{LeftSide, RightSide}
import org.apache.spark.sql.execution.streaming.state.{StateSchemaCompatibilityChecker, StateStore, StateStoreConf, StateStoreId, StateStoreProviderId}
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.types.{IntegerType, StructType}
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.util.SerializableConfiguration

@@ -94,10 +94,21 @@ class StateDataSource extends TableProvider with DataSourceRegister {
manager.readSchemaFile()
}

new StructType()
.add("key", keySchema)
.add("value", valueSchema)
.add("partition_id", IntegerType)
if (sourceOptions.readChangeFeed) {
new StructType()

Contributor:

I'd expect change_type and batch_id to come first, with batch_id placed before change_type (batch_id, change_type).

Given the nature of a change feed, the output is expected to be ordered by batch ID (across partition IDs, which may be hard to guarantee). Even if the data source does not do this itself, users should be able to do so easily, because they very likely will.

Contributor Author:

Makes sense.

.add("key", keySchema)
.add("value", valueSchema)
.add("change_type", StringType)
.add("batch_id", LongType)
.add("partition_id", IntegerType)
} else {
new StructType()
.add("key", keySchema)
.add("value", valueSchema)
.add("partition_id", IntegerType)
}


} catch {
case NonFatal(e) =>
throw StateDataSourceErrors.failedToReadStateSchema(sourceOptions, e)

@@ -132,7 +143,10 @@ case class StateSourceOptions(
storeName: String,
joinSide: JoinSideValues,
snapshotStartBatchId: Option[Long],
snapshotPartitionId: Option[Int]) {
snapshotPartitionId: Option[Int],

Contributor:

While we are here, it would be nice to structure sub-options: there are now 10 parameters, and 5 of them are not common ones. The options for 1) starting from a snapshot and 2) readChangeFeed can each be grouped into an Option[<option model class>]. (A rough sketch of this grouping follows the hunk below.)

Contributor Author:

Good idea.

readChangeFeed: Boolean,
changeStartBatchId: Option[Long],
changeEndBatchId: Option[Long]) {
def stateCheckpointLocation: Path = new Path(resolvedCpLocation, DIR_NAME_STATE)

override def toString: String = {
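
A rough sketch of the grouping suggested above; the wrapper class names are illustrative assumptions, not necessarily what the PR ends up merging:

// Illustrative only: gather the five uncommon parameters into two optional sub-option models.
case class FromSnapshotOptions(
    snapshotStartBatchId: Long,
    snapshotPartitionId: Int)

case class ReadChangeFeedOptions(
    changeStartBatchId: Long,
    changeEndBatchId: Long)

// StateSourceOptions would then carry
//   fromSnapshotOptions: Option[FromSnapshotOptions]
//   readChangeFeedOptions: Option[ReadChangeFeedOptions]
// instead of five flat Boolean/Option[_] fields.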

@@ -151,6 +165,9 @@ object StateSourceOptions extends DataSourceOptions {
val JOIN_SIDE = newOption("joinSide")
val SNAPSHOT_START_BATCH_ID = newOption("snapshotStartBatchId")
val SNAPSHOT_PARTITION_ID = newOption("snapshotPartitionId")
val READ_CHANGE_FEED = newOption("readChangeFeed")
val CHANGE_START_BATCH_ID = newOption("changeStartBatchId")
val CHANGE_END_BATCH_ID = newOption("changeEndBatchId")

object JoinSideValues extends Enumeration {
type JoinSideValues = Value

@@ -231,9 +248,45 @@ object StateSourceOptions extends DataSourceOptions {
throw StateDataSourceErrors.requiredOptionUnspecified(SNAPSHOT_PARTITION_ID)
}

val readChangeFeed = Option(options.get(READ_CHANGE_FEED)).exists(_.toBoolean)

val changeStartBatchId = Option(options.get(CHANGE_START_BATCH_ID)).map(_.toLong)
var changeEndBatchId = Option(options.get(CHANGE_END_BATCH_ID)).map(_.toLong)

if (readChangeFeed) {
if (joinSide != JoinSideValues.none) {
throw StateDataSourceErrors.conflictOptions(Seq(JOIN_SIDE, READ_CHANGE_FEED))
}
if (changeStartBatchId.isEmpty) {
throw StateDataSourceErrors.requiredOptionUnspecified(CHANGE_START_BATCH_ID)
}
changeEndBatchId = Option(changeEndBatchId.getOrElse(batchId))

Contributor:

We'll probably need to clarify that the current batchId option denotes the "ending" batch ID - that will help the option be used across multiple modes.

We could design a new option and promote it later. Until then, let's simply not fall back: require users to specify the symmetric option explicitly. We can also reconsider consolidating the "starting batch ID" option later.

// changeStartBatchId and changeEndBatchId must all be defined at this point
if (changeStartBatchId.get < 0) {
throw StateDataSourceErrors.invalidOptionValueIsNegative(CHANGE_START_BATCH_ID)
}
if (changeEndBatchId.get < changeStartBatchId.get) {
throw StateDataSourceErrors.invalidOptionValue(CHANGE_END_BATCH_ID,
s"$CHANGE_END_BATCH_ID cannot be smaller than $CHANGE_START_BATCH_ID. " +
s"Please check the input to $CHANGE_END_BATCH_ID, or if you are using its default " +
s"value, make sure that $CHANGE_START_BATCH_ID is less than ${changeEndBatchId.get}.")
}
} else {
if (changeStartBatchId.isDefined) {
throw StateDataSourceErrors.invalidOptionValue(CHANGE_START_BATCH_ID,
s"Only specify this option when $READ_CHANGE_FEED is set to true.")
}
if (changeEndBatchId.isDefined) {
throw StateDataSourceErrors.invalidOptionValue(CHANGE_END_BATCH_ID,
s"Only specify this option when $READ_CHANGE_FEED is set to true.")
}
}

StateSourceOptions(
resolvedCpLocation, batchId, operatorId, storeName,
joinSide, snapshotStartBatchId, snapshotPartitionId)
joinSide, snapshotStartBatchId, snapshotPartitionId,
readChangeFeed, changeStartBatchId, changeEndBatchId)
}

private def resolvedCheckpointLocation(
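
For reference, a minimal usage sketch of the new mode built from the options above (not part of this diff; the checkpoint path is hypothetical, and "statestore" is assumed to be the short name registered by StateDataSource):

// Read all state changes committed between batches 0 and 5 (inclusive).
val changes = spark.read
  .format("statestore")
  .option("readChangeFeed", "true")
  .option("changeStartBatchId", "0")  // required in this mode
  .option("changeEndBatchId", "5")    // optional here; falls back to the latest committed batch
  .load("/tmp/streaming-query/checkpoint")

// The change feed is naturally consumed in batch order.
changes.orderBy("batch_id", "partition_id").show()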

@@ -23,7 +23,9 @@ import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, Par
import org.apache.spark.sql.execution.datasources.v2.state.metadata.StateMetadataTableEntry
import org.apache.spark.sql.execution.datasources.v2.state.utils.SchemaUtil
import org.apache.spark.sql.execution.streaming.state._
import org.apache.spark.sql.execution.streaming.state.RecordType.{getRecordTypeAsString, RecordType}
import org.apache.spark.sql.types.StructType
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.util.SerializableConfiguration

/**
@@ -37,8 +39,14 @@ class StatePartitionReaderFactory(
stateStoreMetadata: Array[StateMetadataTableEntry]) extends PartitionReaderFactory {

override def createReader(partition: InputPartition): PartitionReader[InternalRow] = {
new StatePartitionReader(storeConf, hadoopConf,
partition.asInstanceOf[StateStoreInputPartition], schema, stateStoreMetadata)
val stateStoreInputPartition = partition.asInstanceOf[StateStoreInputPartition]
if (stateStoreInputPartition.sourceOptions.readChangeFeed) {
new StateStoreChangeDataPartitionReader(storeConf, hadoopConf,
partition.asInstanceOf[StateStoreInputPartition], schema, stateStoreMetadata)
} else {
new StatePartitionReader(storeConf, hadoopConf,
partition.asInstanceOf[StateStoreInputPartition], schema, stateStoreMetadata)
}
}
}

@@ -57,7 +65,7 @@ class StatePartitionReader(
private val keySchema = SchemaUtil.getSchemaAsDataType(schema, "key").asInstanceOf[StructType]
private val valueSchema = SchemaUtil.getSchemaAsDataType(schema, "value").asInstanceOf[StructType]

private lazy val provider: StateStoreProvider = {
protected lazy val provider: StateStoreProvider = {
val stateStoreId = StateStoreId(partition.sourceOptions.stateCheckpointLocation.toString,
partition.sourceOptions.operatorId, partition.partition, partition.sourceOptions.storeName)
val stateStoreProviderId = StateStoreProviderId(stateStoreId, partition.queryId)
@@ -104,11 +112,11 @@ class StatePartitionReader(
}
}

private lazy val iter: Iterator[InternalRow] = {
protected lazy val iter: Iterator[InternalRow] = {
store.iterator().map(pair => unifyStateRowPair((pair.key, pair.value)))
}

private var current: InternalRow = _
protected var current: InternalRow = _

override def next(): Boolean = {
if (iter.hasNext) {
@@ -136,3 +144,48 @@ class StatePartitionReader(
row
}
}

/**
* An implementation of [[PartitionReader]] for the readChangeFeed mode of State Data Source.
* It reads the change of state over batches of a particular partition.
*/
class StateStoreChangeDataPartitionReader(
storeConf: StateStoreConf,
hadoopConf: SerializableConfiguration,
partition: StateStoreInputPartition,
schema: StructType,
stateStoreMetadata: Array[StateMetadataTableEntry])
extends StatePartitionReader(storeConf, hadoopConf, partition, schema, stateStoreMetadata) {

private lazy val changeDataReader: StateStoreChangeDataReader = {
if (!provider.isInstanceOf[SupportsFineGrainedReplay]) {
throw StateStoreErrors.stateStoreProviderDoesNotSupportFineGrainedReplay(
provider.getClass.toString)
}
provider.asInstanceOf[SupportsFineGrainedReplay]
.getStateStoreChangeDataReader(
partition.sourceOptions.changeStartBatchId.get + 1,
partition.sourceOptions.changeEndBatchId.get + 1)
}

override protected lazy val iter: Iterator[InternalRow] = {

Contributor:

I'd say the iterator logic is simple enough that reusing it only partially makes things more complicated. Initializing the schema, the state store provider, and the store instance can be shared between the two classes (the store instance isn't even reused here) - it may be better to introduce an abstract class named StatePartitionReaderBase and move the common parts into it. (A rough sketch of that shape follows this file's diff.)

changeDataReader.iterator.map(unifyStateChangeDataRow)
}

override def close(): Unit = {
current = null
changeDataReader.close()
provider.close()
}

private def unifyStateChangeDataRow(row: (RecordType, UnsafeRow, UnsafeRow, Long)):
InternalRow = {
val result = new GenericInternalRow(5)
result.update(0, row._2)
result.update(1, row._3)
result.update(2, UTF8String.fromString(getRecordTypeAsString(row._1)))
result.update(3, row._4)
result.update(4, partition.partition)
result
}
}
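
A rough shape of the StatePartitionReaderBase refactor suggested in the review comment above - a sketch of the idea only, not the code that eventually landed:

abstract class StatePartitionReaderBase extends PartitionReader[InternalRow] {
  // The shared pieces (schema extraction, provider construction) would live here;
  // each mode only has to supply its own row iterator.
  protected def provider: StateStoreProvider
  protected def iter: Iterator[InternalRow]

  private var current: InternalRow = _

  override def next(): Boolean = {
    if (iter.hasNext) {
      current = iter.next()
      true
    } else {
      current = null
      false
    }
  }

  override def get(): InternalRow = current

  override def close(): Unit = {
    current = null
    provider.close()
  }
}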

@@ -27,7 +27,7 @@ import org.apache.spark.sql.execution.datasources.v2.state.StateSourceOptions.Jo
import org.apache.spark.sql.execution.datasources.v2.state.metadata.StateMetadataTableEntry
import org.apache.spark.sql.execution.datasources.v2.state.utils.SchemaUtil
import org.apache.spark.sql.execution.streaming.state.StateStoreConf
import org.apache.spark.sql.types.{IntegerType, StructType}
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.util.ArrayImplicits._

@@ -76,6 +76,9 @@ class StateTable(
override def properties(): util.Map[String, String] = Map.empty[String, String].asJava

private def isValidSchema(schema: StructType): Boolean = {

Contributor (@HeartSaVioR, Jul 9, 2024):

My proposal could handle both non-CDF and CDF in a single flow - this version still diverges, and every column gets its own if / else if. Have you tried my proposal?

Contributor Author (@eason-yuchen-liu, Jul 9, 2024):

Sorry, I overlooked that code. It is indeed more elegant. Thanks for the suggestion.

if (sourceOptions.readChangeFeed) {
return isValidChangeDataSchema(schema)
}

Contributor:

nit: add one empty line to clearly mark the early return.

Contributor:

Btw, we verify the same column names with the same logic regardless of the mode, so we should be able to refine the logic and reduce the redundant code:

val expectedFieldNames = if (sourceOptions.readChangeFeed) {
  Seq("key", "value", "change_type", "batch_id", "partition_id")
} else {
  Seq("key", "value", "partition_id")
}
val expectedTypes = Map("key" -> classOf[StructType], ..., "batch_id" -> classOf[LongType]) // <= should contain all 5 columns

if (schema.fieldNames.toImmutableArraySeq != expectedFieldNames) {
  false
} else {
  schema.fieldNames.forall { fieldName =>
    SchemaUtil.getSchemaAsDataType(schema, fieldName).getClass == expectedTypes(fieldName)
  }
}

The above code wasn't written with an IDE, so please treat it as a snippet and construct your own from it.

Contributor Author:

Good catch!

if (schema.fieldNames.toImmutableArraySeq != Seq("key", "value", "partition_id")) {
false
} else if (!SchemaUtil.getSchemaAsDataType(schema, "key").isInstanceOf[StructType]) {
@@ -89,6 +92,25 @@ class StateTable(
}
}

private def isValidChangeDataSchema(schema: StructType): Boolean = {
if (schema.fieldNames.toImmutableArraySeq !=
Seq("key", "value", "change_type", "batch_id", "partition_id")) {
false
} else if (!SchemaUtil.getSchemaAsDataType(schema, "key").isInstanceOf[StructType]) {
false
} else if (!SchemaUtil.getSchemaAsDataType(schema, "value").isInstanceOf[StructType]) {
false
} else if (!SchemaUtil.getSchemaAsDataType(schema, "change_type").isInstanceOf[StringType]) {
false
} else if (!SchemaUtil.getSchemaAsDataType(schema, "batch_id").isInstanceOf[LongType]) {
false
} else if (!SchemaUtil.getSchemaAsDataType(schema, "partition_id").isInstanceOf[IntegerType]) {
false
} else {
true
}
}

override def metadataColumns(): Array[MetadataColumn] = Array.empty
}
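
Following the reviewer's snippet above, one way the two validation methods could be folded into a single flow (a sketch under the same assumptions, using the isInstanceOf checks already present in this file rather than class equality):

private def isValidSchema(schema: StructType): Boolean = {
  val expectedFieldNames = if (sourceOptions.readChangeFeed) {
    Seq("key", "value", "change_type", "batch_id", "partition_id")
  } else {
    Seq("key", "value", "partition_id")
  }

  if (schema.fieldNames.toImmutableArraySeq != expectedFieldNames) {
    false
  } else {
    expectedFieldNames.forall { fieldName =>
      val dataType = SchemaUtil.getSchemaAsDataType(schema, fieldName)
      fieldName match {
        case "key" | "value" => dataType.isInstanceOf[StructType]
        case "change_type" => dataType.isInstanceOf[StringType]
        case "batch_id" => dataType.isInstanceOf[LongType]
        case "partition_id" => dataType.isInstanceOf[IntegerType]
      }
    }
  }
}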


@@ -978,4 +978,47 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with

result
}

override def getStateStoreChangeDataReader(startVersion: Long, endVersion: Long):
StateStoreChangeDataReader = {
new HDFSBackedStateStoreChangeDataReader(fm, baseDir, startVersion, endVersion,
CompressionCodec.createCodec(sparkConf, storeConf.compressionCodec),
keySchema, valueSchema)
}
}

/** [[StateStoreChangeDataReader]] implementation for [[HDFSBackedStateStoreProvider]] */
class HDFSBackedStateStoreChangeDataReader(
fm: CheckpointFileManager,
stateLocation: Path,
startVersion: Long,
endVersion: Long,
compressionCodec: CompressionCodec,
keySchema: StructType,
valueSchema: StructType)
extends StateStoreChangeDataReader(
fm, stateLocation, startVersion, endVersion, compressionCodec) {

override protected var changelogSuffix: String = "delta"

override def getNext(): (RecordType.Value, UnsafeRow, UnsafeRow, Long) = {
val reader = currentChangelogReader()
if (reader == null) {
return null
}
val (recordType, keyArray, valueArray, _) = reader.next()
val keyRow = new UnsafeRow(keySchema.fields.length)
keyRow.pointTo(keyArray, keyArray.length)
if (valueArray == null) {
(recordType, keyRow, null, currentChangelogVersion - 1)
} else {
val valueRow = new UnsafeRow(valueSchema.fields.length)
// If valueSize in the existing file is not a multiple of 8, floor it to a multiple of 8.
// This is a workaround for the following: prior to Spark 2.3, `RowBasedKeyValueBatch`
// mistakenly appended 4 bytes to the value row, which got persisted into the checkpoint data.
valueRow.pointTo(valueArray, (valueArray.length / 8) * 8)
(recordType, keyRow, valueRow, currentChangelogVersion - 1)
}
}
}
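
A note on the off-by-one visible here and in StateStoreChangeDataPartitionReader above: a state store version is written when a batch commits, so (assuming the usual convention that batch N produces store version N + 1) a changelog for version V describes the changes made by batch V - 1.

// The mapping assumed by the change feed reader (a note, not PR code):
def changelogVersionForBatch(batchId: Long): Long = batchId + 1    // why the reader requests start/end + 1
def batchIdForChangelogVersion(version: Long): Long = version - 1  // why getNext() reports version - 1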

@@ -18,16 +18,20 @@
package org.apache.spark.sql.execution.streaming.state

import java.io._
import java.util.concurrent.ConcurrentHashMap

import scala.util.control.NonFatal

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.{SparkConf, SparkEnv}
import org.apache.spark.internal.{Logging, MDC}
import org.apache.spark.internal.LogKeys._
import org.apache.spark.io.CompressionCodec
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.errors.QueryExecutionErrors
import org.apache.spark.sql.execution.streaming.CheckpointFileManager
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.Utils

@@ -392,6 +396,19 @@ private[sql] class RocksDBStateStoreProvider
case e: Throwable => throw QueryExecutionErrors.cannotLoadStore(e)
}
}

override def getStateStoreChangeDataReader(startVersion: Long, endVersion: Long):
StateStoreChangeDataReader = {
val statePath = stateStoreId.storeCheckpointLocation()
val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf)
new RocksDBStateStoreChangeDataReader(
CheckpointFileManager.create(statePath, hadoopConf),
statePath,
startVersion,
endVersion,
CompressionCodec.createCodec(sparkConf, storeConf.compressionCodec),
keyValueEncoderMap)
}
}

object RocksDBStateStoreProvider {
@@ -487,3 +504,34 @@ object RocksDBStateStoreProvider {
CUSTOM_METRIC_PINNED_BLOCKS_MEM_USAGE, CUSTOM_METRIC_NUM_EXTERNAL_COL_FAMILIES,
CUSTOM_METRIC_NUM_INTERNAL_COL_FAMILIES)
}

/** [[StateStoreChangeDataReader]] implementation for [[RocksDBStateStoreProvider]] */
class RocksDBStateStoreChangeDataReader(
fm: CheckpointFileManager,
stateLocation: Path,
startVersion: Long,
endVersion: Long,
compressionCodec: CompressionCodec,
keyValueEncoderMap:
ConcurrentHashMap[String, (RocksDBKeyStateEncoder, RocksDBValueStateEncoder)])
extends StateStoreChangeDataReader(
fm, stateLocation, startVersion, endVersion, compressionCodec) {

override protected var changelogSuffix: String = "changelog"

override def getNext(): (RecordType.Value, UnsafeRow, UnsafeRow, Long) = {
val reader = currentChangelogReader()
if (reader == null) {
return null
}
val (recordType, keyArray, valueArray, columnFamily) = reader.next()
val (rocksDBKeyStateEncoder, rocksDBValueStateEncoder) = keyValueEncoderMap.get(columnFamily)
val keyRow = rocksDBKeyStateEncoder.decodeKey(keyArray)
if (valueArray == null) {
(recordType, keyRow, null, currentChangelogVersion - 1)
} else {
val valueRow = rocksDBValueStateEncoder.decodeValue(valueArray)
(recordType, keyRow, valueRow, currentChangelogVersion - 1)
}
}
}