Improve Avro GenericRecord and SpecificRecord based row-level extractor performance #723

Merged · 4 commits · Oct 5, 2022
File: AnchorExtractor.scala
@@ -1,7 +1,5 @@
 package com.linkedin.feathr.common

-import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
-
 /**
  * Provides feature values based on some "raw" data element
  *
@@ -39,12 +37,14 @@ trait AnchorExtractor[T] extends AnchorExtractorBase[T] with SparkRowExtractor {
    * @param datum input row
    * @return list of feature keys
    */
-  def getKeyFromRow(datum: GenericRowWithSchema): Seq[String] = getKey(datum.asInstanceOf[T])
+  def getKeyFromRow(datum: Any): Seq[String] = getKey(datum.asInstanceOf[T])

   /**
    * Get the feature value from the row
    * @param datum input row
    * @return A map of feature name to feature value
    */
-  def getFeaturesFromRow(datum: GenericRowWithSchema): Map[String, FeatureValue] = getFeatures(datum.asInstanceOf[T])
+  def getFeaturesFromRow(datum: Any): Map[String, FeatureValue] = getFeatures(datum.asInstanceOf[T])

   override def toString: String = getClass.getSimpleName
 }
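The practical effect of widening these signatures: a typed extractor keeps working exactly as before, because the default methods still cast the datum back to `T`. A minimal sketch, assuming a hypothetical `PageViewExtractor` with invented field names and an assumed `FeatureValue` constructor (check which factory your Feathr version actually exposes):

```scala
import com.linkedin.feathr.common.{AnchorExtractor, FeatureValue}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema

// Hypothetical row-based extractor. It compiles unchanged after this PR:
// the inherited getKeyFromRow(Any)/getFeaturesFromRow(Any) defaults cast
// the datum back to T (here GenericRowWithSchema) before delegating.
// Other abstract members of the trait, if any in your version, are omitted.
class PageViewExtractor extends AnchorExtractor[GenericRowWithSchema] {

  override def getKey(datum: GenericRowWithSchema): Seq[String] =
    Seq(datum.getAs[String]("memberId"))

  override def getFeatures(datum: GenericRowWithSchema): Map[String, FeatureValue] =
    // Assumed FeatureValue construction; substitute your version's factory.
    Map("pageViewCount" -> new FeatureValue(datum.getAs[Long]("pageViewCount")))
}
```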
File: CanConvertToAvroRDD.scala (new file)
@@ -0,0 +1,20 @@
+package com.linkedin.feathr.common
+
+import org.apache.avro.generic.IndexedRecord
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.DataFrame
+
+/**
+ * If an AnchorExtractor only works on an Avro record, it should extend
+ * this trait and use convertToAvroRdd to do a one-time batch conversion of the DataFrame to an RDD of its choice.
+ * convertToAvroRdd will be called by the Feathr engine before calling getKeyFromRow() and getFeaturesFromRow() in AnchorExtractor.
+ */
+trait CanConvertToAvroRDD {
+
+  /**
+   * One-time batch conversion of the input data source into an RDD[IndexedRecord] for later feature extraction.
+   * @param df input data source
+   * @return the batch-preprocessed data, as RDD[IndexedRecord]
+   */
+  def convertToAvroRdd(df: DataFrame): RDD[IndexedRecord]
+}
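For illustration, here is roughly what an extractor opting into this trait might look like; the class name, schema, fields, and `FeatureValue` construction are invented for the sketch, not taken from this PR. The point is that the Row-to-Avro conversion happens once per source in a batch pass, instead of inside every per-row `getKey`/`getFeatures` call:

```scala
import com.linkedin.feathr.common.{AnchorExtractor, CanConvertToAvroRDD, FeatureValue}
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, IndexedRecord}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

// Hypothetical extractor that consumes Avro records instead of Spark rows.
class UserClickExtractor extends AnchorExtractor[IndexedRecord] with CanConvertToAvroRDD {

  // Keep the schema as a JSON string so the mapPartitions closure stays
  // serializable; the Schema itself is parsed once per partition.
  private val schemaJson =
    """{"type":"record","name":"UserClick","fields":[
      |{"name":"userId","type":"string"},
      |{"name":"clickCount","type":"long"}]}""".stripMargin

  // Called once by the engine, before any row-level extraction.
  override def convertToAvroRdd(df: DataFrame): RDD[IndexedRecord] =
    df.rdd.mapPartitions { rows =>
      val schema = new Schema.Parser().parse(schemaJson)
      rows.map { row =>
        val record = new GenericData.Record(schema)
        record.put("userId", row.getAs[String]("userId"))
        record.put("clickCount", row.getAs[Long]("clickCount"))
        record: IndexedRecord
      }
    }

  // The Any-typed defaults in AnchorExtractor cast back to IndexedRecord.
  override def getKey(datum: IndexedRecord): Seq[String] =
    Seq(datum.get(0).toString) // field 0 = userId

  override def getFeatures(datum: IndexedRecord): Map[String, FeatureValue] =
    // Assumed FeatureValue construction; substitute your version's factory.
    Map("clickCount" -> new FeatureValue(datum.get(1)))
}
```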
File: SparkRowExtractor.scala
@@ -1,7 +1,5 @@
 package com.linkedin.feathr.common

-import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
-
 /**
  * An extractor trait that provides APIs to transform an input row into feature values
  */
@@ -12,12 +10,12 @@ trait SparkRowExtractor {
    * @param datum input row
    * @return list of feature keys
    */
-  def getKeyFromRow(datum: GenericRowWithSchema): Seq[String]
+  def getKeyFromRow(datum: Any): Seq[String]

   /**
    * Get the feature value from the row
    * @param datum input row
    * @return A map of feature name to feature value
    */
-  def getFeaturesFromRow(datum: GenericRowWithSchema): Map[String, FeatureValue]
+  def getFeaturesFromRow(datum: Any): Map[String, FeatureValue]
 }
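To make the dispatch concrete, a simplified sketch of engine-side logic (hypothetical, not the actual Feathr internals): extractors implementing CanConvertToAvroRDD get their one-time batch conversion, and both paths then flow through the same Any-typed methods above:

```scala
import com.linkedin.feathr.common.{CanConvertToAvroRDD, FeatureValue, SparkRowExtractor}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

// Hypothetical driver: choose the datum representation once per source,
// then run the same Any-typed extraction API over every datum.
def extractAll(df: DataFrame, extractor: SparkRowExtractor): RDD[(Seq[String], Map[String, FeatureValue])] = {
  val datums: RDD[Any] = extractor match {
    case avro: CanConvertToAvroRDD => avro.convertToAvroRdd(df).map(r => r: Any) // one-time batch conversion
    case _                         => df.rdd.map(r => r: Any)                    // plain Spark rows pass through
  }
  datums.map(d => (extractor.getKeyFromRow(d), extractor.getFeaturesFromRow(d)))
}
```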
File: SimpleConfigurableAnchorExtractor.scala
@@ -10,7 +10,6 @@ import com.linkedin.feathr.offline.mvel.plugins.FeathrExpressionExecutionContext
 import com.linkedin.feathr.offline.mvel.{MvelContext, MvelUtils}
 import com.linkedin.feathr.offline.util.FeatureValueTypeValidator
 import org.apache.log4j.Logger
-import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
 import org.apache.spark.sql.types._
 import org.mvel2.MVEL
@@ -66,7 +65,7 @@ private[offline] class SimpleConfigurableAnchorExtractor( @JsonProperty("key") k
    * @param datum input row
    * @return list of feature keys
    */
-  override def getKeyFromRow(datum: GenericRowWithSchema): Seq[String] = {
+  override def getKeyFromRow(datum: Any): Seq[String] = {
     getKey(datum.asInstanceOf[Any])
   }
@@ -107,7 +106,7 @@ private[offline] class SimpleConfigurableAnchorExtractor( @JsonProperty("key") k
    * @param row input row
    * @return A map of feature name to feature value
    */
-  override def getFeaturesFromRow(row: GenericRowWithSchema) = {
+  override def getFeaturesFromRow(row: Any) = {
     getFeatures(row.asInstanceOf[Any])
   }
@@ -147,7 +146,7 @@ private[offline] class SimpleConfigurableAnchorExtractor( @JsonProperty("key") k
       featureTypeConfigs(featureRefStr)
     }
     val featureValue = offline.FeatureValue.fromTypeConfig(value, featureTypeConfig)
-    FeatureValueTypeValidator.validate(featureValue, featureTypeConfigs(featureRefStr))
+    FeatureValueTypeValidator.validate(featureRefStr, featureValue, featureTypeConfigs(featureRefStr))
     (featureRefStr, featureValue)
   }
File: MVELSourceKeyExtractor.scala
@@ -43,7 +43,7 @@ private[feathr] class MVELSourceKeyExtractor(val anchorExtractorV1: AnchorExtrac
       .toDF()
   }

-  def getKey(datum: GenericRowWithSchema): Seq[String] = {
+  def getKey(datum: Any): Seq[String] = {
     anchorExtractorV1.getKeyFromRow(datum)
   }
@@ -55,7 +55,7 @@ private[feathr] class MVELSourceKeyExtractor(val anchorExtractorV1: AnchorExtrac
    */
   override def getKeyColumnNames(datum: Option[Any]): Seq[String] = {
     if (datum.isDefined) {
-      val size = getKey(datum.get.asInstanceOf[GenericRowWithSchema]).size
+      val size = getKey(datum.get).size
       (1 to size).map(JOIN_KEY_PREFIX + _)
     } else {
       // return empty join key to signal empty dataset
@@ -86,5 +86,6 @@ private[feathr] class MVELSourceKeyExtractor(val anchorExtractorV1: AnchorExtrac
   // this helps to reduce the number of joins
   // to the observation data
   // The default toString does not work, because the toString of each object has different values
-  override def toString: String = getClass.getSimpleName + " with keyExprs:" + keyExprs.mkString(" key:")
+  override def toString: String = getClass.getSimpleName + " with keyExprs:" + keyExprs.mkString(" key:") +
+    "anchorExtractor:" + anchorExtractorV1.toString
 }
File: AnchorLoader.scala
@@ -327,7 +327,7 @@ private[offline] class AnchorLoader extends JsonDeserializer[FeatureAnchor] {
         case Some(tType) => offline.FeatureValue.fromTypeConfig(rawValue, tType)
         case None => offline.FeatureValue(rawValue, featureType, key)
       }
-      FeatureValueTypeValidator.validate(featureValue, featureTypeConfig)
+      FeatureValueTypeValidator.validate(featureValue, featureTypeConfig, key)
       (key, featureValue)
     }
     .toMap
File: StreamingFeatureGenerator.scala
@@ -6,7 +6,7 @@ import com.linkedin.feathr.common.JoiningFeatureParams
 import com.linkedin.feathr.offline.config.location.KafkaEndpoint
 import com.linkedin.feathr.offline.generation.outputProcessor.PushToRedisOutputProcessor.TABLE_PARAM_CONFIG_NAME
 import com.linkedin.feathr.offline.generation.outputProcessor.RedisOutputUtils
-import com.linkedin.feathr.offline.job.FeatureTransformation.getFeatureJoinKey
+import com.linkedin.feathr.offline.job.FeatureTransformation.getFeatureKeyColumnNames
 import com.linkedin.feathr.offline.job.{FeatureGenSpec, FeatureTransformation}
 import com.linkedin.feathr.offline.logical.FeatureGroups
 import com.linkedin.feathr.offline.source.accessor.DataPathHandler
@@ -111,7 +111,7 @@ class StreamingFeatureGenerator(dataPathHandlers: List[DataPathHandler]) {
     // Apply feature transformation
     val transformedResult = DataFrameBasedSqlEvaluator.transform(anchor.featureAnchor.extractor.asInstanceOf[SimpleAnchorExtractorSpark],
       withKeyColumnDF, featureNamePrefixPairs, anchor.featureAnchor.featureTypeConfigs)
-    val outputJoinKeyColumnNames = getFeatureJoinKey(keyExtractor, withKeyColumnDF)
+    val outputJoinKeyColumnNames = getFeatureKeyColumnNames(keyExtractor, withKeyColumnDF)
     val selectedColumns = outputJoinKeyColumnNames ++ anchor.selectedFeatures.filter(keyTaggedFeatures.map(_.featureName).contains(_))
     val cleanedDF = transformedResult.df.select(selectedColumns.head, selectedColumns.tail:_*)
     val keyColumnNames = FeatureTransformation.getStandardizedKeyNames(outputJoinKeyColumnNames.size)