Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
zhengruifeng committed Jan 25, 2025
1 parent a45eb2b commit 23b3b82
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ org.apache.spark.ml.recommendation.ALS
org.apache.spark.ml.fpm.FPGrowth

# feature
org.apache.spark.ml.feature.Imputer
org.apache.spark.ml.feature.StandardScaler
org.apache.spark.ml.feature.MaxAbsScaler
org.apache.spark.ml.feature.MinMaxScaler
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ org.apache.spark.ml.recommendation.ALSModel
org.apache.spark.ml.fpm.FPGrowthModel

# feature
org.apache.spark.ml.feature.ImputerModel
org.apache.spark.ml.feature.StandardScalerModel
org.apache.spark.ml.feature.MaxAbsScalerModel
org.apache.spark.ml.feature.MinMaxScalerModel
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,8 @@ class ImputerModel private[ml] (

import ImputerModel._

/**
 * Zero-argument auxiliary constructor: delegates to the primary constructor with a
 * freshly generated random UID prefixed "imputer" and `null` as the second argument.
 *
 * NOTE(review): presumably this exists so the class can be instantiated without
 * arguments (e.g. reflectively); an instance built this way carries a null second
 * constructor argument - confirm against callers before relying on it.
 */
private[ml] def this() = this(Identifiable.randomUID("imputer"), null)

/**
 * Sets the `inputCol` param to `value` and returns this instance.
 *
 * @group setParam
 */
@Since("3.0.0")
def setInputCol(value: String): this.type = set(inputCol, value)
Expand Down
1 change: 1 addition & 0 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -2261,6 +2261,7 @@ def setOutputCol(self, value: str) -> "ImputerModel":

@property
@since("2.2.0")
@try_remote_attribute_relation
def surrogateDF(self) -> DataFrame:
"""
Returns a DataFrame containing inputCols and their corresponding surrogates,
Expand Down
42 changes: 42 additions & 0 deletions python/pyspark/ml/tests/test_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
HashingTF,
IDF,
IDFModel,
Imputer,
ImputerModel,
NGram,
RFormula,
Tokenizer,
Expand Down Expand Up @@ -541,6 +543,46 @@ def test_word2vec(self):
model2 = Word2VecModel.load(d)
self.assertEqual(str(model), str(model2))

def test_imputer(self):
    """Exercise Imputer / ImputerModel: params, fit, transform, save and load."""
    spark = self.spark
    df = spark.createDataFrame(
        [
            (1.0, float("nan")),
            (2.0, float("nan")),
            (float("nan"), 3.0),
            (4.0, 4.0),
            (5.0, 5.0),
        ],
        ["a", "b"],
    )

    imputer = Imputer(strategy="mean")
    imputer.setInputCols(["a", "b"])
    imputer.setOutputCols(["out_a", "out_b"])

    self.assertEqual(imputer.getStrategy(), "mean")
    self.assertEqual(imputer.getInputCols(), ["a", "b"])
    self.assertEqual(imputer.getOutputCols(), ["out_a", "out_b"])

    model = imputer.fit(df)

    # Read the property once and reuse the DataFrame for all three checks.
    surrogate = model.surrogateDF
    self.assertEqual(surrogate.columns, ["a", "b"])
    self.assertEqual(surrogate.count(), 1)
    # Means of the non-NaN entries: a -> (1+2+4+5)/4 = 3.0, b -> (3+4+5)/3 = 4.0.
    self.assertEqual(list(surrogate.head()), [3.0, 4.0])

    output = model.transform(df)
    self.assertEqual(output.columns, ["a", "b", "out_a", "out_b"])
    self.assertEqual(output.count(), 5)

    # save & load
    # The prefix names this test, so a leftover temp directory is attributable to it.
    with tempfile.TemporaryDirectory(prefix="imputer") as d:
        imputer.write().overwrite().save(d)
        imputer2 = Imputer.load(d)
        self.assertEqual(str(imputer), str(imputer2))

        # The model deliberately reuses the same directory; overwrite() replaces
        # the estimator that was saved there above.
        model.write().overwrite().save(d)
        model2 = ImputerModel.load(d)
        self.assertEqual(str(model), str(model2))

def test_count_vectorizer(self):
df = self.spark.createDataFrame(
[(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,7 @@ private[ml] object MLUtils {
(classOf[FPGrowthModel], Set("associationRules", "freqItemsets")),

// Feature Models
(classOf[ImputerModel], Set("surrogateDF")),
(classOf[StandardScalerModel], Set("mean", "std")),
(classOf[MaxAbsScalerModel], Set("maxAbs")),
(classOf[MinMaxScalerModel], Set("originalMax", "originalMin")),
Expand Down

0 comments on commit 23b3b82

Please sign in to comment.