diff --git a/docs/ml-features.md b/docs/ml-features.md
index 72643137d96b1..f2992e26fd492 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1283,6 +1283,57 @@ for more details on the API.
+## VectorSizeHint
+
+It can sometimes be useful to explicitly specify the size of the vectors for a column of
+`VectorType`. For example, `VectorAssembler` uses size information from its input columns to
+produce size information and metadata for its output column. While in some cases this information
+can be obtained by inspecting the contents of the column, in a streaming dataframe the contents are
+not available until the stream is started. `VectorSizeHint` allows a user to explicitly specify the
+vector size for a column so that `VectorAssembler`, or other transformers that might
+need to know vector size, can use that column as an input.
+
+To use `VectorSizeHint` a user must set the `inputCol` and `size` parameters. Applying this
+transformer to a dataframe produces a new dataframe with updated metadata for `inputCol` specifying
+the vector size. Downstream operations on the resulting dataframe can get this size using the
+metadata.
+
+`VectorSizeHint` can also take an optional `handleInvalid` parameter which controls its
+behaviour when the vector column contains nulls or vectors of the wrong size. By default
+`handleInvalid` is set to "error", indicating an exception should be thrown. This parameter can
+also be set to "skip", indicating that rows containing invalid values should be filtered out from
+the resulting dataframe, or "optimistic", indicating that the column should not be checked for
+invalid values and all rows should be kept. Note that the use of "optimistic" can cause the
+resulting dataframe to be in an inconsistent state, meaning the metadata for the column
+`VectorSizeHint` was applied to does not match the contents of that column. Users should take care
+to avoid this kind of inconsistent state.
+
+
+
+
+Refer to the [VectorSizeHint Scala docs](api/scala/index.html#org.apache.spark.ml.feature.VectorSizeHint)
+for more details on the API.
+
+{% include_example scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala %}
+
+
+
+
+Refer to the [VectorSizeHint Java docs](api/java/org/apache/spark/ml/feature/VectorSizeHint.html)
+for more details on the API.
+
+{% include_example java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java %}
+
+
+
+
+Refer to the [VectorSizeHint Python docs](api/python/pyspark.ml.html#pyspark.ml.feature.VectorSizeHint)
+for more details on the API.
+
+{% include_example python/ml/vector_size_hint_example.py %}
+
+
+
## QuantileDiscretizer
`QuantileDiscretizer` takes a column with continuous features and outputs a column with binned
diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java
new file mode 100644
index 0000000000000..d649a2ccbaa72
--- /dev/null
+++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaVectorSizeHintExample.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.ml;
+
+import org.apache.spark.sql.SparkSession;
+
+// $example on$
+import java.util.Arrays;
+
+import org.apache.spark.ml.feature.VectorAssembler;
+import org.apache.spark.ml.feature.VectorSizeHint;
+import org.apache.spark.ml.linalg.VectorUDT;
+import org.apache.spark.ml.linalg.Vectors;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import static org.apache.spark.sql.types.DataTypes.*;
+// $example off$
+
+/** Shows how VectorSizeHint declares a vector column's size so VectorAssembler can consume it. */
+public class JavaVectorSizeHintExample {
+  public static void main(String[] args) {
+    SparkSession spark = SparkSession.builder()
+      .appName("JavaVectorSizeHintExample")
+      .getOrCreate();
+
+    // $example on$
+    StructType schema = createStructType(new StructField[]{
+      createStructField("id", IntegerType, false),
+      createStructField("hour", IntegerType, false),
+      createStructField("mobile", DoubleType, false),
+      createStructField("userFeatures", new VectorUDT(), false),
+      createStructField("clicked", DoubleType, false)
+    });
+    Row row0 = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0);
+    Row row1 = RowFactory.create(0, 18, 1.0, Vectors.dense(0.0, 10.0), 0.0);  // 2 entries, not 3
+    Dataset<Row> dataset = spark.createDataFrame(Arrays.asList(row0, row1), schema);
+
+    VectorSizeHint sizeHint = new VectorSizeHint()
+      .setInputCol("userFeatures")
+      .setHandleInvalid("skip")  // filter out rows whose vector is null or the wrong size
+      .setSize(3);
+
+    Dataset<Row> datasetWithSize = sizeHint.transform(dataset);
+    System.out.println("Rows where 'userFeatures' is not the right size are filtered out");
+    datasetWithSize.show(false);
+
+    VectorAssembler assembler = new VectorAssembler()
+      .setInputCols(new String[]{"hour", "mobile", "userFeatures"})
+      .setOutputCol("features");
+
+    // This dataframe can be used by downstream transformers as before
+    Dataset<Row> output = assembler.transform(datasetWithSize);
+    System.out.println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column " +
+        "'features'");
+    output.select("features", "clicked").show(false);
+    // $example off$
+
+    spark.stop();
+  }
+}
+
diff --git a/examples/src/main/python/ml/vector_size_hint_example.py b/examples/src/main/python/ml/vector_size_hint_example.py
new file mode 100644
index 0000000000000..fb77dacec629d
--- /dev/null
+++ b/examples/src/main/python/ml/vector_size_hint_example.py
@@ -0,0 +1,57 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from __future__ import print_function
+
+# $example on$
+from pyspark.ml.linalg import Vectors
+from pyspark.ml.feature import (VectorSizeHint, VectorAssembler)
+# $example off$
+from pyspark.sql import SparkSession
+
+if __name__ == "__main__":
+ spark = SparkSession\
+ .builder\
+ .appName("VectorSizeHintExample")\
+ .getOrCreate()
+
+ # $example on$
+ dataset = spark.createDataFrame(
+ [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0),
+ (0, 18, 1.0, Vectors.dense([0.0, 10.0]), 0.0)],
+ ["id", "hour", "mobile", "userFeatures", "clicked"])
+
+ sizeHint = VectorSizeHint(
+ inputCol="userFeatures",
+ handleInvalid="skip",
+ size=3)
+
+ datasetWithSize = sizeHint.transform(dataset)
+ print("Rows where 'userFeatures' is not the right size are filtered out")
+ datasetWithSize.show(truncate=False)
+
+ assembler = VectorAssembler(
+ inputCols=["hour", "mobile", "userFeatures"],
+ outputCol="features")
+
+ # This dataframe can be used by downstream transformers as before
+ output = assembler.transform(datasetWithSize)
+ print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
+ output.select("features", "clicked").show(truncate=False)
+ # $example off$
+
+ spark.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala
new file mode 100644
index 0000000000000..688731a791f35
--- /dev/null
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/VectorSizeHintExample.scala
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+// $example on$
+import org.apache.spark.ml.feature.{VectorAssembler, VectorSizeHint}
+import org.apache.spark.ml.linalg.Vectors
+// $example off$
+import org.apache.spark.sql.SparkSession
+
+/** Declares a vector column's size with VectorSizeHint, then feeds it to VectorAssembler. */
+object VectorSizeHintExample {
+  def main(args: Array[String]): Unit = {
+    val spark = SparkSession.builder.appName("VectorSizeHintExample").getOrCreate()
+
+    // $example on$
+    // The second row's 'userFeatures' vector has 2 entries instead of 3.
+    val rows = Seq(
+      (0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0),
+      (0, 18, 1.0, Vectors.dense(0.0, 10.0), 0.0))
+    val dataset = spark.createDataFrame(rows)
+      .toDF("id", "hour", "mobile", "userFeatures", "clicked")
+
+    // "skip" filters out rows whose vector is not of the declared size.
+    val hint = new VectorSizeHint()
+      .setInputCol("userFeatures")
+      .setHandleInvalid("skip")
+      .setSize(3)
+    val sized = hint.transform(dataset)
+
+    println("Rows where 'userFeatures' is not the right size are filtered out")
+    sized.show(truncate = false)
+
+    // Columns to assemble into the single 'features' vector column.
+    val featureColumns = Array("hour", "mobile", "userFeatures")
+    val assembler = new VectorAssembler().setInputCols(featureColumns).setOutputCol("features")
+
+    // This dataframe can be used by downstream transformers as before
+    val assembled = assembler.transform(sized)
+    println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
+    assembled.select("features", "clicked").show(truncate = false)
+    // $example off$
+
+    spark.stop()
+  }
+}
+// scalastyle:on println