apache · WeichenXu123 · Dec 16, 2019 · Dec 16, 2019 · Dec 17, 2019 · Dec 18, 2019
diff --git a/mllib/src/main/scala/org/apache/spark/ml/functions.scala b/mllib/src/main/scala/org/apache/spark/ml/functions.scala
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml
+
+import org.apache.spark.annotation.Since
+import org.apache.spark.ml.linalg.Vector
+import org.apache.spark.mllib.linalg.{Vector => OldVector}
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.functions.udf
+
+// scalastyle:off
+@Since("3.0.0")
+object functions {
+// scalastyle:on
+
+  /**
+   * UDF backing [[vector_to_array]]. The argument is typed as `Any` so that a single UDF can
+   * serve both `org.apache.spark.ml.linalg.Vector` and `org.apache.spark.mllib.linalg.Vector`
+   * values; the concrete type is resolved per value by the pattern match below.
+   */
+  private[ml] val vector_to_array_udf = udf { vec: Any =>
+    vec match {
+      case v: Vector => v.toArray
+      case v: OldVector => v.toArray
+      // Everything else is rejected, including null (a typed pattern never matches null).
+      // The message names what was actually received so the failure can be diagnosed.
+      case v => throw new IllegalArgumentException(
+        "function vector_to_array requires a non-null input argument and input type must be " +
+        "`org.apache.spark.ml.linalg.Vector` or `org.apache.spark.mllib.linalg.Vector`, " +
+        s"but got ${ if (v == null) "null" else v.getClass.getName }.")
+    }
+  }
+
+  /**
+   * Converts a column of MLlib sparse/dense vectors into a column of dense arrays.
+   *
+   * Both `org.apache.spark.ml.linalg.Vector` and `org.apache.spark.mllib.linalg.Vector` values
+   * are accepted. For any other value, including null, the underlying UDF throws an
+   * `IllegalArgumentException`. NOTE(review): the UDF body runs when values are evaluated, so
+   * how that exception is surfaced to the caller depends on Spark's UDF execution - confirm.
+   *
+   * @param v column holding the vectors to convert
+   * @return a column holding each vector's `toArray` result
+   * @since 3.0.0
+   */
+  def vector_to_array(v: Column): Column = vector_to_array_udf(v)
+}
diff --git a/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/FunctionsSuite.scala
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml
+
+import org.apache.spark.ml.functions.vector_to_array
+import org.apache.spark.ml.linalg.Vectors
+import org.apache.spark.ml.util.MLTest
+import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
+
+class FunctionsSuite extends MLTest {
+
+  import testImplicits._
+
+  test("test vector_to_array") {
+    // Each row pairs a new-API (ml) vector with an old-API (mllib) vector.
+    // The first row is dense, the second is sparse with an implicit zero at index 1.
+    val denseRow = (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0))
+    val sparseRow = (
+      Vectors.sparse(3, Seq((0, 2.0), (2, 3.0))),
+      OldVectors.sparse(3, Seq((0, 20.0), (2, 30.0))))
+    val input = Seq(denseRow, sparseRow).toDF("vec", "oldVec")
+
+    val converted = input
+      .select(vector_to_array($"vec"), vector_to_array($"oldVec"))
+      .as[(List[Double], List[Double])]
+      .collect()
+
+    // Sparse vectors must come back fully materialized, zeros included.
+    val denseExpected = (List(1.0, 2.0, 3.0), List(10.0, 20.0, 30.0))
+    val sparseExpected = (List(2.0, 0.0, 3.0), List(20.0, 0.0, 30.0))
+    assert(converted === Array(denseExpected, sparseExpected))
+  }
+}
diff --git a/python/pyspark/ml/functions.py b/python/pyspark/ml/functions.py
@@ -0,0 +1,42 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from pyspark import since, SparkContext
+from pyspark.sql.column import Column, _to_java_column
+
+
+@since(3.0)
+def vector_to_array(col):
+    """
+    Convert MLlib sparse/dense vectors in a DataFrame into dense arrays.
+
+    >>> from pyspark.ml.linalg import Vectors
+    >>> from pyspark.ml.functions import vector_to_array
+    >>> from pyspark.mllib.linalg import Vectors as OldVectors
+    >>> df = spark.createDataFrame([
+    ...     (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),
+    ...     (Vectors.sparse(3, [(0, 2.0), (2, 3.0)]),
+    ...      OldVectors.sparse(3, [(0, 20.0), (2, 30.0)]))],
+    ...     ["vec", "oldVec"])
+    >>> df.select(vector_to_array("vec").alias("vec"),
+    ...           vector_to_array("oldVec").alias("oldVec")).collect()
+    [Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),
+     Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]
+    """
+    sc = SparkContext._active_spark_context
+    return Column(
+        sc._jvm.org.apache.spark.ml.functions.vector_to_array(_to_java_column(col)))