[SPARK-23120][PYSPARK][ML] Add basic PMML export support to PySpark

## What changes were proposed in this pull request?

Adds basic PMML export support for Spark ML stages to PySpark, as was previously done in Scala. Includes LinearRegressionModel as the first stage to implement it.

## How was this patch tested?

Doctest; the main testing work for this is on the Scala side. (TODO holden add the unittest once I finish locally).

Author: Holden Karau <[email protected]>

Closes #21172 from holdenk/SPARK-23120-add-pmml-export-support-to-pyspark.
apache · Jun 28, 2018 · a95a4af · a95a4af
1 parent 524827f
commit a95a4af
Show file tree

Hide file tree

Showing 3 changed files with 65 additions and 1 deletion.
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
@@ -95,6 +95,7 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction
     True
     >>> model.numFeatures
     1
+    >>> model.write().format("pmml").save(model_path + "_2")
 
     .. versionadded:: 1.4.0
     """
@@ -161,7 +162,7 @@ def getEpsilon(self):
         return self.getOrDefault(self.epsilon)
 
 
-class LinearRegressionModel(JavaModel, JavaPredictionModel, JavaMLWritable, JavaMLReadable):
+class LinearRegressionModel(JavaModel, JavaPredictionModel, GeneralJavaMLWritable, JavaMLReadable):
     """
     Model fitted by :class:`LinearRegression`.
 

diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
@@ -1362,6 +1362,23 @@ def test_linear_regression(self):
         except OSError:
             pass
 
+    def test_linear_regression_pmml_basic(self):
+        # Most of the validation is done on the Scala side; here we just check
+        # that we output text rather than parquet (i.e. that the format flag
+        # was respected).
+        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
+                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
+                                        ["label", "weight", "features"])
+        lr = LinearRegression(maxIter=1)
+        model = lr.fit(df)
+        path = tempfile.mkdtemp()
+        lr_path = path + "/lr-pmml"
+        model.write().format("pmml").save(lr_path)
+        pmml_text_list = self.sc.textFile(lr_path).collect()
+        pmml_text = "\n".join(pmml_text_list)
+        self.assertIn("Apache Spark", pmml_text)
+        self.assertIn("PMML", pmml_text)
+
     def test_logistic_regression(self):
         lr = LogisticRegression(maxIter=1)
         path = tempfile.mkdtemp()

diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py
@@ -148,6 +148,23 @@ def overwrite(self):
         return self
 
 
+@inherit_doc
+class GeneralMLWriter(MLWriter):
+    """
+    Utility class that can save ML instances in different formats.
+
+    .. versionadded:: 2.4.0
+    """
+
+    def format(self, source):
+        """
+        Specifies the format of ML export (e.g. "pmml", "internal", or the fully qualified class
+        name for export). Stores it in ``self.source`` and returns this writer for chaining.
+        """
+        self.source = source
+        return self
+
+
 @inherit_doc
 class JavaMLWriter(MLWriter):
     """
@@ -192,6 +209,24 @@ def session(self, sparkSession):
         return self
 
 
+@inherit_doc
+class GeneralJavaMLWriter(JavaMLWriter):
+    """
+    (Private) Specialization of :py:class:`GeneralMLWriter` for :py:class:`JavaParams` types.
+    """
+
+    def __init__(self, instance):
+        super(GeneralJavaMLWriter, self).__init__(instance)
+
+    def format(self, source):
+        """
+        Specifies the format of ML export (e.g. "pmml", "internal", or the fully qualified class
+        name for export). Forwards it to ``self._jwrite`` and returns this writer for chaining.
+        """
+        self._jwrite.format(source)
+        return self
+
+
 @inherit_doc
 class MLWritable(object):
     """
@@ -220,6 +255,17 @@ def write(self):
         return JavaMLWriter(self)
 
 
+@inherit_doc
+class GeneralJavaMLWritable(JavaMLWritable):
+    """
+    (Private) Mixin for ML instances that provide :py:class:`GeneralJavaMLWriter`.
+    """
+
+    def write(self):
+        """Returns a GeneralJavaMLWriter instance for this ML instance."""
+        return GeneralJavaMLWriter(self)
+
+
 @inherit_doc
 class MLReader(BaseReadWrite):
     """