apache · HyukjinKwon · Jan 9, 2018 · Jan 11, 2018 · Jan 13, 2018 · ueshin
diff --git a/.gitignore b/.gitignore
@@ -62,6 +62,8 @@ project/plugins/src_managed/
 project/plugins/target/
 python/lib/pyspark.zip
 python/deps
+python/test_coverage/coverage_data
+python/test_coverage/htmlcov
 python/pyspark/python
 reports/
 scalastyle-on-compile.generated.xml

diff --git a/python/.coveragerc b/python/.coveragerc
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+[run]
+branch = true
+parallel = true
+data_file = ${COVERAGE_DIR}/coverage_data/coverage
diff --git a/python/run-tests-with-coverage b/python/run-tests-with-coverage
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -o pipefail
+set -e
+
+# This variable indicates which coverage executable to run to combine coverages
+# and generate HTMLs, for example, 'coverage3' in Python 3.
+COV_EXEC="${COV_EXEC:-coverage}"
+FWDIR="$(cd "$(dirname "$0")"; pwd)"
+pushd "$FWDIR" > /dev/null
+
+# Ensure that coverage executable is installed.
+if ! hash "$COV_EXEC" 2>/dev/null; then
+  echo "Missing coverage executable in your path, skipping PySpark coverage"
+  exit 1
+fi
+
+# Set up the directories for coverage results.
+export COVERAGE_DIR="$FWDIR/test_coverage"
+rm -fr "$COVERAGE_DIR/coverage_data"
+rm -fr "$COVERAGE_DIR/htmlcov"
+mkdir -p "$COVERAGE_DIR/coverage_data"
+
+# The current directory is added to the Python path so that it doesn't refer to our built
+# pyspark zip library first.
+export PYTHONPATH="$FWDIR:$PYTHONPATH"
+# Also, our sitecustomize.py and coverage_daemon.py are included in the path.
+export PYTHONPATH="$COVERAGE_DIR:$PYTHONPATH"
+
+# We use 'spark.python.daemon.module' configuration to insert the coverage supported workers.
+export SPARK_CONF_DIR="$COVERAGE_DIR/conf"
+
+# This environment variable enables the coverage.
+export COVERAGE_PROCESS_START="$FWDIR/.coveragerc"
+
+# If you'd like to run a specific unittest class, you could do such as
+# SPARK_TESTING=1 ../bin/pyspark pyspark.sql.tests VectorizedUDFTests
+./run-tests "$@"
+
+# Don't run coverage for the coverage command itself
+unset COVERAGE_PROCESS_START
+
+# Coverage may leave empty data files; remove them (a no-op if none) to avoid combine warnings.
+find "$COVERAGE_DIR/coverage_data" -type f -size 0 -print0 | xargs -0 rm -f
+echo "Combining collected coverage data under $COVERAGE_DIR/coverage_data"
+$COV_EXEC combine
+echo "Reporting the coverage data at $COVERAGE_DIR/coverage_data/coverage"
+$COV_EXEC report --include "pyspark/*"
+echo "Generating HTML files for PySpark coverage under $COVERAGE_DIR/htmlcov"
+$COV_EXEC html --ignore-errors --include "pyspark/*" --directory "$COVERAGE_DIR/htmlcov"
+
+popd
diff --git a/python/run-tests.py b/python/run-tests.py
@@ -38,7 +38,7 @@
 
 
 from sparktestsupport import SPARK_HOME  # noqa (suppress pep8 warnings)
-from sparktestsupport.shellutils import which, subprocess_check_output  # noqa
+from sparktestsupport.shellutils import which, subprocess_check_output, run_cmd  # noqa
 from sparktestsupport.modules import all_modules  # noqa
 
 
@@ -175,6 +175,9 @@ def main():
 
     task_queue = Queue.PriorityQueue()
     for python_exec in python_execs:
+        if "COVERAGE_PROCESS_START" in os.environ:
+            # Make sure coverage is installed for this Python executable.
+            run_cmd([python_exec, "-c", "import coverage"])
         python_implementation = subprocess_check_output(
             [python_exec, "-c", "import platform; print(platform.python_implementation())"],
             universal_newlines=True).strip()

diff --git a/python/test_coverage/conf/spark-defaults.conf b/python/test_coverage/conf/spark-defaults.conf
@@ -0,0 +1,21 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# This is used to generate PySpark coverage results. Seems there's no way to
+# add a configuration when SPARK_TESTING environment variable is set because
+# we will directly execute modules by python -m.
+spark.python.daemon.module coverage_daemon
diff --git a/python/test_coverage/coverage_daemon.py b/python/test_coverage/coverage_daemon.py
@@ -0,0 +1,45 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import imp
+
+
+# This is a hack to always refer to the main code rather than the built zip.
+main_code_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
+daemon = imp.load_source("daemon", "%s/pyspark/daemon.py" % main_code_dir)
+
+if "COVERAGE_PROCESS_START" in os.environ:
+    worker = imp.load_source("worker", "%s/pyspark/worker.py" % main_code_dir)
+
+    def _cov_wrapped(*args, **kwargs):
+        import coverage
+        cov = coverage.coverage(
+            config_file=os.environ["COVERAGE_PROCESS_START"])
+        cov.start()
+        try:
+            worker.main(*args, **kwargs)
+        finally:
+            cov.stop()
+            cov.save()
+    daemon.worker_main = _cov_wrapped
+else:
+    raise RuntimeError("COVERAGE_PROCESS_START environment variable is not set, exiting.")
+
+
+if __name__ == '__main__':
+    daemon.manager()
diff --git a/python/test_coverage/sitecustomize.py b/python/test_coverage/sitecustomize.py
@@ -0,0 +1,23 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Note that this 'sitecustomize' module is a built-in feature in Python.
+# If this module is defined, it's executed when the Python session begins.
+# `coverage.process_startup()` checks whether the COVERAGE_PROCESS_START environment
+# variable is set. If it is set, coverage measurement is started for this process.
+import coverage
+coverage.process_startup()