From 6a703e9e0e34ae47ff2118e51f526895f0ffff6e Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Tue, 6 Feb 2018 16:08:15 +0900
Subject: [PATCH] [SPARK-23300][TESTS] Prints out if Pandas and PyArrow are
 installed or not in PySpark SQL tests

This PR proposes to log whether PyArrow and Pandas are installed, so that we
can tell up front whether the related tests are going to run or be skipped.
(For reference, a standalone sketch of the version probe this adds appears
after the diff below.)

Manually tested. Note that I don't have PyArrow installed in PyPy:

```bash
$ ./run-tests --python-executables=python3
```

```
...
Will test against the following Python executables: ['python3']
Will test the following Python modules: ['pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming']
Will test PyArrow related features against Python executable 'python3' in 'pyspark-sql' module.
Will test Pandas related features against Python executable 'python3' in 'pyspark-sql' module.
Starting test(python3): pyspark.mllib.tests
Starting test(python3): pyspark.sql.tests
Starting test(python3): pyspark.streaming.tests
Starting test(python3): pyspark.tests
```

```bash
$ ./run-tests --modules=pyspark-streaming
```

```
...
Will test against the following Python executables: ['python2.7', 'pypy']
Will test the following Python modules: ['pyspark-streaming']
Starting test(pypy): pyspark.streaming.tests
Starting test(pypy): pyspark.streaming.util
Starting test(python2.7): pyspark.streaming.tests
Starting test(python2.7): pyspark.streaming.util
```

```bash
$ ./run-tests
```

```
...
Will test against the following Python executables: ['python2.7', 'pypy']
Will test the following Python modules: ['pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming']
Will test PyArrow related features against Python executable 'python2.7' in 'pyspark-sql' module.
Will test Pandas related features against Python executable 'python2.7' in 'pyspark-sql' module.
Will skip PyArrow related features against Python executable 'pypy' in 'pyspark-sql' module. PyArrow >= 0.8.0 is required; however, PyArrow was not found.
Will test Pandas related features against Python executable 'pypy' in 'pyspark-sql' module.
Starting test(pypy): pyspark.streaming.tests
Starting test(pypy): pyspark.sql.tests
Starting test(pypy): pyspark.tests
Starting test(python2.7): pyspark.mllib.tests
```

```bash
$ ./run-tests --modules=pyspark-sql --python-executables=pypy
```

```
...
Will test against the following Python executables: ['pypy']
Will test the following Python modules: ['pyspark-sql']
Will skip PyArrow related features against Python executable 'pypy' in 'pyspark-sql' module. PyArrow >= 0.8.0 is required; however, PyArrow was not found.
Will test Pandas related features against Python executable 'pypy' in 'pyspark-sql' module.
Starting test(pypy): pyspark.sql.tests
Starting test(pypy): pyspark.sql.catalog
Starting test(pypy): pyspark.sql.column
Starting test(pypy): pyspark.sql.conf
```

After some modifications to produce the other cases:

```bash
$ ./run-tests
```

```
...
Will test against the following Python executables: ['python2.7', 'pypy']
Will test the following Python modules: ['pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming']
Will skip PyArrow related features against Python executable 'python2.7' in 'pyspark-sql' module. PyArrow >= 20.0.0 is required; however, PyArrow 0.8.0 was found.
Will skip Pandas related features against Python executable 'python2.7' in 'pyspark-sql' module. Pandas >= 20.0.0 is required; however, Pandas 0.20.2 was found.
Will skip PyArrow related features against Python executable 'pypy' in 'pyspark-sql' module. PyArrow >= 20.0.0 is required; however, PyArrow was not found.
Will skip Pandas related features against Python executable 'pypy' in 'pyspark-sql' module. Pandas >= 20.0.0 is required; however, Pandas 0.22.0 was found.
Starting test(pypy): pyspark.sql.tests
Starting test(pypy): pyspark.streaming.tests
Starting test(pypy): pyspark.tests
Starting test(python2.7): pyspark.mllib.tests
```

```bash
./run-tests-with-coverage
```

```
...
Will test against the following Python executables: ['python2.7', 'pypy']
Will test the following Python modules: ['pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming']
Will test PyArrow related features against Python executable 'python2.7' in 'pyspark-sql' module.
Will test Pandas related features against Python executable 'python2.7' in 'pyspark-sql' module.
Coverage is not installed in Python executable 'pypy' but 'COVERAGE_PROCESS_START' environment variable is set, exiting.
```

Author: hyukjinkwon

Closes #20473 from HyukjinKwon/SPARK-23300.
---
 python/run-tests.py | 56 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index 1341086f02db0..3539c76b911a4 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -31,6 +31,7 @@
     import Queue
 else:
     import queue as Queue
+from distutils.version import LooseVersion
 
 
 # Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module
@@ -39,7 +40,7 @@
 
 from sparktestsupport import SPARK_HOME  # noqa (suppress pep8 warnings)
 from sparktestsupport.shellutils import which, subprocess_check_output  # noqa
-from sparktestsupport.modules import all_modules  # noqa
+from sparktestsupport.modules import all_modules, pyspark_sql  # noqa
 
 
 python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals if m.name != 'root')
@@ -151,6 +152,55 @@ def parse_opts():
     return opts
 
 
+def _check_dependencies(python_exec, modules_to_test):
+    # If we should test 'pyspark-sql', it checks if PyArrow and Pandas are installed and
+    # explicitly prints out. See SPARK-23300.
+    if pyspark_sql in modules_to_test:
+        # TODO(HyukjinKwon): Relocate and deduplicate these version specifications.
+        minimum_pyarrow_version = '0.8.0'
+        minimum_pandas_version = '0.19.2'
+
+        try:
+            pyarrow_version = subprocess_check_output(
+                [python_exec, "-c", "import pyarrow; print(pyarrow.__version__)"],
+                universal_newlines=True,
+                stderr=open(os.devnull, 'w')).strip()
+            if LooseVersion(pyarrow_version) >= LooseVersion(minimum_pyarrow_version):
+                LOGGER.info("Will test PyArrow related features against Python executable "
+                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
+            else:
+                LOGGER.warning(
+                    "Will skip PyArrow related features against Python executable "
+                    "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
+                    "%s was found." % (
+                        python_exec, pyspark_sql.name, minimum_pyarrow_version, pyarrow_version))
+        except:
+            LOGGER.warning(
+                "Will skip PyArrow related features against Python executable "
+                "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
+                "was not found." % (python_exec, pyspark_sql.name, minimum_pyarrow_version))
+
+        try:
+            pandas_version = subprocess_check_output(
+                [python_exec, "-c", "import pandas; print(pandas.__version__)"],
+                universal_newlines=True,
+                stderr=open(os.devnull, 'w')).strip()
+            if LooseVersion(pandas_version) >= LooseVersion(minimum_pandas_version):
+                LOGGER.info("Will test Pandas related features against Python executable "
+                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
+            else:
+                LOGGER.warning(
+                    "Will skip Pandas related features against Python executable "
+                    "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
+                    "%s was found." % (
+                        python_exec, pyspark_sql.name, minimum_pandas_version, pandas_version))
+        except:
+            LOGGER.warning(
+                "Will skip Pandas related features against Python executable "
+                "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
+                "was not found." % (python_exec, pyspark_sql.name, minimum_pandas_version))
+
+
 def main():
     opts = parse_opts()
     if (opts.verbose):
@@ -175,6 +225,10 @@ def main():
     task_queue = Queue.PriorityQueue()
 
     for python_exec in python_execs:
+        # Check if the python executable has proper dependencies installed to run tests
+        # for given modules properly.
+        _check_dependencies(python_exec, modules_to_test)
+
         python_implementation = subprocess_check_output(
             [python_exec, "-c", "import platform; print(platform.python_implementation())"],
             universal_newlines=True).strip()
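
For reference, the check added above boils down to a single pattern: shell out to the target interpreter, print the package's `__version__`, and compare it against the minimum with `LooseVersion`. Below is a minimal standalone sketch of that pattern; the helper name `probe_version` is hypothetical and not part of the patch, while the module name, executable, and minimum version are the ones used above.

```python
import os
import subprocess
from distutils.version import LooseVersion


def probe_version(python_exec, package):
    # Hypothetical helper, not in the patch: run the target interpreter as a
    # subprocess and capture the package's __version__. Return None when the
    # interpreter is missing or the import fails (the patch's bare `except:`
    # covers the same failure modes without distinguishing them).
    try:
        output = subprocess.check_output(
            [python_exec, "-c", "import %s; print(%s.__version__)" % (package, package)],
            universal_newlines=True,
            stderr=open(os.devnull, 'w'))
        return output.strip()
    except (subprocess.CalledProcessError, OSError):
        return None


if __name__ == "__main__":
    minimum_pyarrow_version = '0.8.0'  # same minimum as in the patch
    version = probe_version("python2.7", "pyarrow")
    if version is not None and LooseVersion(version) >= LooseVersion(minimum_pyarrow_version):
        print("Will test PyArrow related features.")
    else:
        print("Will skip PyArrow related features.")
```

Probing via a subprocess, rather than importing in the driver process, matters here because the test runner loops over several Python executables, each with its own site-packages.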
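The comparison goes through `LooseVersion` rather than plain string comparison because version strings order lexicographically, which mis-ranks multi-digit components. A quick illustration:

```python
from distutils.version import LooseVersion

# Plain string comparison is lexicographic: '1' < '8', so "0.10.0" sorts
# before "0.8.0" even though 0.10.0 is the newer release.
print("0.10.0" < "0.8.0")                                # True (wrong order)

# LooseVersion splits the string into numeric components before comparing.
print(LooseVersion("0.10.0") > LooseVersion("0.8.0"))    # True (correct order)
print(LooseVersion("0.19.2") >= LooseVersion("0.19.2"))  # True (minimum is inclusive)
```

`distutils.version` was the idiomatic choice when this patch was written; `distutils` has since been deprecated in favor of `packaging.version`.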