From f0249bdae0dd2f2d65b96dce2e6ec37e70b9be13 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 17 Jun 2015 17:26:34 -0700 Subject: [PATCH 01/14] WIP --- dev/run-tests.py | 230 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 205 insertions(+), 25 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index c64c71f4f723f..e30085efe7798 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -28,6 +28,199 @@ USER_HOME = os.environ.get("HOME") +all_modules = [] + + +class Module(object): + + def __init__(self, name, dependencies, source_file_regexes, sbt_test_goals=(), + should_run_python_tests=False, should_run_r_tests=False): + self.name = name + self.dependencies = dependencies + self.source_file_prefixes = source_file_regexes + self.sbt_test_goals = sbt_test_goals + self.should_run_python_tests = should_run_python_tests + self.should_run_r_tests = should_run_r_tests + + self.dependent_modules = set() + for dep in dependencies: + dep.dependent_modules.add(self) + all_modules.append(self) + + def contains_file(self, filename): + return any(re.match(p, filename) for p in self.source_file_prefixes) + + +root = Module( + name="root", + dependencies=[], + source_file_regexes=[], + sbt_test_goals=[ + "test", + ] +) + + +sql = Module( + name="sql", + dependencies=[], + source_file_regexes=[ + "sql/(?!hive-thriftserver)", + "bin/spark-sql", + "examples/src/main/java/org/apache/spark/examples/sql/", + "examples/src/main/scala/org/apache/spark/examples/sql/", + ], + sbt_test_goals=[ + "catalyst/test", + "sql/test", + "hive/test", + ]) + + +hive_thriftserver = Module( + name="hive-thriftserver", + dependencies=[sql], + source_file_regexes=[ + "sql/hive-thriftserver", + "sbin/start-thriftserver.sh", + ], + sbt_test_goals=[ + "hive-thriftserver/test", + ] +) + + +mllib = Module( + name="mllib", + dependencies=[sql], + source_file_regexes=[ + "examples/src/main/java/org/apache/spark/examples/mllib/", + "examples/src/main/scala/org/apache/spark/examples/mllib", + "data/mllib/", + "mllib/", + ], + sbt_test_goals=[ + "mllib/test", + "examples/test", + ] +) + + +graphx = Module( + name="graphx", + dependencies=[], + source_file_regexes=[ + "graphx/", + ], + sbt_test_goals=[ + "graphx/test" + ] +) + + +streaming = Module( + name="streaming", + dependencies=[], + source_file_regexes=[ + "external/", + "extras/java8-tests/", + "extras/kinesis-asl/", + "streaming", + ], + sbt_test_goals=[ + "streaming/test", + "streaming-flume/test", + "streaming-flume-sink/test", + "streaming-kafka/test", + "streaming-mqtt/test", + "streaming-twitter/test", + "streaming-zeromq/test", + ] +) + + +examples = Module( + name="examples", + dependencies=[graphx, mllib, streaming, sql], + source_file_regexes=[ + "examples/", + ], + sbt_test_goals=[ + "examples/test", + ] +) + + +pyspark = Module( + name="pyspark", + dependencies=[mllib, streaming, sql], + source_file_regexes=[ + "python/" + ], + should_run_python_tests=True +) + + +sparkr = Module( + name="sparkr", + dependencies=[sql, mllib], + source_file_regexes=[ + "R/", + ], + should_run_r_tests=True +) + + +docs = Module( + name="docs", + dependencies=[], + source_file_regexes=[ + "docs/", + ] +) + + +def determine_modules(filenames): + """ + Given a list of filenames, return the set of modules that contain those files. + If a file is not associated with a more specific submodule, then this method will consider that + file to belong to the 'root' module. 
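+
+    Matching is anchored at the start of each path (contains_file uses
+    re.match), so negative-lookahead patterns can carve a submodule out of a
+    parent directory. An illustrative check against the modules defined above:
+
+    >>> sql.contains_file("sql/hive-thriftserver/pom.xml")
+    False
+    >>> hive_thriftserver.contains_file("sql/hive-thriftserver/pom.xml")
+    True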
+ + >>> sorted(x.name for x in determine_modules(["python/pyspark/a.py", "sql/test/foo"])) + ['pyspark', 'sql'] + >>> [x.name for x in determine_modules(["file_not_matched_by_any_subproject"])] + ['root'] + """ + changed_modules = set() + for filename in filenames: + matched_at_least_one_module = False + for module in all_modules: + if module.contains_file(filename): + changed_modules.add(module) + matched_at_least_one_module = True + if not matched_at_least_one_module: + changed_modules.add(root) + return changed_modules + + +def determine_modules_to_test(changed_modules): + """ + Given a set of modules that have changed, compute the transitive closure of those modules' + dependent modules in order to determine the set of modules that should be tested. + + >>> sorted(x.name for x in determine_modules_to_test([root])) + ['root'] + >>> sorted(x.name for x in determine_modules_to_test([graphx])) + ['examples', 'graphx'] + >>> sorted(x.name for x in determine_modules_to_test([sql])) + ['examples', 'hive-thriftserver', 'mllib', 'pyspark', 'sparkr', 'sql'] + """ + modules_to_test = set() + for module in changed_modules: + modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) + return modules_to_test.union(set(changed_modules)) + + def get_error_codes(err_code_file): """Function to retrieve all block numbers from the `run-tests-codes.sh` file to maintain backwards compatibility with the `run-tests-jenkins` @@ -43,7 +236,7 @@ def get_error_codes(err_code_file): def exit_from_command_with_retcode(cmd, retcode): - print "[error] running", cmd, "; received return code", retcode + print "[error] running", cmd.join(' '), "; received return code", retcode sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) @@ -177,14 +370,14 @@ def build_spark_documentation(): os.chdir(SPARK_HOME) -def exec_maven(mvn_args=[]): +def exec_maven(mvn_args=()): """Will call Maven in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args) -def exec_sbt(sbt_args=[]): +def exec_sbt(sbt_args=()): """Will call SBT in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" @@ -231,7 +424,7 @@ def get_hadoop_profiles(hadoop_version): sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) -def get_build_profiles(hadoop_version="hadoop2.3", +def get_build_profiles(hadoop_version, enable_base_profiles=True, enable_hive_profiles=False, enable_doc_profiles=False): @@ -318,19 +511,6 @@ def identify_changed_modules(test_env): # remove any empty strings changed_files = [f for f in raw_output.split('\n') if f] - sql_files = [f for f in changed_files - if any(f.startswith(p) for p in - ["sql/", - "bin/spark-sql", - "sbin/start-thriftserver.sh", - "examples/src/main/java/org/apache/spark/examples/sql/", - "examples/src/main/scala/org/apache/spark/examples/sql/"])] - mllib_files = [f for f in changed_files - if any(f.startswith(p) for p in - ["examples/src/main/java/org/apache/spark/examples/mllib/", - "examples/src/main/scala/org/apache/spark/examples/mllib", - "data/mllib/", - "mllib/"])] streaming_files = [f for f in changed_files if any(f.startswith(p) for p in ["examples/scala-2.10/", @@ -356,12 +536,6 @@ def identify_changed_modules(test_env): if changed_core_files: changed_modules.add("CORE") - if sql_files: - print "[info] Detected changes in SQL. Will run Hive test suite." 
- changed_modules.add("SQL") - if mllib_files: - print "[info] Detected changes in MLlib. Will run MLlib test suite." - changed_modules.add("MLLIB") if streaming_files: print "[info] Detected changes in Streaming. Will run Streaming test suite." changed_modules.add("STREAMING") @@ -416,8 +590,6 @@ def run_scala_tests_sbt(test_modules, test_profiles): "streaming-twitter/test", "streaming-zeromq/test", "examples/test"] - if "GRAPHX" in test_modules and "CORE" not in test_modules: - sbt_test_goals += ["graphx/test", "examples/test"] if not sbt_test_goals: sbt_test_goals = ["test"] @@ -532,5 +704,13 @@ def main(): run_python_tests() run_sparkr_tests() + +def _test(): + import doctest + (failure_count, test_count) = doctest.testmod() + if failure_count: + exit(-1) + if __name__ == "__main__": + _test() main() From f53864b8ff8b21e685b42166c94cedc4e4bd00c7 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 17 Jun 2015 18:05:27 -0700 Subject: [PATCH 02/14] Finish integrating module changes --- dev/run-tests.py | 179 ++++++++++++++++++++--------------------------- 1 file changed, 76 insertions(+), 103 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index e30085efe7798..53fdabc8708b1 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -17,6 +17,7 @@ # limitations under the License. # +import itertools import os import re import sys @@ -28,6 +29,11 @@ USER_HOME = os.environ.get("HOME") +# ------------------------------------------------------------------------------------------------- +# Test module definitions and functions for traversing module dependency graph +# ------------------------------------------------------------------------------------------------- + + all_modules = [] @@ -57,7 +63,9 @@ def contains_file(self, filename): source_file_regexes=[], sbt_test_goals=[ "test", - ] + ], + should_run_python_tests=True, + should_run_r_tests=True ) @@ -180,15 +188,15 @@ def contains_file(self, filename): ) -def determine_modules(filenames): +def determine_modules_for_files(filenames): """ Given a list of filenames, return the set of modules that contain those files. If a file is not associated with a more specific submodule, then this method will consider that file to belong to the 'root' module. - >>> sorted(x.name for x in determine_modules(["python/pyspark/a.py", "sql/test/foo"])) + >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"])) ['pyspark', 'sql'] - >>> [x.name for x in determine_modules(["file_not_matched_by_any_subproject"])] + >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])] ['root'] """ changed_modules = set() @@ -203,6 +211,33 @@ def determine_modules(filenames): return changed_modules +def identify_changed_modules_from_git_commits(patch_sha, target_branch=None, target_ref=None): + """ + Given a git commit and target ref, use the set of files changed in the diff in order to + determine which modules' tests should be run. 
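+
+    Exactly one of target_branch and target_ref may be supplied: a branch name
+    is first fetched from origin and then diffed against, while a bare ref
+    (for example, a commit SHA) is diffed against directly.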
+ + >>> [x.name for x in \ + identify_changed_modules_from_git_commits("fc0a1475ef", target_ref="5da21f07")] + ['graphx'] + >>> 'root' in [x.name for x in \ + identify_changed_modules_from_git_commits("50a0496a43", target_ref="6765ef9")] + True + """ + if target_branch is None and target_ref is None: + raise AttributeError("must specify either target_branch or target_ref") + elif target_branch is not None and target_ref is not None: + raise AttributeError("must specify either target_branch or target_ref, not both") + if target_branch is not None: + diff_target = target_branch + run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) + else: + diff_target = target_ref + raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target]) + # Remove any empty strings + changed_files = [f for f in raw_output.split('\n') if f] + return determine_modules_for_files(changed_files) + + def determine_modules_to_test(changed_modules): """ Given a set of modules that have changed, compute the transitive closure of those modules' @@ -218,8 +253,16 @@ def determine_modules_to_test(changed_modules): modules_to_test = set() for module in changed_modules: modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) - return modules_to_test.union(set(changed_modules)) + modules_to_test = modules_to_test.union(set(changed_modules)) + if root in modules_to_test: + return [root] + else: + return modules_to_test + +# ------------------------------------------------------------------------------------------------- +# Functions for working with subprocesses and shell tools +# ------------------------------------------------------------------------------------------------- def get_error_codes(err_code_file): """Function to retrieve all block numbers from the `run-tests-codes.sh` @@ -275,7 +318,7 @@ def which(program): """Find and return the given program by its absolute path or 'None' - from: http://stackoverflow.com/a/377028""" - fpath, fname = os.path.split(program) + fpath = os.path.split(program)[0] if fpath: if is_exe(program): @@ -327,6 +370,11 @@ def determine_java_version(java_exe): update=version_info[3]) +# ------------------------------------------------------------------------------------------------- +# Functions for running the other build and test scripts +# ------------------------------------------------------------------------------------------------- + + def set_title_and_block(title, err_block): os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block] line_str = '=' * 72 @@ -494,65 +542,6 @@ def detect_binary_inop_with_mima(): run_cmd([os.path.join(SPARK_HOME, "dev", "mima")]) -def identify_changed_modules(test_env): - """Given the passed in environment will determine the changed modules and - return them as a set. If the environment is local, will simply run all tests. 
- If run under the `amplab_jenkins` environment will determine the changed files - as compared to the `ghprbTargetBranch` and execute the necessary set of tests - to provide coverage for the changed code.""" - changed_modules = set() - - if test_env == "amplab_jenkins": - target_branch = os.environ["ghprbTargetBranch"] - - run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) - - raw_output = subprocess.check_output(['git', 'diff', '--name-only', target_branch]) - # remove any empty strings - changed_files = [f for f in raw_output.split('\n') if f] - - streaming_files = [f for f in changed_files - if any(f.startswith(p) for p in - ["examples/scala-2.10/", - "examples/src/main/java/org/apache/spark/examples/streaming/", - "examples/src/main/scala/org/apache/spark/examples/streaming/", - "external/", - "extras/java8-tests/", - "extras/kinesis-asl/", - "streaming/"])] - graphx_files = [f for f in changed_files - if any(f.startswith(p) for p in - ["examples/src/main/scala/org/apache/spark/examples/graphx/", - "graphx/"])] - doc_files = [f for f in changed_files if f.startswith("docs/")] - - # union together all changed top level project files - top_level_project_files = set().union(*[set(f) for f in [sql_files, - mllib_files, - streaming_files, - graphx_files, - doc_files]]) - changed_core_files = set(changed_files).difference(top_level_project_files) - - if changed_core_files: - changed_modules.add("CORE") - if streaming_files: - print "[info] Detected changes in Streaming. Will run Streaming test suite." - changed_modules.add("STREAMING") - if graphx_files: - print "[info] Detected changes in GraphX. Will run GraphX test suite." - changed_modules.add("GRAPHX") - if doc_files: - print "[info] Detected changes in documentation. Will build spark with documentation." 
- changed_modules.add("DOCS") - - return changed_modules - else: - # we aren't in the Amplab environment so simply run all tests - changed_modules.add("ALL") - return changed_modules - - def run_scala_tests_maven(test_profiles): mvn_test_goals = ["test", "--fail-at-end"] profiles_and_goals = test_profiles + mvn_test_goals @@ -564,34 +553,8 @@ def run_scala_tests_maven(test_profiles): def run_scala_tests_sbt(test_modules, test_profiles): - # declare the variable for reference - sbt_test_goals = [] - if "ALL" in test_modules: - sbt_test_goals = ["test"] - else: - # if we only have changes in SQL, MLlib, Streaming, or GraphX then build - # a custom test list - if "SQL" in test_modules and "CORE" not in test_modules: - sbt_test_goals += ["catalyst/test", - "sql/test", - "hive/test", - "hive-thriftserver/test", - "mllib/test", - "examples/test"] - if "MLLIB" in test_modules and "CORE" not in test_modules: - sbt_test_goals += ["mllib/test", "examples/test"] - if "STREAMING" in test_modules and "CORE" not in test_modules: - sbt_test_goals += ["streaming/test", - "streaming-flume/test", - "streaming-flume-sink/test", - "streaming-kafka/test", - "streaming-mqtt/test", - "streaming-twitter/test", - "streaming-zeromq/test", - "examples/test"] - if not sbt_test_goals: - sbt_test_goals = ["test"] + sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules)) profiles_and_goals = test_profiles + sbt_test_goals @@ -608,7 +571,7 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): test_modules = set(test_modules) - hive_profiles = ("SQL" in test_modules) + hive_profiles = (sql in test_modules) test_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=hive_profiles) if build_tool == "maven": @@ -677,16 +640,23 @@ def main(): print "[info] Using build tool", build_tool, "with profile", hadoop_version, print "under environment", test_env - # determine high level changes - changed_modules = identify_changed_modules(test_env) - print "[info] Found the following changed modules:", ", ".join(changed_modules) + changed_modules = [root] + if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): + target_branch = os.environ["ghprbTargetBranch"] + changed_modules = identify_changed_modules_from_git_commits("HEAD", + target_branch=target_branch) + print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules) + + test_modules = determine_modules_to_test(changed_modules) # license checks run_apache_rat_checks() # style checks - run_scala_style_checks() - run_python_style_checks() + if changed_modules != [docs]: + run_scala_style_checks() + if any(m.should_run_python_tests for m in changed_modules): + run_python_style_checks() # determine if docs were changed and if we're inside the amplab environment # note - the below commented out until *all* Jenkins workers can get `jekyll` installed @@ -700,17 +670,20 @@ def main(): detect_binary_inop_with_mima() # run the test suites - run_scala_tests(build_tool, hadoop_version, changed_modules) - run_python_tests() - run_sparkr_tests() + run_scala_tests(build_tool, hadoop_version, test_modules) + + if any(m.should_run_python_tests for m in test_modules): + run_python_tests() + if any(m.should_run_r_tests for m in test_modules): + run_sparkr_tests() def _test(): import doctest - (failure_count, test_count) = doctest.testmod() + failure_count = doctest.testmod()[0] if failure_count: exit(-1) if __name__ == "__main__": _test() - main() + main() \ No newline at end of file From 
37f3fb3f7f11ce25370caa5c8ebf42f60e56227f Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 17 Jun 2015 18:08:08 -0700 Subject: [PATCH 03/14] Remove doc profiles option, since it's not actually needed (see #6865) --- dev/run-tests.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 53fdabc8708b1..56d6a6430a1cb 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -474,14 +474,12 @@ def get_hadoop_profiles(hadoop_version): def get_build_profiles(hadoop_version, enable_base_profiles=True, - enable_hive_profiles=False, - enable_doc_profiles=False): + enable_hive_profiles=False): """Returns a list of hadoop profiles to be used as looked up from the passed in hadoop profile key with the option of adding on the base and hive profiles.""" base_profiles = ["-Pkinesis-asl"] hive_profiles = ["-Phive", "-Phive-thriftserver"] - doc_profiles = [] hadoop_profiles = get_hadoop_profiles(hadoop_version) build_profiles = hadoop_profiles @@ -492,9 +490,6 @@ def get_build_profiles(hadoop_version, if enable_hive_profiles: build_profiles += hive_profiles - if enable_doc_profiles: - build_profiles += doc_profiles - return build_profiles From 33714415a294ddb8044829438e08a2b25d7c8caa Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 17 Jun 2015 18:10:06 -0700 Subject: [PATCH 04/14] Test everything if nothing has changed (needed for non-PRB builds) --- dev/run-tests.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 56d6a6430a1cb..87de4016c65d0 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -635,11 +635,13 @@ def main(): print "[info] Using build tool", build_tool, "with profile", hadoop_version, print "under environment", test_env - changed_modules = [root] + changed_modules = None if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): target_branch = os.environ["ghprbTargetBranch"] changed_modules = identify_changed_modules_from_git_commits("HEAD", target_branch=target_branch) + if not changed_modules: + changed_modules = ['root'] print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules) test_modules = determine_modules_to_test(changed_modules) From 43a0ced49eea09831d9bc333af9e20a75c7a79cf Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 17 Jun 2015 18:11:56 -0700 Subject: [PATCH 05/14] Minor fixes --- dev/run-tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 87de4016c65d0..8af8dd258a657 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -551,7 +551,7 @@ def run_scala_tests_sbt(test_modules, test_profiles): sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules)) - profiles_and_goals = test_profiles + sbt_test_goals + profiles_and_goals = test_profiles + list(sbt_test_goals) print "[info] Running Spark tests using SBT with these arguments:", print " ".join(profiles_and_goals) @@ -641,7 +641,7 @@ def main(): changed_modules = identify_changed_modules_from_git_commits("HEAD", target_branch=target_branch) if not changed_modules: - changed_modules = ['root'] + changed_modules = [root] print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules) test_modules = determine_modules_to_test(changed_modules) From 7092d3e37c21662175e5991ec766c79b121019ac Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 17 Jun 2015 18:55:41 -0700 Subject: [PATCH 06/14] Skip SBT tests if no test 
goals are specified --- dev/run-tests.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev/run-tests.py b/dev/run-tests.py index 8af8dd258a657..95ecbf8995898 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -551,6 +551,9 @@ def run_scala_tests_sbt(test_modules, test_profiles): sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules)) + if not sbt_test_goals: + return + profiles_and_goals = test_profiles + list(sbt_test_goals) print "[info] Running Spark tests using SBT with these arguments:", From dc6f1c62c96d8119ee7a0ec0370aa28473a315ff Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 18 Jun 2015 23:28:17 -0700 Subject: [PATCH 07/14] Use changed files' extensions to decide whether to run style checks --- dev/run-tests.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 95ecbf8995898..8c118f41df30b 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -211,16 +211,16 @@ def determine_modules_for_files(filenames): return changed_modules -def identify_changed_modules_from_git_commits(patch_sha, target_branch=None, target_ref=None): +def identify_changed_files_from_git_commits(patch_sha, target_branch=None, target_ref=None): """ Given a git commit and target ref, use the set of files changed in the diff in order to determine which modules' tests should be run. - >>> [x.name for x in \ - identify_changed_modules_from_git_commits("fc0a1475ef", target_ref="5da21f07")] + >>> [x.name for x in determine_modules_for_files( \ + identify_changed_files_from_git_commits("fc0a1475ef", target_ref="5da21f07"))] ['graphx'] - >>> 'root' in [x.name for x in \ - identify_changed_modules_from_git_commits("50a0496a43", target_ref="6765ef9")] + >>> 'root' in [x.name for x in determine_modules_for_files( \ + identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))] True """ if target_branch is None and target_ref is None: @@ -234,8 +234,7 @@ def identify_changed_modules_from_git_commits(patch_sha, target_branch=None, tar diff_target = target_ref raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target]) # Remove any empty strings - changed_files = [f for f in raw_output.split('\n') if f] - return determine_modules_for_files(changed_files) + return [f for f in raw_output.split('\n') if f] def determine_modules_to_test(changed_modules): @@ -639,10 +638,11 @@ def main(): print "under environment", test_env changed_modules = None + changed_files = None if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): target_branch = os.environ["ghprbTargetBranch"] - changed_modules = identify_changed_modules_from_git_commits("HEAD", - target_branch=target_branch) + changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) + changed_modules = determine_modules_for_files(changed_files) if not changed_modules: changed_modules = [root] print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules) @@ -653,9 +653,9 @@ def main(): run_apache_rat_checks() # style checks - if changed_modules != [docs]: + if not changed_files or any(f.endsWith(".scala") for f in changed_files): run_scala_style_checks() - if any(m.should_run_python_tests for m in changed_modules): + if not changed_files or any(f.endsWith(".py") for f in changed_files): run_python_style_checks() # determine if docs were changed and if we're inside the amplab environment @@ -686,4 +686,4 @@ def 
_test(): if __name__ == "__main__": _test() - main() \ No newline at end of file + main() From 3670d50e8e4f6fa99732721be19c13e9ed3ff6cb Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 18 Jun 2015 23:29:24 -0700 Subject: [PATCH 08/14] mllib should depend on streaming --- dev/run-tests.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 8c118f41df30b..d5fc51114fff1 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -98,22 +98,6 @@ def contains_file(self, filename): ) -mllib = Module( - name="mllib", - dependencies=[sql], - source_file_regexes=[ - "examples/src/main/java/org/apache/spark/examples/mllib/", - "examples/src/main/scala/org/apache/spark/examples/mllib", - "data/mllib/", - "mllib/", - ], - sbt_test_goals=[ - "mllib/test", - "examples/test", - ] -) - - graphx = Module( name="graphx", dependencies=[], @@ -147,6 +131,22 @@ def contains_file(self, filename): ) +mllib = Module( + name="mllib", + dependencies=[streaming, sql], + source_file_regexes=[ + "examples/src/main/java/org/apache/spark/examples/mllib/", + "examples/src/main/scala/org/apache/spark/examples/mllib", + "data/mllib/", + "mllib/", + ], + sbt_test_goals=[ + "mllib/test", + "examples/test", + ] +) + + examples = Module( name="examples", dependencies=[graphx, mllib, streaming, sql], From df10e23ae658caf75a5a265ce6b117611f5e0674 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 18 Jun 2015 23:32:19 -0700 Subject: [PATCH 09/14] update to reflect fact that no module depends on root --- dev/run-tests.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index d5fc51114fff1..dbbef1a616449 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -249,14 +249,15 @@ def determine_modules_to_test(changed_modules): >>> sorted(x.name for x in determine_modules_to_test([sql])) ['examples', 'hive-thriftserver', 'mllib', 'pyspark', 'sparkr', 'sql'] """ + # If we're going to have to run all of the tests, then we can just short-circuit + # and return 'root'. No module depends on root, so if it appears then it will be + # in changed_modules. 
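+    # (Without this short-circuit the traversal below would still terminate,
+    # but it would redundantly return every other changed module alongside
+    # root, whose "test" goal already runs the entire suite.)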
+ if root in changed_modules: + return [root] modules_to_test = set() for module in changed_modules: modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) - modules_to_test = modules_to_test.union(set(changed_modules)) - if root in modules_to_test: - return [root] - else: - return modules_to_test + return modules_to_test.union(set(changed_modules)) # ------------------------------------------------------------------------------------------------- From 35a3052db88550f81b7e78615a7ef252bd9ef929 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 18 Jun 2015 23:47:06 -0700 Subject: [PATCH 10/14] Enable Hive tests when running all tests --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index dbbef1a616449..c37cd2670eefc 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -569,7 +569,7 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): test_modules = set(test_modules) - hive_profiles = (sql in test_modules) + hive_profiles = (sql in test_modules or root in test_modules) test_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=hive_profiles) if build_tool == "maven": From e46539fb9140d2b0c1d6d616a6b9d4c4f697d061 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 18 Jun 2015 23:49:54 -0700 Subject: [PATCH 11/14] Fix camel-cased endswith() --- dev/run-tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index c37cd2670eefc..cc8466b1baa3a 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -654,9 +654,9 @@ def main(): run_apache_rat_checks() # style checks - if not changed_files or any(f.endsWith(".scala") for f in changed_files): + if not changed_files or any(f.endswith(".scala") for f in changed_files): run_scala_style_checks() - if not changed_files or any(f.endsWith(".py") for f in changed_files): + if not changed_files or any(f.endswith(".py") for f in changed_files): run_python_style_checks() # determine if docs were changed and if we're inside the amplab environment From a86a953443411aac830b615cbfc06763520a6f2f Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 19 Jun 2015 15:38:54 -0700 Subject: [PATCH 12/14] Clean up modules; add new modules for streaming external projects --- dev/run-tests.py | 106 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 93 insertions(+), 13 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index cc8466b1baa3a..bc97d031ffcc7 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -75,8 +75,6 @@ def contains_file(self, filename): source_file_regexes=[ "sql/(?!hive-thriftserver)", "bin/spark-sql", - "examples/src/main/java/org/apache/spark/examples/sql/", - "examples/src/main/scala/org/apache/spark/examples/sql/", ], sbt_test_goals=[ "catalyst/test", @@ -114,35 +112,108 @@ def contains_file(self, filename): name="streaming", dependencies=[], source_file_regexes=[ - "external/", - "extras/java8-tests/", - "extras/kinesis-asl/", "streaming", ], sbt_test_goals=[ "streaming/test", - "streaming-flume/test", - "streaming-flume-sink/test", - "streaming-kafka/test", - "streaming-mqtt/test", - "streaming-twitter/test", + ] +) + + +streaming_kinesis_asl = Module( + name="kinesis-asl", + dependencies=[streaming], + source_file_regexes=[ + "extras/kinesis-asl/", + ], + sbt_test_goals=[ + "kinesis-asl/test", + ] +) + + +streaming_zeromq = Module( + name="streaming-zeromq", + dependencies=[streaming], + source_file_regexes=[ + 
"external/zeromq", + ], + sbt_test_goals=[ "streaming-zeromq/test", ] ) +streaming_twitter = Module( + name="streaming-twitter", + dependencies=[streaming], + source_file_regexes=[ + "external/twitter", + ], + sbt_test_goals=[ + "streaming-twitter/test", + ] +) + + +streaming_mqqt = Module( + name="streaming-mqqt", + dependencies=[streaming], + source_file_regexes=[ + "external/mqqt", + ], + sbt_test_goals=[ + "streaming-mqqt/test", + ] +) + + +streaming_kafka = Module( + name="streaming-kafka", + dependencies=[streaming], + source_file_regexes=[ + "external/kafka", + "external/kafka-assembly", + ], + sbt_test_goals=[ + "streaming-kafka/test", + ] +) + + +streaming_flume_sink = Module( + name="streaming-flume-sink", + dependencies=[streaming], + source_file_regexes=[ + "external/flume-sink", + ], + sbt_test_goals=[ + "streaming-flume-sink/test", + ] +) + + +streaming_flume = Module( + name="streaming_flume", + dependencies=[streaming], + source_file_regexes=[ + "external/flume", + ], + sbt_test_goals=[ + "streaming-flume/test", + ] +) + + mllib = Module( name="mllib", dependencies=[streaming, sql], source_file_regexes=[ - "examples/src/main/java/org/apache/spark/examples/mllib/", - "examples/src/main/scala/org/apache/spark/examples/mllib", "data/mllib/", "mllib/", ], sbt_test_goals=[ "mllib/test", - "examples/test", ] ) @@ -188,6 +259,15 @@ def contains_file(self, filename): ) +ec2 = Module( + name="ec2", + dependencies=[], + source_file_regexes=[ + "ec2/", + ] +) + + def determine_modules_for_files(filenames): """ Given a list of filenames, return the set of modules that contain those files. From 4224da5164f77cc684d3995db4570cfa23b9c852 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 19 Jun 2015 21:58:54 -0700 Subject: [PATCH 13/14] Add documentation to Module. --- dev/run-tests.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index bc97d031ffcc7..45b1b4aa46d3a 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -38,9 +38,31 @@ class Module(object): + """ + A module is the basic abstraction in our test runner script. Each module consists of a set of + source files, a set of test commands, and a set of dependencies on other modules. We use modules + to define a dependency graph that lets determine which tests to run based on which files have + changed. + """ def __init__(self, name, dependencies, source_file_regexes, sbt_test_goals=(), should_run_python_tests=False, should_run_r_tests=False): + """ + Define a new module. + + :param name: A short module name, for display in logging and error messages. + :param dependencies: A set of dependencies for this module. This should only include direct + dependencies; transitive dependencies are resolved automatically. + :param source_file_regexes: a set of regexes that match source files belonging to this + module. These regexes are applied by attempting to match at the beginning of the + filename strings. + :param sbt_test_goals: A set of SBT test goals for testing this module/ + :param should_run_python_tests: If true, changes in this module will trigger Python tests. + For now, this has the effect of causing _all_ Python tests to be run, although in the + future this should be changed to run only a subset of the Python tests that depend + on this module. + :param should_run_r_tests: If true, changes in this module will trigger all R tests. 
+ """ self.name = name self.dependencies = dependencies self.source_file_prefixes = source_file_regexes @@ -80,7 +102,8 @@ def contains_file(self, filename): "catalyst/test", "sql/test", "hive/test", - ]) + ] +) hive_thriftserver = Module( @@ -534,8 +557,10 @@ def exec_sbt(sbt_args=()): def get_hadoop_profiles(hadoop_version): - """Return a list of profiles indicating which Hadoop version to use from - a Hadoop version tag.""" + """ + For the given Hadoop version tag, return a list of SBT profile flags for + building and testing against that Hadoop version. + """ sbt_maven_hadoop_profiles = { "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.0.4"], From 75de4506e302d8fa6a5f69465c6b1ee1ba8eb596 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 19 Jun 2015 22:29:18 -0700 Subject: [PATCH 14/14] Use module system to determine which build profiles to enable. --- dev/run-tests.py | 84 +++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 44 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 45b1b4aa46d3a..474889ec2d8aa 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -45,8 +45,8 @@ class Module(object): changed. """ - def __init__(self, name, dependencies, source_file_regexes, sbt_test_goals=(), - should_run_python_tests=False, should_run_r_tests=False): + def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), + sbt_test_goals=(), should_run_python_tests=False, should_run_r_tests=False): """ Define a new module. @@ -56,7 +56,9 @@ def __init__(self, name, dependencies, source_file_regexes, sbt_test_goals=(), :param source_file_regexes: a set of regexes that match source files belonging to this module. These regexes are applied by attempting to match at the beginning of the filename strings. - :param sbt_test_goals: A set of SBT test goals for testing this module/ + :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in + order to build and test this module (e.g. '-PprofileName'). + :param sbt_test_goals: A set of SBT test goals for testing this module. :param should_run_python_tests: If true, changes in this module will trigger Python tests. 
For now, this has the effect of causing _all_ Python tests to be run, although in the future this should be changed to run only a subset of the Python tests that depend @@ -67,6 +69,7 @@ def __init__(self, name, dependencies, source_file_regexes, sbt_test_goals=(), self.dependencies = dependencies self.source_file_prefixes = source_file_regexes self.sbt_test_goals = sbt_test_goals + self.build_profile_flags = build_profile_flags self.should_run_python_tests = should_run_python_tests self.should_run_r_tests = should_run_r_tests @@ -79,18 +82,6 @@ def contains_file(self, filename): return any(re.match(p, filename) for p in self.source_file_prefixes) -root = Module( - name="root", - dependencies=[], - source_file_regexes=[], - sbt_test_goals=[ - "test", - ], - should_run_python_tests=True, - should_run_r_tests=True -) - - sql = Module( name="sql", dependencies=[], @@ -98,6 +89,9 @@ def contains_file(self, filename): "sql/(?!hive-thriftserver)", "bin/spark-sql", ], + build_profile_flags=[ + "-Phive", + ], sbt_test_goals=[ "catalyst/test", "sql/test", @@ -113,6 +107,9 @@ def contains_file(self, filename): "sql/hive-thriftserver", "sbin/start-thriftserver.sh", ], + build_profile_flags=[ + "-Phive-thriftserver", + ], sbt_test_goals=[ "hive-thriftserver/test", ] @@ -149,6 +146,9 @@ def contains_file(self, filename): source_file_regexes=[ "extras/kinesis-asl/", ], + build_profile_flags=[ + "-Pkinesis-asl", + ], sbt_test_goals=[ "kinesis-asl/test", ] @@ -291,6 +291,23 @@ def contains_file(self, filename): ) +# The root module is a dummy module which is used to run all of the tests. +# No other modules should directly depend on this module. +root = Module( + name="root", + dependencies=[], + source_file_regexes=[], + # In order to run all of the tests, enable every test profile: + build_profile_flags= + list(set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), + sbt_test_goals=[ + "test", + ], + should_run_python_tests=True, + should_run_r_tests=True +) + + def determine_modules_for_files(filenames): """ Given a list of filenames, return the set of modules that contain those files. 
@@ -382,7 +399,7 @@ def get_error_codes(err_code_file): def exit_from_command_with_retcode(cmd, retcode): - print "[error] running", cmd.join(' '), "; received return code", retcode + print "[error] running", ' '.join(cmd), "; received return code", retcode sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) @@ -577,30 +594,9 @@ def get_hadoop_profiles(hadoop_version): sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) -def get_build_profiles(hadoop_version, - enable_base_profiles=True, - enable_hive_profiles=False): - """Returns a list of hadoop profiles to be used as looked up from the passed in hadoop profile - key with the option of adding on the base and hive profiles.""" - - base_profiles = ["-Pkinesis-asl"] - hive_profiles = ["-Phive", "-Phive-thriftserver"] - hadoop_profiles = get_hadoop_profiles(hadoop_version) - - build_profiles = hadoop_profiles - - if enable_base_profiles: - build_profiles += base_profiles - - if enable_hive_profiles: - build_profiles += hive_profiles - - return build_profiles - - def build_spark_maven(hadoop_version): - # we always build with Hive support even if we skip Hive tests in most builds - build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True) + # Enable all of the profiles for the build: + build_profiles = get_hadoop_profiles(hadoop_version) + root.build_profile_flags mvn_goals = ["clean", "package", "-DskipTests"] profiles_and_goals = build_profiles + mvn_goals @@ -611,7 +607,8 @@ def build_spark_maven(hadoop_version): def build_spark_sbt(hadoop_version): - build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True) + # Enable all of the profiles for the build: + build_profiles = get_hadoop_profiles(hadoop_version) + root.build_profile_flags sbt_goals = ["package", "assembly/assembly", "streaming-kafka-assembly/assembly"] @@ -674,9 +671,8 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): test_modules = set(test_modules) - hive_profiles = (sql in test_modules or root in test_modules) - test_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=hive_profiles) - + test_profiles = get_hadoop_profiles(hadoop_version) + \ + list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules))) if build_tool == "maven": run_scala_tests_maven(test_profiles) else: @@ -740,7 +736,7 @@ def main(): hadoop_version = "hadoop2.3" test_env = "local" - print "[info] Using build tool", build_tool, "with profile", hadoop_version, + print "[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, print "under environment", test_env changed_modules = None