From f0249bdae0dd2f2d65b96dce2e6ec37e70b9be13 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 17 Jun 2015 17:26:34 -0700 Subject: [PATCH 01/14] WIP --- dev/run-tests.py | 230 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 205 insertions(+), 25 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index c64c71f4f723f..e30085efe7798 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -28,6 +28,199 @@ USER_HOME = os.environ.get("HOME") +all_modules = [] + + +class Module(object): + + def __init__(self, name, dependencies, source_file_regexes, sbt_test_goals=(), + should_run_python_tests=False, should_run_r_tests=False): + self.name = name + self.dependencies = dependencies + self.source_file_prefixes = source_file_regexes + self.sbt_test_goals = sbt_test_goals + self.should_run_python_tests = should_run_python_tests + self.should_run_r_tests = should_run_r_tests + + self.dependent_modules = set() + for dep in dependencies: + dep.dependent_modules.add(self) + all_modules.append(self) + + def contains_file(self, filename): + return any(re.match(p, filename) for p in self.source_file_prefixes) + + +root = Module( + name="root", + dependencies=[], + source_file_regexes=[], + sbt_test_goals=[ + "test", + ] +) + + +sql = Module( + name="sql", + dependencies=[], + source_file_regexes=[ + "sql/(?!hive-thriftserver)", + "bin/spark-sql", + "examples/src/main/java/org/apache/spark/examples/sql/", + "examples/src/main/scala/org/apache/spark/examples/sql/", + ], + sbt_test_goals=[ + "catalyst/test", + "sql/test", + "hive/test", + ]) + + +hive_thriftserver = Module( + name="hive-thriftserver", + dependencies=[sql], + source_file_regexes=[ + "sql/hive-thriftserver", + "sbin/start-thriftserver.sh", + ], + sbt_test_goals=[ + "hive-thriftserver/test", + ] +) + + +mllib = Module( + name="mllib", + dependencies=[sql], + source_file_regexes=[ + "examples/src/main/java/org/apache/spark/examples/mllib/", + "examples/src/main/scala/org/apache/spark/examples/mllib", + "data/mllib/", + "mllib/", + ], + sbt_test_goals=[ + "mllib/test", + "examples/test", + ] +) + + +graphx = Module( + name="graphx", + dependencies=[], + source_file_regexes=[ + "graphx/", + ], + sbt_test_goals=[ + "graphx/test" + ] +) + + +streaming = Module( + name="streaming", + dependencies=[], + source_file_regexes=[ + "external/", + "extras/java8-tests/", + "extras/kinesis-asl/", + "streaming", + ], + sbt_test_goals=[ + "streaming/test", + "streaming-flume/test", + "streaming-flume-sink/test", + "streaming-kafka/test", + "streaming-mqtt/test", + "streaming-twitter/test", + "streaming-zeromq/test", + ] +) + + +examples = Module( + name="examples", + dependencies=[graphx, mllib, streaming, sql], + source_file_regexes=[ + "examples/", + ], + sbt_test_goals=[ + "examples/test", + ] +) + + +pyspark = Module( + name="pyspark", + dependencies=[mllib, streaming, sql], + source_file_regexes=[ + "python/" + ], + should_run_python_tests=True +) + + +sparkr = Module( + name="sparkr", + dependencies=[sql, mllib], + source_file_regexes=[ + "R/", + ], + should_run_r_tests=True +) + + +docs = Module( + name="docs", + dependencies=[], + source_file_regexes=[ + "docs/", + ] +) + + +def determine_modules(filenames): + """ + Given a list of filenames, return the set of modules that contain those files. + If a file is not associated with a more specific submodule, then this method will consider that + file to belong to the 'root' module. 
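+
+    Matching is anchored at the start of each path (contains_file uses
+    re.match), so negative-lookahead patterns can carve a submodule out of a
+    parent directory. An illustrative check against the modules defined above:
+
+    >>> sql.contains_file("sql/hive-thriftserver/pom.xml")
+    False
+    >>> hive_thriftserver.contains_file("sql/hive-thriftserver/pom.xml")
+    True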
+ + >>> sorted(x.name for x in determine_modules(["python/pyspark/a.py", "sql/test/foo"])) + ['pyspark', 'sql'] + >>> [x.name for x in determine_modules(["file_not_matched_by_any_subproject"])] + ['root'] + """ + changed_modules = set() + for filename in filenames: + matched_at_least_one_module = False + for module in all_modules: + if module.contains_file(filename): + changed_modules.add(module) + matched_at_least_one_module = True + if not matched_at_least_one_module: + changed_modules.add(root) + return changed_modules + + +def determine_modules_to_test(changed_modules): + """ + Given a set of modules that have changed, compute the transitive closure of those modules' + dependent modules in order to determine the set of modules that should be tested. + + >>> sorted(x.name for x in determine_modules_to_test([root])) + ['root'] + >>> sorted(x.name for x in determine_modules_to_test([graphx])) + ['examples', 'graphx'] + >>> sorted(x.name for x in determine_modules_to_test([sql])) + ['examples', 'hive-thriftserver', 'mllib', 'pyspark', 'sparkr', 'sql'] + """ + modules_to_test = set() + for module in changed_modules: + modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) + return modules_to_test.union(set(changed_modules)) + + def get_error_codes(err_code_file): """Function to retrieve all block numbers from the `run-tests-codes.sh` file to maintain backwards compatibility with the `run-tests-jenkins` @@ -43,7 +236,7 @@ def get_error_codes(err_code_file): def exit_from_command_with_retcode(cmd, retcode): - print "[error] running", cmd, "; received return code", retcode + print "[error] running", cmd.join(' '), "; received return code", retcode sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) @@ -177,14 +370,14 @@ def build_spark_documentation(): os.chdir(SPARK_HOME) -def exec_maven(mvn_args=[]): +def exec_maven(mvn_args=()): """Will call Maven in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" run_cmd([os.path.join(SPARK_HOME, "build", "mvn")] + mvn_args) -def exec_sbt(sbt_args=[]): +def exec_sbt(sbt_args=()): """Will call SBT in the current directory with the list of mvn_args passed in and returns the subprocess for any further processing""" @@ -231,7 +424,7 @@ def get_hadoop_profiles(hadoop_version): sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) -def get_build_profiles(hadoop_version="hadoop2.3", +def get_build_profiles(hadoop_version, enable_base_profiles=True, enable_hive_profiles=False, enable_doc_profiles=False): @@ -318,19 +511,6 @@ def identify_changed_modules(test_env): # remove any empty strings changed_files = [f for f in raw_output.split('\n') if f] - sql_files = [f for f in changed_files - if any(f.startswith(p) for p in - ["sql/", - "bin/spark-sql", - "sbin/start-thriftserver.sh", - "examples/src/main/java/org/apache/spark/examples/sql/", - "examples/src/main/scala/org/apache/spark/examples/sql/"])] - mllib_files = [f for f in changed_files - if any(f.startswith(p) for p in - ["examples/src/main/java/org/apache/spark/examples/mllib/", - "examples/src/main/scala/org/apache/spark/examples/mllib", - "data/mllib/", - "mllib/"])] streaming_files = [f for f in changed_files if any(f.startswith(p) for p in ["examples/scala-2.10/", @@ -356,12 +536,6 @@ def identify_changed_modules(test_env): if changed_core_files: changed_modules.add("CORE") - if sql_files: - print "[info] Detected changes in SQL. Will run Hive test suite." 
- changed_modules.add("SQL") - if mllib_files: - print "[info] Detected changes in MLlib. Will run MLlib test suite." - changed_modules.add("MLLIB") if streaming_files: print "[info] Detected changes in Streaming. Will run Streaming test suite." changed_modules.add("STREAMING") @@ -416,8 +590,6 @@ def run_scala_tests_sbt(test_modules, test_profiles): "streaming-twitter/test", "streaming-zeromq/test", "examples/test"] - if "GRAPHX" in test_modules and "CORE" not in test_modules: - sbt_test_goals += ["graphx/test", "examples/test"] if not sbt_test_goals: sbt_test_goals = ["test"] @@ -532,5 +704,13 @@ def main(): run_python_tests() run_sparkr_tests() + +def _test(): + import doctest + (failure_count, test_count) = doctest.testmod() + if failure_count: + exit(-1) + if __name__ == "__main__": + _test() main() From f53864b8ff8b21e685b42166c94cedc4e4bd00c7 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 17 Jun 2015 18:05:27 -0700 Subject: [PATCH 02/14] Finish integrating module changes --- dev/run-tests.py | 179 ++++++++++++++++++++--------------------------- 1 file changed, 76 insertions(+), 103 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index e30085efe7798..53fdabc8708b1 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -17,6 +17,7 @@ # limitations under the License. # +import itertools import os import re import sys @@ -28,6 +29,11 @@ USER_HOME = os.environ.get("HOME") +# ------------------------------------------------------------------------------------------------- +# Test module definitions and functions for traversing module dependency graph +# ------------------------------------------------------------------------------------------------- + + all_modules = [] @@ -57,7 +63,9 @@ def contains_file(self, filename): source_file_regexes=[], sbt_test_goals=[ "test", - ] + ], + should_run_python_tests=True, + should_run_r_tests=True ) @@ -180,15 +188,15 @@ def contains_file(self, filename): ) -def determine_modules(filenames): +def determine_modules_for_files(filenames): """ Given a list of filenames, return the set of modules that contain those files. If a file is not associated with a more specific submodule, then this method will consider that file to belong to the 'root' module. - >>> sorted(x.name for x in determine_modules(["python/pyspark/a.py", "sql/test/foo"])) + >>> sorted(x.name for x in determine_modules_for_files(["python/pyspark/a.py", "sql/test/foo"])) ['pyspark', 'sql'] - >>> [x.name for x in determine_modules(["file_not_matched_by_any_subproject"])] + >>> [x.name for x in determine_modules_for_files(["file_not_matched_by_any_subproject"])] ['root'] """ changed_modules = set() @@ -203,6 +211,33 @@ def determine_modules(filenames): return changed_modules +def identify_changed_modules_from_git_commits(patch_sha, target_branch=None, target_ref=None): + """ + Given a git commit and target ref, use the set of files changed in the diff in order to + determine which modules' tests should be run. 
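+
+    Exactly one of target_branch and target_ref may be supplied: a branch name
+    is first fetched from origin and then diffed against, while a bare ref
+    (for example, a commit SHA) is diffed against directly.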
+ + >>> [x.name for x in \ + identify_changed_modules_from_git_commits("fc0a1475ef", target_ref="5da21f07")] + ['graphx'] + >>> 'root' in [x.name for x in \ + identify_changed_modules_from_git_commits("50a0496a43", target_ref="6765ef9")] + True + """ + if target_branch is None and target_ref is None: + raise AttributeError("must specify either target_branch or target_ref") + elif target_branch is not None and target_ref is not None: + raise AttributeError("must specify either target_branch or target_ref, not both") + if target_branch is not None: + diff_target = target_branch + run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) + else: + diff_target = target_ref + raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target]) + # Remove any empty strings + changed_files = [f for f in raw_output.split('\n') if f] + return determine_modules_for_files(changed_files) + + def determine_modules_to_test(changed_modules): """ Given a set of modules that have changed, compute the transitive closure of those modules' @@ -218,8 +253,16 @@ def determine_modules_to_test(changed_modules): modules_to_test = set() for module in changed_modules: modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) - return modules_to_test.union(set(changed_modules)) + modules_to_test = modules_to_test.union(set(changed_modules)) + if root in modules_to_test: + return [root] + else: + return modules_to_test + +# ------------------------------------------------------------------------------------------------- +# Functions for working with subprocesses and shell tools +# ------------------------------------------------------------------------------------------------- def get_error_codes(err_code_file): """Function to retrieve all block numbers from the `run-tests-codes.sh` @@ -275,7 +318,7 @@ def which(program): """Find and return the given program by its absolute path or 'None' - from: http://stackoverflow.com/a/377028""" - fpath, fname = os.path.split(program) + fpath = os.path.split(program)[0] if fpath: if is_exe(program): @@ -327,6 +370,11 @@ def determine_java_version(java_exe): update=version_info[3]) +# ------------------------------------------------------------------------------------------------- +# Functions for running the other build and test scripts +# ------------------------------------------------------------------------------------------------- + + def set_title_and_block(title, err_block): os.environ["CURRENT_BLOCK"] = ERROR_CODES[err_block] line_str = '=' * 72 @@ -494,65 +542,6 @@ def detect_binary_inop_with_mima(): run_cmd([os.path.join(SPARK_HOME, "dev", "mima")]) -def identify_changed_modules(test_env): - """Given the passed in environment will determine the changed modules and - return them as a set. If the environment is local, will simply run all tests. 
- If run under the `amplab_jenkins` environment will determine the changed files - as compared to the `ghprbTargetBranch` and execute the necessary set of tests - to provide coverage for the changed code.""" - changed_modules = set() - - if test_env == "amplab_jenkins": - target_branch = os.environ["ghprbTargetBranch"] - - run_cmd(['git', 'fetch', 'origin', str(target_branch+':'+target_branch)]) - - raw_output = subprocess.check_output(['git', 'diff', '--name-only', target_branch]) - # remove any empty strings - changed_files = [f for f in raw_output.split('\n') if f] - - streaming_files = [f for f in changed_files - if any(f.startswith(p) for p in - ["examples/scala-2.10/", - "examples/src/main/java/org/apache/spark/examples/streaming/", - "examples/src/main/scala/org/apache/spark/examples/streaming/", - "external/", - "extras/java8-tests/", - "extras/kinesis-asl/", - "streaming/"])] - graphx_files = [f for f in changed_files - if any(f.startswith(p) for p in - ["examples/src/main/scala/org/apache/spark/examples/graphx/", - "graphx/"])] - doc_files = [f for f in changed_files if f.startswith("docs/")] - - # union together all changed top level project files - top_level_project_files = set().union(*[set(f) for f in [sql_files, - mllib_files, - streaming_files, - graphx_files, - doc_files]]) - changed_core_files = set(changed_files).difference(top_level_project_files) - - if changed_core_files: - changed_modules.add("CORE") - if streaming_files: - print "[info] Detected changes in Streaming. Will run Streaming test suite." - changed_modules.add("STREAMING") - if graphx_files: - print "[info] Detected changes in GraphX. Will run GraphX test suite." - changed_modules.add("GRAPHX") - if doc_files: - print "[info] Detected changes in documentation. Will build spark with documentation." 
- changed_modules.add("DOCS") - - return changed_modules - else: - # we aren't in the Amplab environment so simply run all tests - changed_modules.add("ALL") - return changed_modules - - def run_scala_tests_maven(test_profiles): mvn_test_goals = ["test", "--fail-at-end"] profiles_and_goals = test_profiles + mvn_test_goals @@ -564,34 +553,8 @@ def run_scala_tests_maven(test_profiles): def run_scala_tests_sbt(test_modules, test_profiles): - # declare the variable for reference - sbt_test_goals = [] - if "ALL" in test_modules: - sbt_test_goals = ["test"] - else: - # if we only have changes in SQL, MLlib, Streaming, or GraphX then build - # a custom test list - if "SQL" in test_modules and "CORE" not in test_modules: - sbt_test_goals += ["catalyst/test", - "sql/test", - "hive/test", - "hive-thriftserver/test", - "mllib/test", - "examples/test"] - if "MLLIB" in test_modules and "CORE" not in test_modules: - sbt_test_goals += ["mllib/test", "examples/test"] - if "STREAMING" in test_modules and "CORE" not in test_modules: - sbt_test_goals += ["streaming/test", - "streaming-flume/test", - "streaming-flume-sink/test", - "streaming-kafka/test", - "streaming-mqtt/test", - "streaming-twitter/test", - "streaming-zeromq/test", - "examples/test"] - if not sbt_test_goals: - sbt_test_goals = ["test"] + sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules)) profiles_and_goals = test_profiles + sbt_test_goals @@ -608,7 +571,7 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): test_modules = set(test_modules) - hive_profiles = ("SQL" in test_modules) + hive_profiles = (sql in test_modules) test_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=hive_profiles) if build_tool == "maven": @@ -677,16 +640,23 @@ def main(): print "[info] Using build tool", build_tool, "with profile", hadoop_version, print "under environment", test_env - # determine high level changes - changed_modules = identify_changed_modules(test_env) - print "[info] Found the following changed modules:", ", ".join(changed_modules) + changed_modules = [root] + if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): + target_branch = os.environ["ghprbTargetBranch"] + changed_modules = identify_changed_modules_from_git_commits("HEAD", + target_branch=target_branch) + print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules) + + test_modules = determine_modules_to_test(changed_modules) # license checks run_apache_rat_checks() # style checks - run_scala_style_checks() - run_python_style_checks() + if changed_modules != [docs]: + run_scala_style_checks() + if any(m.should_run_python_tests for m in changed_modules): + run_python_style_checks() # determine if docs were changed and if we're inside the amplab environment # note - the below commented out until *all* Jenkins workers can get `jekyll` installed @@ -700,17 +670,20 @@ def main(): detect_binary_inop_with_mima() # run the test suites - run_scala_tests(build_tool, hadoop_version, changed_modules) - run_python_tests() - run_sparkr_tests() + run_scala_tests(build_tool, hadoop_version, test_modules) + + if any(m.should_run_python_tests for m in test_modules): + run_python_tests() + if any(m.should_run_r_tests for m in test_modules): + run_sparkr_tests() def _test(): import doctest - (failure_count, test_count) = doctest.testmod() + failure_count = doctest.testmod()[0] if failure_count: exit(-1) if __name__ == "__main__": _test() - main() + main() \ No newline at end of file From 
37f3fb3f7f11ce25370caa5c8ebf42f60e56227f Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 17 Jun 2015 18:08:08 -0700 Subject: [PATCH 03/14] Remove doc profiles option, since it's not actually needed (see #6865) --- dev/run-tests.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 53fdabc8708b1..56d6a6430a1cb 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -474,14 +474,12 @@ def get_hadoop_profiles(hadoop_version): def get_build_profiles(hadoop_version, enable_base_profiles=True, - enable_hive_profiles=False, - enable_doc_profiles=False): + enable_hive_profiles=False): """Returns a list of hadoop profiles to be used as looked up from the passed in hadoop profile key with the option of adding on the base and hive profiles.""" base_profiles = ["-Pkinesis-asl"] hive_profiles = ["-Phive", "-Phive-thriftserver"] - doc_profiles = [] hadoop_profiles = get_hadoop_profiles(hadoop_version) build_profiles = hadoop_profiles @@ -492,9 +490,6 @@ def get_build_profiles(hadoop_version, if enable_hive_profiles: build_profiles += hive_profiles - if enable_doc_profiles: - build_profiles += doc_profiles - return build_profiles From 33714415a294ddb8044829438e08a2b25d7c8caa Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 17 Jun 2015 18:10:06 -0700 Subject: [PATCH 04/14] Test everything if nothing has changed (needed for non-PRB builds) --- dev/run-tests.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 56d6a6430a1cb..87de4016c65d0 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -635,11 +635,13 @@ def main(): print "[info] Using build tool", build_tool, "with profile", hadoop_version, print "under environment", test_env - changed_modules = [root] + changed_modules = None if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): target_branch = os.environ["ghprbTargetBranch"] changed_modules = identify_changed_modules_from_git_commits("HEAD", target_branch=target_branch) + if not changed_modules: + changed_modules = ['root'] print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules) test_modules = determine_modules_to_test(changed_modules) From 43a0ced49eea09831d9bc333af9e20a75c7a79cf Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 17 Jun 2015 18:11:56 -0700 Subject: [PATCH 05/14] Minor fixes --- dev/run-tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 87de4016c65d0..8af8dd258a657 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -551,7 +551,7 @@ def run_scala_tests_sbt(test_modules, test_profiles): sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules)) - profiles_and_goals = test_profiles + sbt_test_goals + profiles_and_goals = test_profiles + list(sbt_test_goals) print "[info] Running Spark tests using SBT with these arguments:", print " ".join(profiles_and_goals) @@ -641,7 +641,7 @@ def main(): changed_modules = identify_changed_modules_from_git_commits("HEAD", target_branch=target_branch) if not changed_modules: - changed_modules = ['root'] + changed_modules = [root] print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules) test_modules = determine_modules_to_test(changed_modules) From 7092d3e37c21662175e5991ec766c79b121019ac Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Wed, 17 Jun 2015 18:55:41 -0700 Subject: [PATCH 06/14] Skip SBT tests if no test 
goals are specified --- dev/run-tests.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev/run-tests.py b/dev/run-tests.py index 8af8dd258a657..95ecbf8995898 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -551,6 +551,9 @@ def run_scala_tests_sbt(test_modules, test_profiles): sbt_test_goals = set(itertools.chain.from_iterable(m.sbt_test_goals for m in test_modules)) + if not sbt_test_goals: + return + profiles_and_goals = test_profiles + list(sbt_test_goals) print "[info] Running Spark tests using SBT with these arguments:", From dc6f1c62c96d8119ee7a0ec0370aa28473a315ff Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 18 Jun 2015 23:28:17 -0700 Subject: [PATCH 07/14] Use changed files' extensions to decide whether to run style checks --- dev/run-tests.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 95ecbf8995898..8c118f41df30b 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -211,16 +211,16 @@ def determine_modules_for_files(filenames): return changed_modules -def identify_changed_modules_from_git_commits(patch_sha, target_branch=None, target_ref=None): +def identify_changed_files_from_git_commits(patch_sha, target_branch=None, target_ref=None): """ Given a git commit and target ref, use the set of files changed in the diff in order to determine which modules' tests should be run. - >>> [x.name for x in \ - identify_changed_modules_from_git_commits("fc0a1475ef", target_ref="5da21f07")] + >>> [x.name for x in determine_modules_for_files( \ + identify_changed_files_from_git_commits("fc0a1475ef", target_ref="5da21f07"))] ['graphx'] - >>> 'root' in [x.name for x in \ - identify_changed_modules_from_git_commits("50a0496a43", target_ref="6765ef9")] + >>> 'root' in [x.name for x in determine_modules_for_files( \ + identify_changed_files_from_git_commits("50a0496a43", target_ref="6765ef9"))] True """ if target_branch is None and target_ref is None: @@ -234,8 +234,7 @@ def identify_changed_modules_from_git_commits(patch_sha, target_branch=None, tar diff_target = target_ref raw_output = subprocess.check_output(['git', 'diff', '--name-only', patch_sha, diff_target]) # Remove any empty strings - changed_files = [f for f in raw_output.split('\n') if f] - return determine_modules_for_files(changed_files) + return [f for f in raw_output.split('\n') if f] def determine_modules_to_test(changed_modules): @@ -639,10 +638,11 @@ def main(): print "under environment", test_env changed_modules = None + changed_files = None if test_env == "amplab_jenkins" and os.environ.get("AMP_JENKINS_PRB"): target_branch = os.environ["ghprbTargetBranch"] - changed_modules = identify_changed_modules_from_git_commits("HEAD", - target_branch=target_branch) + changed_files = identify_changed_files_from_git_commits("HEAD", target_branch=target_branch) + changed_modules = determine_modules_for_files(changed_files) if not changed_modules: changed_modules = [root] print "[info] Found the following changed modules:", ", ".join(x.name for x in changed_modules) @@ -653,9 +653,9 @@ def main(): run_apache_rat_checks() # style checks - if changed_modules != [docs]: + if not changed_files or any(f.endsWith(".scala") for f in changed_files): run_scala_style_checks() - if any(m.should_run_python_tests for m in changed_modules): + if not changed_files or any(f.endsWith(".py") for f in changed_files): run_python_style_checks() # determine if docs were changed and if we're inside the amplab environment @@ -686,4 +686,4 @@ def 
_test(): if __name__ == "__main__": _test() - main() \ No newline at end of file + main() From 3670d50e8e4f6fa99732721be19c13e9ed3ff6cb Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 18 Jun 2015 23:29:24 -0700 Subject: [PATCH 08/14] mllib should depend on streaming --- dev/run-tests.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 8c118f41df30b..d5fc51114fff1 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -98,22 +98,6 @@ def contains_file(self, filename): ) -mllib = Module( - name="mllib", - dependencies=[sql], - source_file_regexes=[ - "examples/src/main/java/org/apache/spark/examples/mllib/", - "examples/src/main/scala/org/apache/spark/examples/mllib", - "data/mllib/", - "mllib/", - ], - sbt_test_goals=[ - "mllib/test", - "examples/test", - ] -) - - graphx = Module( name="graphx", dependencies=[], @@ -147,6 +131,22 @@ def contains_file(self, filename): ) +mllib = Module( + name="mllib", + dependencies=[streaming, sql], + source_file_regexes=[ + "examples/src/main/java/org/apache/spark/examples/mllib/", + "examples/src/main/scala/org/apache/spark/examples/mllib", + "data/mllib/", + "mllib/", + ], + sbt_test_goals=[ + "mllib/test", + "examples/test", + ] +) + + examples = Module( name="examples", dependencies=[graphx, mllib, streaming, sql], From df10e23ae658caf75a5a265ce6b117611f5e0674 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 18 Jun 2015 23:32:19 -0700 Subject: [PATCH 09/14] update to reflect fact that no module depends on root --- dev/run-tests.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index d5fc51114fff1..dbbef1a616449 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -249,14 +249,15 @@ def determine_modules_to_test(changed_modules): >>> sorted(x.name for x in determine_modules_to_test([sql])) ['examples', 'hive-thriftserver', 'mllib', 'pyspark', 'sparkr', 'sql'] """ + # If we're going to have to run all of the tests, then we can just short-circuit + # and return 'root'. No module depends on root, so if it appears then it will be + # in changed_modules. 
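+    # (Without this short-circuit the traversal below would still terminate,
+    # but it would redundantly return every other changed module alongside
+    # root, whose "test" goal already runs the entire suite.)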
+ if root in changed_modules: + return [root] modules_to_test = set() for module in changed_modules: modules_to_test = modules_to_test.union(determine_modules_to_test(module.dependent_modules)) - modules_to_test = modules_to_test.union(set(changed_modules)) - if root in modules_to_test: - return [root] - else: - return modules_to_test + return modules_to_test.union(set(changed_modules)) # ------------------------------------------------------------------------------------------------- From 35a3052db88550f81b7e78615a7ef252bd9ef929 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 18 Jun 2015 23:47:06 -0700 Subject: [PATCH 10/14] Enable Hive tests when running all tests --- dev/run-tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index dbbef1a616449..c37cd2670eefc 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -569,7 +569,7 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): test_modules = set(test_modules) - hive_profiles = (sql in test_modules) + hive_profiles = (sql in test_modules or root in test_modules) test_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=hive_profiles) if build_tool == "maven": From e46539fb9140d2b0c1d6d616a6b9d4c4f697d061 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Thu, 18 Jun 2015 23:49:54 -0700 Subject: [PATCH 11/14] Fix camel-cased endswith() --- dev/run-tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index c37cd2670eefc..cc8466b1baa3a 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -654,9 +654,9 @@ def main(): run_apache_rat_checks() # style checks - if not changed_files or any(f.endsWith(".scala") for f in changed_files): + if not changed_files or any(f.endswith(".scala") for f in changed_files): run_scala_style_checks() - if not changed_files or any(f.endsWith(".py") for f in changed_files): + if not changed_files or any(f.endswith(".py") for f in changed_files): run_python_style_checks() # determine if docs were changed and if we're inside the amplab environment From a86a953443411aac830b615cbfc06763520a6f2f Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 19 Jun 2015 15:38:54 -0700 Subject: [PATCH 12/14] Clean up modules; add new modules for streaming external projects --- dev/run-tests.py | 106 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 93 insertions(+), 13 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index cc8466b1baa3a..bc97d031ffcc7 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -75,8 +75,6 @@ def contains_file(self, filename): source_file_regexes=[ "sql/(?!hive-thriftserver)", "bin/spark-sql", - "examples/src/main/java/org/apache/spark/examples/sql/", - "examples/src/main/scala/org/apache/spark/examples/sql/", ], sbt_test_goals=[ "catalyst/test", @@ -114,35 +112,108 @@ def contains_file(self, filename): name="streaming", dependencies=[], source_file_regexes=[ - "external/", - "extras/java8-tests/", - "extras/kinesis-asl/", "streaming", ], sbt_test_goals=[ "streaming/test", - "streaming-flume/test", - "streaming-flume-sink/test", - "streaming-kafka/test", - "streaming-mqtt/test", - "streaming-twitter/test", + ] +) + + +streaming_kinesis_asl = Module( + name="kinesis-asl", + dependencies=[streaming], + source_file_regexes=[ + "extras/kinesis-asl/", + ], + sbt_test_goals=[ + "kinesis-asl/test", + ] +) + + +streaming_zeromq = Module( + name="streaming-zeromq", + dependencies=[streaming], + source_file_regexes=[ + 
"external/zeromq", + ], + sbt_test_goals=[ "streaming-zeromq/test", ] ) +streaming_twitter = Module( + name="streaming-twitter", + dependencies=[streaming], + source_file_regexes=[ + "external/twitter", + ], + sbt_test_goals=[ + "streaming-twitter/test", + ] +) + + +streaming_mqqt = Module( + name="streaming-mqqt", + dependencies=[streaming], + source_file_regexes=[ + "external/mqqt", + ], + sbt_test_goals=[ + "streaming-mqqt/test", + ] +) + + +streaming_kafka = Module( + name="streaming-kafka", + dependencies=[streaming], + source_file_regexes=[ + "external/kafka", + "external/kafka-assembly", + ], + sbt_test_goals=[ + "streaming-kafka/test", + ] +) + + +streaming_flume_sink = Module( + name="streaming-flume-sink", + dependencies=[streaming], + source_file_regexes=[ + "external/flume-sink", + ], + sbt_test_goals=[ + "streaming-flume-sink/test", + ] +) + + +streaming_flume = Module( + name="streaming_flume", + dependencies=[streaming], + source_file_regexes=[ + "external/flume", + ], + sbt_test_goals=[ + "streaming-flume/test", + ] +) + + mllib = Module( name="mllib", dependencies=[streaming, sql], source_file_regexes=[ - "examples/src/main/java/org/apache/spark/examples/mllib/", - "examples/src/main/scala/org/apache/spark/examples/mllib", "data/mllib/", "mllib/", ], sbt_test_goals=[ "mllib/test", - "examples/test", ] ) @@ -188,6 +259,15 @@ def contains_file(self, filename): ) +ec2 = Module( + name="ec2", + dependencies=[], + source_file_regexes=[ + "ec2/", + ] +) + + def determine_modules_for_files(filenames): """ Given a list of filenames, return the set of modules that contain those files. From 4224da5164f77cc684d3995db4570cfa23b9c852 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 19 Jun 2015 21:58:54 -0700 Subject: [PATCH 13/14] Add documentation to Module. --- dev/run-tests.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index bc97d031ffcc7..45b1b4aa46d3a 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -38,9 +38,31 @@ class Module(object): + """ + A module is the basic abstraction in our test runner script. Each module consists of a set of + source files, a set of test commands, and a set of dependencies on other modules. We use modules + to define a dependency graph that lets determine which tests to run based on which files have + changed. + """ def __init__(self, name, dependencies, source_file_regexes, sbt_test_goals=(), should_run_python_tests=False, should_run_r_tests=False): + """ + Define a new module. + + :param name: A short module name, for display in logging and error messages. + :param dependencies: A set of dependencies for this module. This should only include direct + dependencies; transitive dependencies are resolved automatically. + :param source_file_regexes: a set of regexes that match source files belonging to this + module. These regexes are applied by attempting to match at the beginning of the + filename strings. + :param sbt_test_goals: A set of SBT test goals for testing this module/ + :param should_run_python_tests: If true, changes in this module will trigger Python tests. + For now, this has the effect of causing _all_ Python tests to be run, although in the + future this should be changed to run only a subset of the Python tests that depend + on this module. + :param should_run_r_tests: If true, changes in this module will trigger all R tests. 
+ """ self.name = name self.dependencies = dependencies self.source_file_prefixes = source_file_regexes @@ -80,7 +102,8 @@ def contains_file(self, filename): "catalyst/test", "sql/test", "hive/test", - ]) + ] +) hive_thriftserver = Module( @@ -534,8 +557,10 @@ def exec_sbt(sbt_args=()): def get_hadoop_profiles(hadoop_version): - """Return a list of profiles indicating which Hadoop version to use from - a Hadoop version tag.""" + """ + For the given Hadoop version tag, return a list of SBT profile flags for + building and testing against that Hadoop version. + """ sbt_maven_hadoop_profiles = { "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.0.4"], From 75de4506e302d8fa6a5f69465c6b1ee1ba8eb596 Mon Sep 17 00:00:00 2001 From: Josh Rosen Date: Fri, 19 Jun 2015 22:29:18 -0700 Subject: [PATCH 14/14] Use module system to determine which build profiles to enable. --- dev/run-tests.py | 84 +++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 44 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index 45b1b4aa46d3a..474889ec2d8aa 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -45,8 +45,8 @@ class Module(object): changed. """ - def __init__(self, name, dependencies, source_file_regexes, sbt_test_goals=(), - should_run_python_tests=False, should_run_r_tests=False): + def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), + sbt_test_goals=(), should_run_python_tests=False, should_run_r_tests=False): """ Define a new module. @@ -56,7 +56,9 @@ def __init__(self, name, dependencies, source_file_regexes, sbt_test_goals=(), :param source_file_regexes: a set of regexes that match source files belonging to this module. These regexes are applied by attempting to match at the beginning of the filename strings. - :param sbt_test_goals: A set of SBT test goals for testing this module/ + :param build_profile_flags: A set of profile flags that should be passed to Maven or SBT in + order to build and test this module (e.g. '-PprofileName'). + :param sbt_test_goals: A set of SBT test goals for testing this module. :param should_run_python_tests: If true, changes in this module will trigger Python tests. 
For now, this has the effect of causing _all_ Python tests to be run, although in the future this should be changed to run only a subset of the Python tests that depend @@ -67,6 +69,7 @@ def __init__(self, name, dependencies, source_file_regexes, sbt_test_goals=(), self.dependencies = dependencies self.source_file_prefixes = source_file_regexes self.sbt_test_goals = sbt_test_goals + self.build_profile_flags = build_profile_flags self.should_run_python_tests = should_run_python_tests self.should_run_r_tests = should_run_r_tests @@ -79,18 +82,6 @@ def contains_file(self, filename): return any(re.match(p, filename) for p in self.source_file_prefixes) -root = Module( - name="root", - dependencies=[], - source_file_regexes=[], - sbt_test_goals=[ - "test", - ], - should_run_python_tests=True, - should_run_r_tests=True -) - - sql = Module( name="sql", dependencies=[], @@ -98,6 +89,9 @@ def contains_file(self, filename): "sql/(?!hive-thriftserver)", "bin/spark-sql", ], + build_profile_flags=[ + "-Phive", + ], sbt_test_goals=[ "catalyst/test", "sql/test", @@ -113,6 +107,9 @@ def contains_file(self, filename): "sql/hive-thriftserver", "sbin/start-thriftserver.sh", ], + build_profile_flags=[ + "-Phive-thriftserver", + ], sbt_test_goals=[ "hive-thriftserver/test", ] @@ -149,6 +146,9 @@ def contains_file(self, filename): source_file_regexes=[ "extras/kinesis-asl/", ], + build_profile_flags=[ + "-Pkinesis-asl", + ], sbt_test_goals=[ "kinesis-asl/test", ] @@ -291,6 +291,23 @@ def contains_file(self, filename): ) +# The root module is a dummy module which is used to run all of the tests. +# No other modules should directly depend on this module. +root = Module( + name="root", + dependencies=[], + source_file_regexes=[], + # In order to run all of the tests, enable every test profile: + build_profile_flags= + list(set(itertools.chain.from_iterable(m.build_profile_flags for m in all_modules))), + sbt_test_goals=[ + "test", + ], + should_run_python_tests=True, + should_run_r_tests=True +) + + def determine_modules_for_files(filenames): """ Given a list of filenames, return the set of modules that contain those files. 
@@ -382,7 +399,7 @@ def get_error_codes(err_code_file): def exit_from_command_with_retcode(cmd, retcode): - print "[error] running", cmd.join(' '), "; received return code", retcode + print "[error] running", ' '.join(cmd), "; received return code", retcode sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) @@ -577,30 +594,9 @@ def get_hadoop_profiles(hadoop_version): sys.exit(int(os.environ.get("CURRENT_BLOCK", 255))) -def get_build_profiles(hadoop_version, - enable_base_profiles=True, - enable_hive_profiles=False): - """Returns a list of hadoop profiles to be used as looked up from the passed in hadoop profile - key with the option of adding on the base and hive profiles.""" - - base_profiles = ["-Pkinesis-asl"] - hive_profiles = ["-Phive", "-Phive-thriftserver"] - hadoop_profiles = get_hadoop_profiles(hadoop_version) - - build_profiles = hadoop_profiles - - if enable_base_profiles: - build_profiles += base_profiles - - if enable_hive_profiles: - build_profiles += hive_profiles - - return build_profiles - - def build_spark_maven(hadoop_version): - # we always build with Hive support even if we skip Hive tests in most builds - build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True) + # Enable all of the profiles for the build: + build_profiles = get_hadoop_profiles(hadoop_version) + root.build_profile_flags mvn_goals = ["clean", "package", "-DskipTests"] profiles_and_goals = build_profiles + mvn_goals @@ -611,7 +607,8 @@ def build_spark_maven(hadoop_version): def build_spark_sbt(hadoop_version): - build_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=True) + # Enable all of the profiles for the build: + build_profiles = get_hadoop_profiles(hadoop_version) + root.build_profile_flags sbt_goals = ["package", "assembly/assembly", "streaming-kafka-assembly/assembly"] @@ -674,9 +671,8 @@ def run_scala_tests(build_tool, hadoop_version, test_modules): test_modules = set(test_modules) - hive_profiles = (sql in test_modules or root in test_modules) - test_profiles = get_build_profiles(hadoop_version, enable_hive_profiles=hive_profiles) - + test_profiles = get_hadoop_profiles(hadoop_version) + \ + list(set(itertools.chain.from_iterable(m.build_profile_flags for m in test_modules))) if build_tool == "maven": run_scala_tests_maven(test_profiles) else: @@ -740,7 +736,7 @@ def main(): hadoop_version = "hadoop2.3" test_env = "local" - print "[info] Using build tool", build_tool, "with profile", hadoop_version, + print "[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version, print "under environment", test_env changed_modules = None