diff --git a/dev/run-tests.py b/dev/run-tests.py
index 0a178e655924d..38859da5486f5 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -24,25 +24,21 @@
 import subprocess
 from collections import namedtuple
 
-SPARK_PROJ_ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
-USER_HOME_DIR = os.environ.get("HOME")
+SPARK_HOME = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
+USER_HOME = os.environ.get("HOME")
 
-SBT_MAVEN_PROFILE_ARGS_ENV = "SBT_MAVEN_PROFILES_ARGS"
-AMPLAB_JENKINS_BUILD_TOOL = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL")
-AMPLAB_JENKINS = os.environ.get("AMPLAB_JENKINS")
-
-SBT_OUTPUT_FILTER = re.compile("^.*[info].*Resolving" + "|" +
-                               "^.*[warn].*Merging" + "|" +
-                               "^.*[info].*Including")
+#SBT_MAVEN_PROFILE_ARGS_ENV = "SBT_MAVEN_PROFILES_ARGS"
+#AMPLAB_JENKINS_BUILD_TOOL = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
+AMPLAB_JENKINS = os.environ.get("AMPLAB_JENKINS")  # still read by run_python_tests()
 
 
 def get_error_codes(err_code_file):
     """Function to retrieve all block numbers from the `run-tests-codes.sh`
-    file to maintain backwards compatibility with the `run-tests-jenkins` 
+    file to maintain backwards compatibility with the `run-tests-jenkins`
     script"""
-    
+
     with open(err_code_file, 'r') as f:
-        err_codes = [e.split()[1].strip().split('=') 
+        err_codes = [e.split()[1].strip().split('=')
                      for e in f if e.startswith("readonly")]
         return dict(err_codes)
 
 
@@ -63,13 +59,6 @@ def rm_r(path):
         os.remove(path)
 
 
-def lineno():
-    """Returns the current line number in our program
-    - from: http://stackoverflow.com/a/3056059"""
-
-    return inspect.currentframe().f_back.f_lineno
-
-
 def run_cmd(cmd):
     """Given a command as a list of arguments will attempt to execute the
     command and, on failure, print an error message"""
@@ -82,32 +71,6 @@
         exit_from_command_with_retcode(e.cmd, e.returncode)
 
 
-def set_sbt_maven_profile_args():
-    """Properly sets the SBT environment variable arguments with additional
-    checks to determine if this is running on an Amplab Jenkins machine"""
-
-    # base environment values for SBT_MAVEN_PROFILE_ARGS_ENV which will be appended on
-    sbt_maven_profile_args_base = ["-Pkinesis-asl"]
-
-    sbt_maven_profile_arg_dict = {
-        "hadoop1.0" : ["-Phadoop-1", "-Dhadoop.version=1.0.4"],
-        "hadoop2.0" : ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"],
-        "hadoop2.2" : ["-Pyarn", "-Phadoop-2.2"],
-        "hadoop2.3" : ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"],
-    }
-
-    # set the SBT maven build profile argument environment variable and ensure
-    # we build against the right version of Hadoop
-    if os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE"):
-        os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \
-            " ".join(sbt_maven_profile_arg_dict.get(ajbp, [])
-                     + sbt_maven_profile_args_base)
-    else:
-        os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \
-            " ".join(sbt_maven_profile_arg_dict.get("hadoop2.3", [])
-                     + sbt_maven_profile_args_base)
-
-
 def is_exe(path):
     """Check if a given path is an executable file
     - from: http://stackoverflow.com/a/377028"""
@@ -134,7 +97,12 @@
 
 
 def determine_java_executable():
-    """Will return the *best* path possible for a 'java' executable or `None`"""
+    """Will return the path of the java executable that will be used by Spark's
+    tests or `None`"""
+
+    # Any changes in the way that Spark's build detects java must be reflected
+    # here. Currently the build looks for $JAVA_HOME/bin/java then falls back to
+    # the `java` executable on the path
 
     java_home = os.environ.get("JAVA_HOME")
 
@@ -144,21 +112,21 @@ def determine_java_executable():
     return java_exe if java_exe else which("java")
 
 
+JavaVersion = namedtuple('JavaVersion', ['major', 'minor', 'patch', 'update'])
+
+
 def determine_java_version(java_exe):
     """Given a valid java executable will return its version in
     named tuple format with accessors '.major', '.minor', '.patch', '.update'"""
 
-    raw_output = subprocess.check_output([java_exe, "-version"], 
+    raw_output = subprocess.check_output([java_exe, "-version"],
                                          stderr=subprocess.STDOUT)
-    raw_version_str = raw_output.split('\n')[0]  # eg 'java version "1.8.0_25"' 
-    version_str = raw_version_str.split()[-1].strip('"')  # eg '1.8.0_25' 
-    version, update = version_str.split('_')  # eg ['1.8.0', '25'] 
-
-    JavaVersion = namedtuple('JavaVersion',
-                             ['major', 'minor', 'patch', 'update'])
+    raw_version_str = raw_output.split('\n')[0]  # eg 'java version "1.8.0_25"'
+    version_str = raw_version_str.split()[-1].strip('"')  # eg '1.8.0_25'
+    version, update = version_str.split('_')  # eg ['1.8.0', '25']
 
     # map over the values and convert them to integers
-    version_info = map(lambda x: int(x), version.split('.') + [update])
+    version_info = [int(x) for x in version.split('.') + [update]]
 
     return JavaVersion(major=version_info[0],
                        minor=version_info[1],
@@ -166,56 +134,6 @@
                        update=version_info[3])
 
 
-def multi_starts_with(orig_str, *prefixes):
-    """Takes a string and an abritrary number of prefixes then checks the
-    original string for any of the possible prefixes passed in"""
-
-    for s in prefixes:
-        if orig_str.startswith(s):
-            return True
-    return False
-
-
-def determine_test_suite():
-    """This function current acts to determine if SQL tests need to be run in
-    addition to the core test suite *or* if _only_ SQL tests need to be run
-    as the git logs show that to be the only thing touched. In the future
-    this function will act more generically to help further segregate the
-    test suite runner (hence the function name).
-    @return a set of unique test names"""
-    test_suite = list()
-
-    if AMPLAB_JENKINS:
-        run_cmd(['git', 'fetch', 'origin', 'master:master'])
-
-        raw_output = subprocess.check_output(['git', 'diff', '--name-only', 'master'])
-        # remove any empty strings
-        changed_files = [f for f in raw_output.split('\n') if f]
-
-        # find any sql files
-        sql_files = [f for f in changed_files
-                     if multi_starts_with(f,
-                                          "sql/",
-                                          "bin/spark-sql",
-                                          "sbin/start-thriftserver.sh")]
-
-        non_sql_files = set(changed_files).difference(set(sql_files))
-
-        if non_sql_files:
-            test_suite.append("CORE")
-        if sql_files:
-            print "[info] Detected changes in SQL. Will run Hive test suite."
-            test_suite.append("SQL")
-            if not non_sql_files:
-                print "[info] Detected no changes except in SQL. Will only run SQL tests."
-        return set(test_suite)
-    else:
-        # we aren't in the Amplab environment so merely run all tests
-        test_suite.append("CORE")
-        test_suite.append("SQL")
-        return set(test_suite)
-
-
 def set_title_and_block(title, err_block):
     os.environ["CURRENT_BLOCK"] = error_codes[err_block]
     line_str = '=' * 72
@@ -254,6 +172,10 @@ def exec_sbt(sbt_args=[]):
     sbt_cmd = ["./build/sbt"] + sbt_args
 
+    sbt_output_filter = re.compile("^.*[info].*Resolving" + "|" +
+                                   "^.*[warn].*Merging" + "|" +
+                                   "^.*[info].*Including")
+
     # NOTE: echo "q" is needed because sbt on encountering a build file
     # with failure (either resolution or compilation) prompts the user for
     # input either q, r, etc to quit or retry. This echo is there to make it
@@ -264,42 +186,90 @@
                                 stdout=subprocess.PIPE)
     echo_proc.wait()
     for line in iter(sbt_proc.stdout.readline, ''):
-        if not SBT_OUTPUT_FILTER.match(line):
-            print line,
+        if not sbt_output_filter.match(line):
+            print line,
     retcode = sbt_proc.wait()
 
     if retcode > 0:
         exit_from_command_with_retcode(sbt_cmd, retcode)
 
 
-def build_apache_spark():
-    """Will first build Spark with Hive v0.12.0 to ensure the build is
-    successful and, after, will build Spark again against Hive v0.13.1 as the
-    tests are based off that"""
+def get_hadoop_profiles(hadoop_version):
+    """Return a list of profiles indicating which Hadoop version to use from a Hadoop version tag."""
+
+    #amplab_jenkins_build_profile = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE")
+
+    sbt_maven_hadoop_profiles = {
+        "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.0.4"],
+        "hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"],
+        "hadoop2.2": ["-Pyarn", "-Phadoop-2.2"],
+        "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"],
+    }
+
+    try:
+        hadoop_profiles = sbt_maven_hadoop_profiles[hadoop_version]
+    except KeyError:
+        print "[error] Could not find", hadoop_version, "in the list. Valid options",
+        print "are 'hadoop1.0', 'hadoop2.0', 'hadoop2.2', and 'hadoop2.3'."
+        sys.exit(int(os.environ.get("CURRENT_BLOCK", 255)))
+
+    return hadoop_profiles
+
+
+def get_build_profiles(hadoop_version="hadoop2.3",
+                       base_profiles=True,
+                       hive_profiles=False):
+    """Returns a list of build profiles looked up from the passed in Hadoop profile key,
+    with the option of adding on the base and Hive profiles."""
+
+    # note: the flag lists are named so they do not shadow the boolean parameters above
+    base_profile_flags = ["-Pkinesis-asl"]
+    hive_profile_flags = ["-Phive", "-Phive-thriftserver"]
+    build_profiles = get_hadoop_profiles(hadoop_version)
+
+    # first, check and add the base profiles
+    if base_profiles:
+        build_profiles = build_profiles + base_profile_flags
+    # second, check and add the hive profiles
+    if hive_profiles:
+        build_profiles = build_profiles + hive_profile_flags
+
+    return build_profiles
+
+
+def build_spark_maven(hadoop_version):
+    build_profiles = get_build_profiles(hadoop_version, hive_profiles=True)
+    mvn_goals = ["clean", "package", "-DskipTests"]
+    profiles_and_goals = build_profiles + mvn_goals
+
+    print "[info] Building Spark (w/Hive 0.13.1) with these arguments:",
+    print " ".join(profiles_and_goals)
+
+    exec_maven(profiles_and_goals)
+
+
+def build_spark_sbt(hadoop_version):
+    build_profiles = get_build_profiles(hadoop_version, hive_profiles=True)
+    sbt_goals = ["package",
+                 "assembly/assembly",
+                 "streaming-kafka-assembly/assembly"]
+    profiles_and_goals = build_profiles + sbt_goals
+
+    print "[info] Building Spark (w/Hive 0.13.1) with these arguments:",
+    print " ".join(profiles_and_goals)
+
+    exec_sbt(profiles_and_goals)
+
+
+def build_apache_spark(build_tool, hadoop_version):
+    """Will build Spark against Hive v0.13.1 given the passed in build tool (either `sbt` or
+    `maven`). Defaults to using `sbt`."""
 
     set_title_and_block("Building Spark", "BLOCK_BUILD")
 
-    sbt_maven_profile_args = os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split()
-    hive_profile_args = sbt_maven_profile_args + ["-Phive",
-                                                  "-Phive-thriftserver"]
-    # set the default maven args
-    base_mvn_args = ["clean", "package", "-DskipTests"]
-    # set the necessary sbt goals
-    sbt_hive_goals = ["package",
-                      "assembly/assembly",
-                      "streaming-kafka-assembly/assembly"]
-
-    # Then build with default Hive version (0.13.1) because tests are based on
-    # this version
-    print "[info] Compile with Hive 0.13.1"
     rm_r("lib_managed")
-    print "[info] Building Spark with these arguments:",
-    print " ".join(hive_profile_args)
 
-    if AMPLAB_JENKINS_BUILD_TOOL == "maven":
-        exec_maven(hive_profile_args + base_mvn_args)
+    if build_tool == "maven":
+        build_spark_maven(hadoop_version)
     else:
-        exec_sbt(hive_profile_args + sbt_hive_goals)
+        build_spark_sbt(hadoop_version)
 
 
 def detect_binary_inop_with_mima():
@@ -308,49 +278,98 @@
     run_cmd(["./dev/mima"])
 
 
-def run_scala_tests(test_suite=[]):
-    """Function to properly execute all tests pass in, as a list, from the
-    `determine_test_suite` function"""
-    set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS")
+def determine_test_modules(test_env):
+    """This function currently acts to determine if SQL tests need to be run in
+    addition to the core test suite *or* if _only_ SQL tests need to be run
+    as the git logs show that to be the only thing touched. In the future
+    this function will act more generically to help further segregate the
+    test suite runner (hence the function name).
+    @return a set of unique test names"""
+    test_suite = list()
 
-    # ensure the test_suite is a set
-    if not isinstance(test_suite, set):
-        test_suite = set(test_suite)
+    if test_env == "amplab_jenkins":
+        target_branch = os.environ.get("ghprbTargetBranch")
+        run_cmd(['git', 'fetch', 'origin', target_branch+":"+target_branch])
 
-    # if the Spark SQL tests are enabled, run the tests with the Hive profiles
-    # enabled.
-    if "SQL" in test_suite:
-        sbt_maven_profile_args = \
-            os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split()
-        os.environ[SBT_MAVEN_PROFILE_ARGS_ENV] = \
-            " ".join(sbt_maven_profile_args + ["-Phive", "-Phive-thriftserver"])
-
-    # if we only have changes in SQL build a custom test string
-    if "SQL" in test_suite and "CORE" not in test_suite:
-        sbt_maven_test_args = ["catalyst/test",
-                               "sql/test",
-                               "hive/test",
-                               "hive-thriftserver/test",
-                               "mllib/test"]
+        raw_output = subprocess.check_output(['git', 'diff', '--name-only', target_branch])
+        # remove any empty strings
+        changed_files = [f for f in raw_output.split('\n') if f]
+
+        # find any sql files
+        sql_files = [f for f in changed_files
+                     if any(f.startswith(p) for p in ["sql/",
+                                                      "bin/spark-sql",
+                                                      "sbin/start-thriftserver.sh"])]
+
+        non_sql_files = set(changed_files).difference(set(sql_files))
+
+        if non_sql_files:
+            test_suite.append("CORE")
+        if sql_files:
+            print "[info] Detected changes in SQL. Will run Hive test suite."
+            test_suite.append("SQL")
+            if not non_sql_files:
+                print "[info] Detected no changes except in SQL. Will only run SQL tests."
+        return set(test_suite)
     else:
-        sbt_maven_test_args = ["test"]
+        # we aren't in the Amplab environment so simply run all tests
+        test_suite.append("CORE")
+        test_suite.append("SQL")
+        return set(test_suite)
+
+
+def run_scala_tests_maven(test_profiles):
+    mvn_test_goals = ["test", "--fail-at-end"]
+    profiles_and_goals = test_profiles + mvn_test_goals
+
+    print "[info] Running Spark tests with these arguments:",
+    print " ".join(profiles_and_goals)
+
+    exec_maven(profiles_and_goals)
 
-    # get the latest sbt maven profile arguments
-    sbt_maven_profile_args = os.environ.get(SBT_MAVEN_PROFILE_ARGS_ENV).split()
+
+def run_scala_tests_sbt(test_modules, test_profiles):
+    # if we only have changes in SQL build a custom test list
+    if "SQL" in test_modules and "CORE" not in test_modules:
+        sbt_test_goals = ["catalyst/test",
+                          "sql/test",
+                          "hive/test",
+                          "hive-thriftserver/test",
+                          "mllib/test"]
+    else:
+        sbt_test_goals = ["test"]
+
+    profiles_and_goals = test_profiles + sbt_test_goals
 
     print "[info] Running Spark tests with these arguments:",
-    print " ".join(sbt_maven_profile_args),
-    print " ".join(sbt_maven_test_args)
+    print " ".join(profiles_and_goals)
+
+    exec_sbt(profiles_and_goals)
+
+
+def run_scala_tests(build_tool, hadoop_version, test_modules):
+    """Function to properly execute all tests passed in as a set from the
+    `determine_test_modules` function"""
+    set_title_and_block("Running Spark unit tests", "BLOCK_SPARK_UNIT_TESTS")
+
+    test_modules = set(test_modules)
 
-    if AMPLAB_JENKINS_BUILD_TOOL == "maven":
-        exec_maven(["test"] + sbt_maven_profile_args + ["--fail-at-end"])
+    # if the Spark SQL tests are enabled, run the tests with the Hive profiles
+    # enabled.
+    if "SQL" in test_modules:
+        test_profiles = get_build_profiles(hadoop_version, hive_profiles=True)
+    else:
+        test_profiles = get_build_profiles(hadoop_version)
+
+    if build_tool == "maven":
+        run_scala_tests_maven(test_profiles)
     else:
-        exec_sbt(sbt_maven_profile_args + sbt_maven_test_args)
+        run_scala_tests_sbt(test_modules, test_profiles)
 
 
-def run_python_tests(test_suite=[]):
+def run_python_tests():
     set_title_and_block("Running PySpark tests", "BLOCK_PYSPARK_UNIT_TESTS")
-    
+
     # Add path for Python3 in Jenkins if we're calling from a Jenkins machine
     if AMPLAB_JENKINS:
         os.environ["PATH"] = os.environ.get("PATH")+":/home/anaconda/envs/py3k/bin"
@@ -358,7 +377,7 @@ def run_python_tests(test_suite=[]):
     run_cmd(["./python/run-tests"])
 
 
-def run_sparkr_tests(test_suite=[]):
+def run_sparkr_tests():
     set_title_and_block("Running SparkR tests", "BLOCK_SPARKR_UNIT_TESTS")
 
     if which("R"):
@@ -367,29 +386,28 @@ def run_sparkr_tests(test_suite=[]):
     else:
         print "Ignoring SparkR tests as R was not found in PATH"
 
-if __name__ == "__main__":
+
+def main():
+    global error_codes  # set_title_and_block() reads the error codes at module scope
     # Ensure the user home directory (HOME) is valid and is an absolute directory
-    if not USER_HOME_DIR or not os.path.isabs(USER_HOME_DIR):
+    if not USER_HOME or not os.path.isabs(USER_HOME):
         print "[error] Cannot determine your home directory as an absolute path;",
         print "ensure the $HOME environment variable is set properly."
         sys.exit(1)
 
-    os.chdir(SPARK_PROJ_ROOT)
+    os.chdir(SPARK_HOME)
 
-    rm_r("./work")
-    rm_r(os.path.join(USER_HOME_DIR, ".ivy2/local/org.apache.spark"))
-    rm_r(os.path.join(USER_HOME_DIR, ".ivy2/cache/org.apache.spark"))
+    rm_r(os.path.join(SPARK_HOME, "work"))
+    rm_r(os.path.join(USER_HOME, ".ivy2/local/org.apache.spark"))
+    rm_r(os.path.join(USER_HOME, ".ivy2/cache/org.apache.spark"))
 
     error_codes = get_error_codes("./dev/run-tests-codes.sh")
 
     os.environ["CURRENT_BLOCK"] = error_codes["BLOCK_GENERAL"]
 
-    set_sbt_maven_profile_args()
-
     java_exe = determine_java_executable()
 
     if not java_exe:
-        print "[error] Cannot find a version of `java` on the system; please", 
+        print "[error] Cannot find a version of `java` on the system; please",
         print "install one and retry."
         sys.exit(2)
@@ -398,20 +416,36 @@ def run_sparkr_tests(test_suite=[]):
     if java_version.minor < 8:
         print "[warn] Java 8 tests will not run because JDK version is < 1.8."
 
-    test_suite = determine_test_suite()
+    if os.environ.get("AMPLAB_JENKINS"):
+        # if we're on the Amplab Jenkins build servers, set up variables
+        # to reflect the environment settings
+        build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
+        hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.3")
+        test_env = "amplab_jenkins"
+    else:
+        # else we're running locally and can use local settings
+        build_tool = "sbt"
+        hadoop_version = "hadoop2.3"
+        test_env = "local"
 
+    # license checks
     run_apache_rat_checks()
-
-    run_scala_style_checks()
+    # style checks
+    run_scala_style_checks()
    run_python_style_checks()
-    build_apache_spark()
+    # spark build
+    build_apache_spark(build_tool, hadoop_version)
 
+    # backwards compatibility checks
     detect_binary_inop_with_mima()
-    run_scala_tests(test_suite)
-
+    # test suites
+    test_modules = determine_test_modules(test_env)
+    run_scala_tests(build_tool, hadoop_version, test_modules)
     run_python_tests()
-
    run_sparkr_tests()
+
+
+if __name__ == "__main__":
+    main()
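
As a quick sanity check of the profile composition introduced above, here is a minimal, self-contained sketch (not part of the patch) of how the Hadoop, base, and Hive profiles combine into build arguments. The `HADOOP_PROFILES` table mirrors `sbt_maven_hadoop_profiles` from the patch, while `compose_profiles` is a hypothetical stand-in for `get_build_profiles`; the flag ordering shown is an assumption for illustration only.

```python
# Standalone sketch of the profile composition used by the patched run-tests.py.
# HADOOP_PROFILES mirrors sbt_maven_hadoop_profiles in the patch; compose_profiles()
# is an illustrative stand-in for get_build_profiles(), not the patch's exact code.

HADOOP_PROFILES = {
    "hadoop1.0": ["-Phadoop-1", "-Dhadoop.version=1.0.4"],
    "hadoop2.0": ["-Phadoop-1", "-Dhadoop.version=2.0.0-mr1-cdh4.1.1"],
    "hadoop2.2": ["-Pyarn", "-Phadoop-2.2"],
    "hadoop2.3": ["-Pyarn", "-Phadoop-2.3", "-Dhadoop.version=2.3.0"],
}


def compose_profiles(hadoop_version="hadoop2.3", with_hive=False):
    # Start from the Hadoop profiles, then layer the base (and optionally Hive) profiles on top.
    profiles = list(HADOOP_PROFILES[hadoop_version])
    profiles += ["-Pkinesis-asl"]
    if with_hive:
        profiles += ["-Phive", "-Phive-thriftserver"]
    return profiles


# e.g. the sbt invocation assembled by build_spark_sbt() would then look roughly like:
#   ./build/sbt -Pyarn -Phadoop-2.3 -Dhadoop.version=2.3.0 -Pkinesis-asl \
#       -Phive -Phive-thriftserver package assembly/assembly streaming-kafka-assembly/assembly
print(" ".join(compose_profiles("hadoop2.3", with_hive=True)))
```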