From ddb633443fd5700a035f9e9da39b3d7540b87ade Mon Sep 17 00:00:00 2001 From: Oliver Sanders Date: Wed, 7 Nov 2018 17:29:23 +0000 Subject: [PATCH] auto stop-restart: handle jobs running on localhost --- doc/src/cylc-user-guide/cug.tex | 30 +++-- lib/cylc/remote.py | 5 +- lib/cylc/scheduler.py | 80 +++++++++----- tests/lib/bash/test_header | 15 +++ tests/restart/41-auto-restart-local-jobs.t | 123 +++++++++++++++++++++ 5 files changed, 215 insertions(+), 38 deletions(-) create mode 100644 tests/restart/41-auto-restart-local-jobs.t diff --git a/doc/src/cylc-user-guide/cug.tex b/doc/src/cylc-user-guide/cug.tex index 10f91468bfb..ca5d9bc0e89 100644 --- a/doc/src/cylc-user-guide/cug.tex +++ b/doc/src/cylc-user-guide/cug.tex @@ -8236,13 +8236,27 @@ \subsection{Auto Stop-Restart} \item \lstinline=[run hosts][suite servers]run hosts= \end{myitemize} -When a host is added to the \lstinline=condemned hosts= list in, -any suites will automatically shutdown then restart selecting a new host from -\lstinline=run hosts=. If a host is suffixed with an exclamation -mark then Cylc will not attempt to automatically restart the suite. +The auto stop-restart feature has two modes: + +\begin{description} + \item[Normal Mode] \hfill + + When a host is added to the \lstinline=condemned hosts= list, any suites + running on that host will automatically shutdown then restart selecting a + new host from \lstinline=run hosts=. + + For safety, before attempting to stop the suite cylc will first wait for any + jobs running locally (under background or at) to complete. + + \item[Force Mode] \hfill + + If a host is suffixed with an exclamation mark then Cylc will not attempt + to automatically restart the suite and any local jobs (running under + background or at) will be left running. +\end{description} For example in the following configuration any suites running on -lstinline=foo= will attempt to restart on \lstinline=pub= whereas any suites +\lstinline=foo= will attempt to restart on \lstinline=pub= whereas any suites running on \lstinline=bar= will stop immediately, making no attempt to restart. \begin{lstlisting} @@ -8256,10 +8270,10 @@ \subsection{Auto Stop-Restart} Suites will wait for a random period of time between zero and \lstinline=auto restart delay= seconds before attempting to stop and restart. -This auto shutdown-restart functionality can only operate provided that the -user hasn't specified any behaviour which is not preserved by +At present the auto shutdown-restart functionality can only operate provided +that the user hasn't specified any behaviour which is not preserved by \lstinline=cylc restart= (e.g.\ user specified hold point or run mode). This -caveat will be removed in a future version, at present Cylc will not attempt to +caveat will be removed in a future version, currently Cylc will not attempt to auto shutdown-restart suites which meet this criterion but will log a critical error message to alert the user. diff --git a/lib/cylc/remote.py b/lib/cylc/remote.py index fedf8743ca6..e318dfec3fe 100644 --- a/lib/cylc/remote.py +++ b/lib/cylc/remote.py @@ -168,8 +168,9 @@ def construct_ssh_cmd(raw_cmd, user=None, host=None, forward_x11=False, user_at_host += 'localhost' command.append(user_at_host) - # Pass cylc version (and optionally UTC mode) through. - command += ['env', quote(r'CYLC_VERSION=%s' % CYLC_VERSION)] + # Pass CYLC_VERSION, CYLC_CONF_PATH and optionally CYLC_UTC through. 
+ command += ['env', quote(r'CYLC_VERSION=%s' % CYLC_VERSION), + quote('CYLC_CONF_PATH=%s' % os.environ.get('CYLC_CONF_PATH'))] if set_UTC and os.getenv('CYLC_UTC') in ["True", "true"]: command.append(quote(r'CYLC_UTC=True')) command.append(quote(r'TZ=UTC')) diff --git a/lib/cylc/scheduler.py b/lib/cylc/scheduler.py index 4cdec9b0f7e..c3136810d4d 100644 --- a/lib/cylc/scheduler.py +++ b/lib/cylc/scheduler.py @@ -103,6 +103,9 @@ class Scheduler(object): START_MESSAGE_TMPL = ( START_MESSAGE_PREFIX + 'server=%(host)s:%(port)s pid=%(pid)s') + AUTO_STOP_RESTART_NORMAL = 'stop and restart' + AUTO_STOP_RESTART_FORCE = 'stop' + # Dependency negotiation etc. will run after these commands PROC_CMDS = ( 'release_suite', @@ -217,7 +220,7 @@ def __init__(self, is_restart, options, args): # health check settings self.time_next_health_check = None - self.auto_restart = False + self.auto_restart_mode = None self.auto_restart_time = None def start(self): @@ -247,7 +250,7 @@ def start(self): except SchedulerStop as exc: # deliberate stop self.shutdown(exc) - if self.auto_restart: + if self.auto_restart_mode == self.AUTO_STOP_RESTART_NORMAL: self.suite_auto_restart() except SchedulerError as exc: @@ -1216,7 +1219,7 @@ def process_task_pool(self): self._get_events_conf('reset inactivity timer')): self.set_suite_inactivity_timer() self.pool.match_dependencies() - if self.stop_mode is None: + if self.stop_mode is None and self.auto_restart_time is None: itasks = self.pool.get_ready_tasks() if itasks: cylc.flags.iflag = True @@ -1339,11 +1342,32 @@ def suite_shutdown(self): self.command_kill_tasks() self.time_next_kill = time() + self.INTERVAL_STOP_KILL - # Is the suite set to auto restart now? - if (self.auto_restart_time is not None and - time() > self.auto_restart_time): - self.auto_restart = True - self._set_stop(TaskPool.STOP_REQUEST_NOW_NOW) + # Is the suite set to auto stop [+restart] now ... + if (self.auto_restart_time is None or time() < self.auto_restart_time): + # ... no + pass + elif self.auto_restart_mode == self.AUTO_STOP_RESTART_NORMAL: + # ... yes - wait for local jobs to complete before restarting + # * Avoid polling issues see #2843 + # * Ensure the host can be safely taken down once suites + for itask in self.pool.get_tasks(): + if ( + itask.task_host == 'localhost' and + itask.summary['batch_sys_name'] in ['background', 'at'] and + itask.state.status in TASK_STATUSES_ACTIVE + ): + LOG.info('Waiting for jobs running on localhost to ' + 'complete before attempting restart') + break + else: + self._set_stop(TaskPool.STOP_REQUEST_NOW_NOW) + elif self.auto_restart_mode == self.AUTO_STOP_RESTART_FORCE: + # ... yes - leave local jobs running then stop the suite + # (no restart) + self._set_stop(TaskPool.STOP_REQUEST_NOW) + else: + raise SchedulerError('Invalid auto_restart_mode=%s' % + self.auto_restart_mode) def suite_auto_restart(self, max_retries=3): """Attempt to restart the suite assuming it has already stopped.""" @@ -1373,7 +1397,8 @@ def suite_auto_restart(self, max_retries=3): 'manual restart required.' % max_retries) return False - def set_auto_restart(self, restart_delay=None): + def set_auto_restart(self, restart_delay=None, + mode=AUTO_STOP_RESTART_NORMAL): """Configure the suite to automatically stop and restart. Restart handled by `suite_auto_restart`. @@ -1383,6 +1408,7 @@ def set_auto_restart(self, restart_delay=None): Suite will wait a random period between 0 and `restart_delay` seconds before attempting to stop/restart in order to avoid multiple suites restarting simultaneously. 
+ mode (str): Auto stop-restart mode. Return: bool: False if it is not possible to automatically stop/restart @@ -1392,17 +1418,13 @@ def set_auto_restart(self, restart_delay=None): if self.stop_mode or self.auto_restart_time is not None: return True - # Check the suite is auto-restartable see #2799. - if ( - not self.can_auto_stop or - self.options.final_point_string or - self.pool_hold_point or - self.run_mode != 'live' or - self.stop_clock_time or - self.stop_point or - self.stop_task - ): - LOG.critical('Suite cannot automatically restart.') + # Force mode, stop the suite now, don't restart it. + if mode == self.AUTO_STOP_RESTART_FORCE: + self.auto_restart_time = time() + self.auto_restart_mode = mode + return True + + if not self.can_auto_restart(): return False LOG.info('Suite will automatically restart on a new host.') @@ -1417,10 +1439,14 @@ def set_auto_restart(self, restart_delay=None): self.auto_restart_time = shutdown_time else: self.auto_restart_time = time() + + self.auto_restart_mode = self.AUTO_STOP_RESTART_NORMAL + return True def can_auto_restart(self): """Determine whether this suite can safely auto stop-restart.""" + # Check the suite is auto-restartable see #2799. ret = ['Incompatible configuration: "%s"' % key for key, value in [ ('can_auto_stop', not self.can_auto_stop), ('final_point', self.options.final_point_string), @@ -1432,6 +1458,7 @@ def can_auto_restart(self): ('stop_task', self.stop_task) ] if value] + # Check whether there is currently an available host to restart on. try: HostAppointer(cached=False).appoint_host() except EmptyHostList: @@ -1458,7 +1485,6 @@ def suite_health_check(self, has_changes): 4. Suite contact file has the right info? """ - # 1. check if suite is stalled - if so call handler if defined if self.stop_mode is None and not has_changes: self.check_suite_stalled() @@ -1472,17 +1498,17 @@ def suite_health_check(self, has_changes): current_glbl_cfg = glbl_cfg(cached=False) for host in current_glbl_cfg.get(['suite servers', 'condemned hosts']): - force_kill = False + mode = self.AUTO_STOP_RESTART_NORMAL if host.endswith('!'): # host ends in an `!` -> force shutdown mode - force_kill = True + mode = self.AUTO_STOP_RESTART_FORCE host = host[:-1] if get_fqdn_by_host(host) == self.host: # this host is condemned, take the appropriate action LOG.info('The Cylc suite host will soon become ' 'un-available.') - if force_kill: + if mode == self.AUTO_STOP_RESTART_FORCE: # server is condemned in "force" mode -> stop # the suite, don't attempt to restart LOG.critical( @@ -1491,10 +1517,9 @@ def suite_health_check(self, has_changes): 'When another suite host becomes available ' 'the suite can be restarted by:\n' ' $ cylc restart %s' % self.suite) - self._set_stop(TaskPool.STOP_REQUEST_NOW) - return # skip remaining health checks + if self.set_auto_restart(mode=mode): + return # skip remaining health checks elif ( - self.can_auto_restart() and self.set_auto_restart(current_glbl_cfg.get( ['suite servers', 'auto restart delay'])) ): @@ -1542,7 +1567,6 @@ def update_profiler_logs(self, tinit): def run(self): """Main loop.""" - self.initialise_scheduler() while True: # MAIN LOOP tinit = time() diff --git a/tests/lib/bash/test_header b/tests/lib/bash/test_header index bbc3c8e483f..36d5762ef89 100644 --- a/tests/lib/bash/test_header +++ b/tests/lib/bash/test_header @@ -56,6 +56,8 @@ # (stdin if FILE_CONTROL is "-" or missing). # grep_ok PATTERN FILE # Run "grep -q -e PATTERN FILE". +# grep_fail PATTERN FILE +# Run "grep -v -q -e PATTERN FILE". 
# count_ok PATTERN FILE COUNT # Test that PATTERN occurs in exactly COUNT lines of FILE. # exists_ok FILE @@ -337,6 +339,19 @@ grep_ok() { fail "${TEST_NAME}" } +grep_fail() { + local BRE="$1" + local FILE="$2" + local TEST_NAME="$(basename "${FILE}")-grep-ok" + if grep -v -q -e "${BRE}" "${FILE}"; then + ok "${TEST_NAME}" + return + fi + mkdir -p "${TEST_LOG_DIR}" + echo "Found ${BRE} in ${FILE}" >"${TEST_LOG_DIR}/${TEST_NAME}.stderr" + fail "${TEST_NAME}" +} + exists_ok() { local FILE=$1 local TEST_NAME="$(basename "${FILE}")-file-exists-ok" diff --git a/tests/restart/41-auto-restart-local-jobs.t b/tests/restart/41-auto-restart-local-jobs.t new file mode 100644 index 00000000000..64431981a18 --- /dev/null +++ b/tests/restart/41-auto-restart-local-jobs.t @@ -0,0 +1,123 @@ +#!/bin/bash +# THIS FILE IS PART OF THE CYLC SUITE ENGINE. +# Copyright (C) 2008-2018 NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +#------------------------------------------------------------------------------- +. "$(dirname "$0")/test_header" +export CYLC_TEST_HOST2=$( \ + cylc get-global-config -i '[test battery]remote host with shared fs' \ + 2>'/dev/null') +if [[ -z "${CYLC_TEST_HOST2}" ]]; then + skip_all '"[test battery]remote host with shared fs": not defined' +fi +export CYLC_TEST_HOST1="$(hostname)" +if ${CYLC_TEST_DEBUG:-false}; then ERR=2; else ERR=1; fi +set_test_number 17 + +BASE_GLOBALRC=" +[cylc] + health check interval = PT5S + [[events]] + abort on inactivity = True + abort on timeout = True + inactivity = PT2M + timeout = PT2M +" + +TEST_DIR="$HOME/cylc-run/" init_suite "${TEST_NAME_BASE}" <<< ' +[scheduling] + [[dependencies]] + graph = foo => bar => baz +[runtime] + [[root]] + script = sleep 15 +' + +create_test_globalrc '' " +${BASE_GLOBALRC} +[suite servers] + run hosts = ${CYLC_TEST_HOST1} +" + +job-ps-line() { + # line to grep for in ps listings to see if cylc background jobs are + # running + printf '/bin/bash.*log/job/1/%s/.*/job' "$1" +} + +cylc run "${SUITE_NAME}" +#------------------------------------------------------------------------------- +# auto stop-restart - normal mode: +# ensure the suite WAITS for local jobs to complete before restarting +TEST_NAME="${TEST_NAME_BASE}-normal-mode" + +cylc suite-state "${SUITE_NAME}" --task='foo' --status='running' --point=1 \ + --interval=1 --max-polls=20 >& $ERR + +# ensure that later tests aren't placebos +run_ok "${TEST_NAME}-ps-1" ps -fu "${USER}" +grep_ok "$(job-ps-line foo)" "${TEST_NAME}-ps-1.stdout" + +create_test_globalrc '' " +${BASE_GLOBALRC} +[suite servers] + run hosts = ${CYLC_TEST_HOST1}, ${CYLC_TEST_HOST2} + condemned hosts = ${CYLC_TEST_HOST1} +" + +FILE=$(cylc cat-log "${SUITE_NAME}" -m p |xargs readlink -f) +log_scan "${TEST_NAME}-stop" "${FILE}" 40 1 \ + 'The Cylc suite host will soon become un-available' \ + 'Waiting for jobs running on localhost to complete' \ + 'Waiting for jobs running on localhost to complete' \ 
+ 'Suite shutting down - REQUEST(NOW-NOW)' \ + "Attempting to restart on \"${CYLC_TEST_HOST2}\"" \ + "Suite now running on \"${CYLC_TEST_HOST2}\"" \ + +run_ok "${TEST_NAME}-ps-2" ps -fu "${USER}" +grep_fail "$(job-ps-line foo)" "${TEST_NAME}-ps-2.stdout" +grep_fail "$(job-ps-line bar)" "${TEST_NAME}-ps-2.stdout" + +poll test -f "${SUITE_RUN_DIR}/.service/contact" +FILE=$(cylc cat-log "${SUITE_NAME}" -m p |xargs readlink -f) +log_scan "${TEST_NAME}-restart" "${FILE}" 20 1 \ + "Suite starting: server=$(ssh "${CYLC_TEST_HOST2}" hostname -f)" +#------------------------------------------------------------------------------- +# auto stop-restart - force mode: +# ensure the suite DOESN'T WAIT for local jobs to complete before stopping +TEST_NAME="${TEST_NAME_BASE}-force-mode" + +cylc suite-state "${SUITE_NAME}" --task='bar' --status='running' --point=1 \ + --interval=1 --max-polls=20 >& $ERR + +create_test_globalrc '' " +${BASE_GLOBALRC} +[suite servers] + run hosts = ${CYLC_TEST_HOST1}, ${CYLC_TEST_HOST2} + condemned hosts = ${CYLC_TEST_HOST2}! +" + +log_scan "${TEST_NAME}-stop" "${FILE}" 40 1 \ + 'The Cylc suite host will soon become un-available' \ + 'This suite will be shutdown as the suite host is unable to continue' \ + 'Suite shutting down - REQUEST(NOW)' \ + +run_ok "${TEST_NAME}-ps-2" ssh ${CYLC_TEST_HOST2} ps -fu "${USER}" +grep_ok "$(job-ps-line bar)" "${TEST_NAME}-ps-2.stdout" + +cylc stop "${SUITE_NAME}" --now --now +poll test -f "${SUITE_RUN_DIR}/.service/contact" +sleep 1 +purge_suite "${SUITE_NAME}"
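
For illustration, a minimal standalone sketch of the `condemned hosts` entry convention described in the documentation hunk above, where a trailing `!` selects force mode. `condemned_host_mode` is a hypothetical helper invented for this sketch; in the patch itself the parsing happens inline in `suite_health_check`.

    # Sketch only: maps one `condemned hosts` entry to (hostname, mode).
    AUTO_STOP_RESTART_NORMAL = 'stop and restart'
    AUTO_STOP_RESTART_FORCE = 'stop'


    def condemned_host_mode(entry):
        """Return (hostname, mode) for one `condemned hosts` entry."""
        if entry.endswith('!'):
            # Trailing '!': force mode - stop only, leave local jobs running.
            return entry[:-1], AUTO_STOP_RESTART_FORCE
        # No suffix: normal mode - wait for local jobs, then stop and restart.
        return entry, AUTO_STOP_RESTART_NORMAL


    print(condemned_host_mode('foo'))   # ('foo', 'stop and restart')
    print(condemned_host_mode('bar!'))  # ('bar', 'stop')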
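Likewise, a small sketch of the staggered shutdown described by `auto restart delay` (each suite waits a random period between zero and the configured delay before stopping). `pick_auto_restart_time` is a made-up name for this sketch and the exact rounding used in `set_auto_restart` may differ.

    # Sketch only: stagger shutdowns so suites on a condemned host do not all
    # stop at the same moment.
    from random import random
    from time import time


    def pick_auto_restart_time(restart_delay):
        """Return a wallclock time at which to begin the automatic shutdown."""
        if restart_delay:
            # Random offset in [0, restart_delay) seconds.
            return time() + int(random() * restart_delay)
        return time()  # no delay configured: stop as soon as possible


    print(pick_auto_restart_time(30) - time())  # between 0 and 30 seconds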
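Finally, a self-contained sketch of the for/else check that `suite_shutdown` performs in normal mode before escalating the stop to REQUEST(NOW-NOW). The `Task` namedtuple and its attribute names are stand-ins for Cylc's task proxies, not the real objects.

    # Sketch only: the suite may stop/restart once no active task is running
    # locally under the background or at batch systems.
    from collections import namedtuple

    Task = namedtuple('Task', 'task_host batch_sys_name status')

    ACTIVE_STATUSES = {'submitted', 'running'}  # stand-in for TASK_STATUSES_ACTIVE
    LOCAL_BATCH_SYSTEMS = ('background', 'at')


    def can_stop_now(tasks):
        """Return True when no active task is running locally under background/at."""
        for task in tasks:
            if (task.task_host == 'localhost'
                    and task.batch_sys_name in LOCAL_BATCH_SYSTEMS
                    and task.status in ACTIVE_STATUSES):
                break  # this job would be orphaned by a restart - keep waiting
        else:
            return True  # loop found no active local job: safe to stop now
        return False


    tasks = [Task('localhost', 'background', 'running'),
             Task('hpc-login', 'pbs', 'running')]
    print(can_stop_now(tasks))  # False - wait for the local background job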