Skip to content

Commit

Permalink
auto stop-restart: handle jobs running on localhost
Browse files Browse the repository at this point in the history
  • Loading branch information
oliver-sanders committed Nov 8, 2018
1 parent c3c3bc1 commit ddb6334
Show file tree
Hide file tree
Showing 5 changed files with 215 additions and 38 deletions.
30 changes: 22 additions & 8 deletions doc/src/cylc-user-guide/cug.tex
Original file line number Diff line number Diff line change
Expand Up @@ -8236,13 +8236,27 @@ \subsection{Auto Stop-Restart}
\item \lstinline=[run hosts][suite servers]run hosts=
\end{myitemize}
When a host is added to the \lstinline=condemned hosts= list in,
any suites will automatically shutdown then restart selecting a new host from
\lstinline=run hosts=. If a host is suffixed with an exclamation
mark then Cylc will not attempt to automatically restart the suite.
The auto stop-restart feature has two modes:
\begin{description}
\item[Normal Mode] \hfill
When a host is added to the \lstinline=condemned hosts= list, any suites
running on that host will automatically shutdown then restart selecting a
new host from \lstinline=run hosts=.
For safety, before attempting to stop the suite, Cylc will first wait for any
jobs running locally (under background or at) to complete.
\item[Force Mode] \hfill
If a host is suffixed with an exclamation mark then Cylc will not attempt
to automatically restart the suite and any local jobs (running under
background or at) will be left running.
\end{description}
For example in the following configuration any suites running on
lstinline=foo= will attempt to restart on \lstinline=pub= whereas any suites
\lstinline=foo= will attempt to restart on \lstinline=pub= whereas any suites
running on \lstinline=bar= will stop immediately, making no attempt to restart.
\begin{lstlisting}
Expand All @@ -8256,10 +8270,10 @@ \subsection{Auto Stop-Restart}
Suites will wait for a random period of time between zero and
\lstinline=auto restart delay= seconds before attempting to stop and restart.
This auto shutdown-restart functionality can only operate provided that the
user hasn't specified any behaviour which is not preserved by
At present the auto shutdown-restart functionality can only operate provided
that the user hasn't specified any behaviour which is not preserved by
\lstinline=cylc restart= (e.g.\ user specified hold point or run mode). This
caveat will be removed in a future version, at present Cylc will not attempt to
caveat will be removed in a future version; currently Cylc will not attempt to
auto shutdown-restart suites which meet this criterion but will log a critical
error message to alert the user.
Expand Down
5 changes: 3 additions & 2 deletions lib/cylc/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,9 @@ def construct_ssh_cmd(raw_cmd, user=None, host=None, forward_x11=False,
user_at_host += 'localhost'
command.append(user_at_host)

# Pass cylc version (and optionally UTC mode) through.
command += ['env', quote(r'CYLC_VERSION=%s' % CYLC_VERSION)]
# Pass CYLC_VERSION, CYLC_CONF_PATH and optionally CYLC_UTC through.
command += ['env', quote(r'CYLC_VERSION=%s' % CYLC_VERSION),
quote('CYLC_CONF_PATH=%s' % os.environ.get('CYLC_CONF_PATH'))]
if set_UTC and os.getenv('CYLC_UTC') in ["True", "true"]:
command.append(quote(r'CYLC_UTC=True'))
command.append(quote(r'TZ=UTC'))
Expand Down
80 changes: 52 additions & 28 deletions lib/cylc/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ class Scheduler(object):
START_MESSAGE_TMPL = (
START_MESSAGE_PREFIX + 'server=%(host)s:%(port)s pid=%(pid)s')

AUTO_STOP_RESTART_NORMAL = 'stop and restart'
AUTO_STOP_RESTART_FORCE = 'stop'

# Dependency negotiation etc. will run after these commands
PROC_CMDS = (
'release_suite',
Expand Down Expand Up @@ -217,7 +220,7 @@ def __init__(self, is_restart, options, args):

# health check settings
self.time_next_health_check = None
self.auto_restart = False
self.auto_restart_mode = None
self.auto_restart_time = None

def start(self):
Expand Down Expand Up @@ -247,7 +250,7 @@ def start(self):
except SchedulerStop as exc:
# deliberate stop
self.shutdown(exc)
if self.auto_restart:
if self.auto_restart_mode == self.AUTO_STOP_RESTART_NORMAL:
self.suite_auto_restart()

except SchedulerError as exc:
Expand Down Expand Up @@ -1216,7 +1219,7 @@ def process_task_pool(self):
self._get_events_conf('reset inactivity timer')):
self.set_suite_inactivity_timer()
self.pool.match_dependencies()
if self.stop_mode is None:
if self.stop_mode is None and self.auto_restart_time is None:
itasks = self.pool.get_ready_tasks()
if itasks:
cylc.flags.iflag = True
Expand Down Expand Up @@ -1339,11 +1342,32 @@ def suite_shutdown(self):
self.command_kill_tasks()
self.time_next_kill = time() + self.INTERVAL_STOP_KILL

# Is the suite set to auto restart now?
if (self.auto_restart_time is not None and
time() > self.auto_restart_time):
self.auto_restart = True
self._set_stop(TaskPool.STOP_REQUEST_NOW_NOW)
# Is the suite set to auto stop [+restart] now ...
if (self.auto_restart_time is None or time() < self.auto_restart_time):
# ... no
pass
elif self.auto_restart_mode == self.AUTO_STOP_RESTART_NORMAL:
# ... yes - wait for local jobs to complete before restarting
# * Avoid polling issues see #2843
# * Ensure the host can be safely taken down once suites have stopped.
for itask in self.pool.get_tasks():
if (
itask.task_host == 'localhost' and
itask.summary['batch_sys_name'] in ['background', 'at'] and
itask.state.status in TASK_STATUSES_ACTIVE
):
LOG.info('Waiting for jobs running on localhost to '
'complete before attempting restart')
break
else:
self._set_stop(TaskPool.STOP_REQUEST_NOW_NOW)
elif self.auto_restart_mode == self.AUTO_STOP_RESTART_FORCE:
# ... yes - leave local jobs running then stop the suite
# (no restart)
self._set_stop(TaskPool.STOP_REQUEST_NOW)
else:
raise SchedulerError('Invalid auto_restart_mode=%s' %
self.auto_restart_mode)

def suite_auto_restart(self, max_retries=3):
"""Attempt to restart the suite assuming it has already stopped."""
Expand Down Expand Up @@ -1373,7 +1397,8 @@ def suite_auto_restart(self, max_retries=3):
'manual restart required.' % max_retries)
return False

def set_auto_restart(self, restart_delay=None):
def set_auto_restart(self, restart_delay=None,
mode=AUTO_STOP_RESTART_NORMAL):
"""Configure the suite to automatically stop and restart.
Restart handled by `suite_auto_restart`.
Expand All @@ -1383,6 +1408,7 @@ def set_auto_restart(self, restart_delay=None):
Suite will wait a random period between 0 and
`restart_delay` seconds before attempting to stop/restart in
order to avoid multiple suites restarting simultaneously.
mode (str): Auto stop-restart mode.
Return:
bool: False if it is not possible to automatically stop/restart
Expand All @@ -1392,17 +1418,13 @@ def set_auto_restart(self, restart_delay=None):
if self.stop_mode or self.auto_restart_time is not None:
return True

# Check the suite is auto-restartable see #2799.
if (
not self.can_auto_stop or
self.options.final_point_string or
self.pool_hold_point or
self.run_mode != 'live' or
self.stop_clock_time or
self.stop_point or
self.stop_task
):
LOG.critical('Suite cannot automatically restart.')
# Force mode, stop the suite now, don't restart it.
if mode == self.AUTO_STOP_RESTART_FORCE:
self.auto_restart_time = time()
self.auto_restart_mode = mode
return True

if not self.can_auto_restart():
return False

LOG.info('Suite will automatically restart on a new host.')
Expand All @@ -1417,10 +1439,14 @@ def set_auto_restart(self, restart_delay=None):
self.auto_restart_time = shutdown_time
else:
self.auto_restart_time = time()

self.auto_restart_mode = self.AUTO_STOP_RESTART_NORMAL

return True

def can_auto_restart(self):
"""Determine whether this suite can safely auto stop-restart."""
# Check the suite is auto-restartable see #2799.
ret = ['Incompatible configuration: "%s"' % key for key, value in [
('can_auto_stop', not self.can_auto_stop),
('final_point', self.options.final_point_string),
Expand All @@ -1432,6 +1458,7 @@ def can_auto_restart(self):
('stop_task', self.stop_task)
] if value]

# Check whether there is currently an available host to restart on.
try:
HostAppointer(cached=False).appoint_host()
except EmptyHostList:
Expand All @@ -1458,7 +1485,6 @@ def suite_health_check(self, has_changes):
4. Suite contact file has the right info?
"""

# 1. check if suite is stalled - if so call handler if defined
if self.stop_mode is None and not has_changes:
self.check_suite_stalled()
Expand All @@ -1472,17 +1498,17 @@ def suite_health_check(self, has_changes):
current_glbl_cfg = glbl_cfg(cached=False)
for host in current_glbl_cfg.get(['suite servers',
'condemned hosts']):
force_kill = False
mode = self.AUTO_STOP_RESTART_NORMAL
if host.endswith('!'):
# host ends in an `!` -> force shutdown mode
force_kill = True
mode = self.AUTO_STOP_RESTART_FORCE
host = host[:-1]

if get_fqdn_by_host(host) == self.host:
# this host is condemned, take the appropriate action
LOG.info('The Cylc suite host will soon become '
'un-available.')
if force_kill:
if mode == self.AUTO_STOP_RESTART_FORCE:
# server is condemned in "force" mode -> stop
# the suite, don't attempt to restart
LOG.critical(
Expand All @@ -1491,10 +1517,9 @@ def suite_health_check(self, has_changes):
'When another suite host becomes available '
'the suite can be restarted by:\n'
' $ cylc restart %s' % self.suite)
self._set_stop(TaskPool.STOP_REQUEST_NOW)
return # skip remaining health checks
if self.set_auto_restart(mode=mode):
return # skip remaining health checks
elif (
self.can_auto_restart() and
self.set_auto_restart(current_glbl_cfg.get(
['suite servers', 'auto restart delay']))
):
Expand Down Expand Up @@ -1542,7 +1567,6 @@ def update_profiler_logs(self, tinit):

def run(self):
"""Main loop."""

self.initialise_scheduler()
while True: # MAIN LOOP
tinit = time()
Expand Down
15 changes: 15 additions & 0 deletions tests/lib/bash/test_header
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@
# (stdin if FILE_CONTROL is "-" or missing).
# grep_ok PATTERN FILE
# Run "grep -q -e PATTERN FILE".
# grep_fail PATTERN FILE
# Run "grep -v -q -e PATTERN FILE".
# count_ok PATTERN FILE COUNT
# Test that PATTERN occurs in exactly COUNT lines of FILE.
# exists_ok FILE
Expand Down Expand Up @@ -337,6 +339,19 @@ grep_ok() {
fail "${TEST_NAME}"
}

grep_fail() {
    # Test that no line of FILE matches PATTERN (the inverse of grep_ok).
    #
    # Arguments:
    #   $1 - PATTERN: regular expression handed to "grep -e".
    #   $2 - FILE: path of the file to search.
    #
    # Calls "ok" when grep reports no matching line (exit status 1).
    # Otherwise writes a diagnostic to
    # "${TEST_LOG_DIR}/${TEST_NAME}.stderr" and calls "fail"; this includes
    # the case where grep itself errors (exit status >= 2, e.g. FILE is
    # missing), so an unreadable file is never reported as a pass.
    local BRE="$1"
    local FILE="$2"
    # Use a distinct suffix so the name does not collide with grep_ok's
    # "-grep-ok" test name for the same file.
    local TEST_NAME="$(basename "${FILE}")-grep-fail"
    # NOTE: do not use "grep -v -q" here - it succeeds as soon as *any* line
    # does not match, even when other lines do contain the pattern.
    # The "|| STATUS=$?" form keeps this safe if errexit is active.
    local STATUS=0
    grep -q -e "${BRE}" "${FILE}" || STATUS=$?
    if [[ "${STATUS}" -eq 1 ]]; then
        ok "${TEST_NAME}"
        return
    fi
    mkdir -p "${TEST_LOG_DIR}"
    if [[ "${STATUS}" -eq 0 ]]; then
        echo "Found ${BRE} in ${FILE}" >"${TEST_LOG_DIR}/${TEST_NAME}.stderr"
    else
        echo "grep exited with status ${STATUS} searching ${FILE}" \
            >"${TEST_LOG_DIR}/${TEST_NAME}.stderr"
    fi
    fail "${TEST_NAME}"
}

exists_ok() {
local FILE=$1
local TEST_NAME="$(basename "${FILE}")-file-exists-ok"
Expand Down
Loading

0 comments on commit ddb6334

Please sign in to comment.