Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update STMP and PTMP settings in host file for Orion and Hercules #2614

2 changes: 1 addition & 1 deletion ci/cases/yamls/atmaerosnowDA_defaults_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ defaults:
base:
DOIAU: "NO"
DO_JEDISNOWDA: "YES"
ACCOUNT: {{ 'SLURM_ACCOUNT' | getenv }}
ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
2 changes: 1 addition & 1 deletion ci/cases/yamls/gefs_ci_defaults.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
defaults:
!INC {{ HOMEgfs }}/parm/config/gefs/yaml/defaults.yaml
base:
ACCOUNT: {{ 'SLURM_ACCOUNT' | getenv }}
HPC_ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
2 changes: 1 addition & 1 deletion ci/cases/yamls/gfs_defaults_ci.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
defaults:
!INC {{ HOMEgfs }}/parm/config/gfs/yaml/defaults.yaml
base:
ACCOUNT: {{ 'SLURM_ACCOUNT' | getenv }}
ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
2 changes: 1 addition & 1 deletion ci/cases/yamls/gfs_extended_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ defaults:
!INC {{ HOMEgfs }}/parm/config/gfs/yaml/defaults.yaml

base:
ACCOUNT: {{ 'SLURM_ACCOUNT' | getenv }}
ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
DO_GOES: "YES"
DO_BUFRSND: "YES"
DO_GEMPAK: "YES"
Expand Down
2 changes: 1 addition & 1 deletion ci/cases/yamls/soca_gfs_defaults_ci.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
defaults:
!INC {{ HOMEgfs }}/parm/config/gfs/yaml/defaults.yaml
base:
ACCOUNT: {{ 'SLURM_ACCOUNT' | getenv }}
ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
DO_JEDIOCNVAR: "YES"
2 changes: 1 addition & 1 deletion ci/cases/yamls/ufs_hybatmDA_defaults.ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ base:
DOIAU: "NO"
DO_JEDIATMVAR: "YES"
DO_JEDIATMENS: "YES"
ACCOUNT: {{ 'SLURM_ACCOUNT' | getenv }}
ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }}
atmanl:
LAYOUT_X_ATMANL: 4
LAYOUT_Y_ATMANL: 4
Expand Down
4 changes: 1 addition & 3 deletions ci/platforms/config.hera
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

export GFS_CI_ROOT=/scratch1/NCEPDEV/global/Terry.McGuinness/GFS_CI_ROOT
export ICSDIR_ROOT=/scratch1/NCEPDEV/global/glopara/data/ICSDIR
export STMP="/scratch1/NCEPDEV/stmp2/${USER}"
export PTMP="/scratch1/NCEPDEV/stmp2/${USER}"
export SLURM_ACCOUNT=nems
export HPC_ACCOUNT=nems
export max_concurrent_cases=5
export max_concurrent_pr=4
4 changes: 1 addition & 3 deletions ci/platforms/config.hercules
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

export GFS_CI_ROOT=/work2/noaa/stmp/GFS_CI_ROOT/HERCULES
export ICSDIR_ROOT=/work/noaa/global/glopara/data/ICSDIR
export STMP="/work2/noaa/stmp/${USER}/HERCULES"
export PTMP="/work2/noaa/stmp/${USER}/HERCULES"
export SLURM_ACCOUNT=nems
export HPC_ACCOUNT=nems
export max_concurrent_cases=5
export max_concurrent_pr=4
4 changes: 1 addition & 3 deletions ci/platforms/config.orion
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

export GFS_CI_ROOT=/work2/noaa/stmp/GFS_CI_ROOT/ORION
export ICSDIR_ROOT=/work/noaa/global/glopara/data/ICSDIR
export STMP="/work2/noaa/stmp/${USER}/ORION"
export PTMP="/work2/noaa/stmp/${USER}/ORION"
export SLURM_ACCOUNT=nems
export HPC_ACCOUNT=nems
export max_concurrent_cases=5
export max_concurrent_pr=4
4 changes: 1 addition & 3 deletions ci/platforms/config.wcoss2
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

export GFS_CI_ROOT=/lfs/h2/emc/global/noscrub/globalworkflow.ci/GFS_CI_ROOT
export ICSDIR_ROOT=/lfs/h2/emc/global/noscrub/emc.global/data/ICSDIR
export STMP="/lfs/h2/emc/stmp/${USER}"
export PTMP="/lfs/h2/emc/ptmp/${USER}"
export SLURM_ACCOUNT=GFS-DEV
export HPC_ACCOUNT=GFS-DEV
export max_concurrent_cases=5
export max_concurrent_pr=4
2 changes: 1 addition & 1 deletion ci/scripts/run-check_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ while true; do
rocotorun -v "${ROCOTO_VERBOSE:-0}" -w "${xml}" -d "${db}"

# Wait before running rocotostat
sleep 10
sleep 60

# Get job statistics
echo "Gather Rocoto statistics"
Expand Down
156 changes: 125 additions & 31 deletions ci/scripts/utils/rocotostat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,31 @@

import sys
import os
import copy
from time import sleep

from wxflow import Executable, which, Logger, CommandNotFoundError
from wxflow import which, Logger, CommandNotFoundError, ProcessError
from argparse import ArgumentParser, FileType

from collections import Counter

logger = Logger(level=os.environ.get("LOGGING_LEVEL", "DEBUG"), colored_log=False)


def attempt_multiple_times(expression, max_attempts, sleep_duration=0, exception_class=Exception):
attempt = 0
last_exception = None
while attempt < max_attempts:
try:
pass
return expression()
except exception_class as last_exception:
attempt += 1
sleep(sleep_duration)
else:
raise last_exception


def input_args():
"""
Parse command-line arguments.
Expand Down Expand Up @@ -39,66 +57,142 @@ def input_args():
return args


def rocoto_statcount():
"""
Run rocotostat and process its output.
def rocotostat_summary(rocotostat):
"""
rocoto_summary Run rocotostat and process its output.

args = input_args()
rocoto_summary(rocotostat) adds a default argument '--summary' to the rocotostat
command, runs it, and processes its output to return a dictionary with the total
number of cycles and the number of cycles marked as 'Done'.

try:
rocotostat = which("rocotostat")
except CommandNotFoundError:
logger.exception("rocotostat not found in PATH")
raise CommandNotFoundError("rocotostat not found in PATH")

rocotostat_all = which("rocotostat")
rocotostat.add_default_arg(['-w', os.path.abspath(args.w.name), '-d', os.path.abspath(args.d.name), '-s'])
rocotostat_all.add_default_arg(['-w', os.path.abspath(args.w.name), '-d', os.path.abspath(args.d.name), '-a'])
Input:
rocotostat - The rocotostat command.

rocotostat_output = rocotostat(output=str)
Output:
rocoto_status - A dictionary with the total number of cycles and the number of cycles marked as 'Done'.
"""
rocotostat = copy.deepcopy(rocotostat)
rocotostat.add_default_arg('--summary')
rocotostat_output = attempt_multiple_times(lambda: rocotostat(output=str), 3, 90, ProcessError)
rocotostat_output = rocotostat_output.splitlines()[1:]
rocotostat_output = [line.split()[0:2] for line in rocotostat_output]

rocotostat_output_all = rocotostat_all(output=str)
rocotostat_output_all = rocotostat_output_all.splitlines()[1:]
rocotostat_output_all = [line.split()[0:4] for line in rocotostat_output_all]
rocotostat_output_all = [line for line in rocotostat_output_all if len(line) != 1]

rocoto_status = {
'CYCLES_TOTAL': len(rocotostat_output),
'CYCLES_DONE': sum([sublist.count('Done') for sublist in rocotostat_output])
}
return rocoto_status


def rocoto_statcount(rocotostat):
"""
rocoto_statcount Run rocotostat and process its output.

rocoto_statcount(rocotostat) adds a default argument '--all' to the rocotostat
command, runs it, and processes its output to return a dictionary with the count
of each status case.

Input:
rocotostat - The rocotostat command.

Output:
rocoto_status - A dictionary with the count of each status case.
"""

rocotostat = copy.deepcopy(rocotostat)
rocotostat.add_default_arg('--all')

rocotostat_output = attempt_multiple_times(lambda: rocotostat(output=str), 4, 120, ProcessError)
rocotostat_output = rocotostat_output.splitlines()[1:]
rocotostat_output = [line.split()[0:4] for line in rocotostat_output]
rocotostat_output = [line for line in rocotostat_output if len(line) != 1]

status_cases = ['SUCCEEDED', 'FAIL', 'DEAD', 'RUNNING', 'SUBMITTING', 'QUEUED']

rocoto_status = {}
status_counts = Counter(case for sublist in rocotostat_output for case in sublist)
for case in status_cases:
rocoto_status[case] = sum([sublist.count(case) for sublist in rocotostat_output_all])
rocoto_status[case] = status_counts[case]

return rocoto_status


def is_done(rocoto_status):
"""
is_done Check if all cycles are done.

is_done(rocoto_status) checks if the total number of cycles equals the number of
done cycles in the rocoto_status dictionary.

Input:
rocoto_status - A dictionary with the count of each status case.

Output:
boolean - True if all cycles are done, False otherwise.
"""

if rocoto_status['CYCLES_TOTAL'] == rocoto_status['CYCLES_DONE']:
return True
else:
return False


def is_stalled(rocoto_status):
"""
is_stalled Check if all cycles are stalled.

is_stalled(rocoto_status) checks if all cycles are stalled by verifying if
there are no jobs that are RUNNING, SUBMITTING, or QUEUED.

Input:
rocoto_status - A dictionary with the count of each status case.

Output:
boolean - True if all cycles are stalled, False otherwise.
"""

if rocoto_status['RUNNING'] + rocoto_status['SUBMITTING'] + rocoto_status['QUEUED'] == 0:
return True
else:
return False


if __name__ == '__main__':
"""
main Execute the script.

main() parses the input arguments, checks if the rocotostat command is available,
adds default arguments to the rocotostat command, and runs it and reports
out to stdout spcific information of rocoto workflow.
"""

args = input_args()

error_return = 0
rocoto_status = rocoto_statcount()
try:
rocotostat = which("rocotostat")
except CommandNotFoundError:
logger.exception("rocotostat not found in PATH")
raise CommandNotFoundError("rocotostat not found in PATH")

if rocoto_status['CYCLES_TOTAL'] == rocoto_status['CYCLES_DONE']:
rocotostat.add_default_arg(['-w', os.path.abspath(args.w.name), '-d', os.path.abspath(args.d.name)])

rocoto_status = rocoto_statcount(rocotostat)
rocoto_status.update(rocotostat_summary(rocotostat))

error_return = 0
if is_done(rocoto_status):
rocoto_state = 'DONE'
elif rocoto_status['DEAD'] > 0:
error_return = rocoto_status['FAIL'] + rocoto_status['DEAD']
rocoto_state = 'FAIL'
elif 'UNKNOWN' in rocoto_status:
error_return = rocoto_status['UNKNOWN']
rocoto_state = 'UNKNOWN'
elif rocoto_status['RUNNING'] + rocoto_status['SUBMITTING'] + rocoto_status['QUEUED'] == 0:
#
# TODO for now a STALLED state will be just a warning as it can
# produce a false negative if there is a timestamp on a file dependency.
#
# error_return = -3
rocoto_state = 'STALLED'
elif is_stalled(rocoto_status):
rocoto_status = attempt_multiple_times(rocoto_statcount(rocotostat), 2, 120, ProcessError)
if is_stalled(rocoto_status):
error_return = 3
rocoto_state = 'STALLED'
else:
rocoto_state = 'RUNNING'

Expand Down
4 changes: 2 additions & 2 deletions workflow/hosts/hercules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ BASE_CPLIC: '/work/noaa/global/glopara/data/ICSDIR/prototype_ICs'
PACKAGEROOT: '/work/noaa/global/glopara/nwpara'
COMINsyn: '/work/noaa/global/glopara/com/gfs/prod/syndat'
HOMEDIR: '/work/noaa/global/${USER}'
STMP: '/work/noaa/stmp/${USER}'
PTMP: '/work/noaa/stmp/${USER}'
STMP: '/work/noaa/stmp/${USER}/HERCULES'
PTMP: '/work/noaa/stmp/${USER}/HERCULES'
NOSCRUB: $HOMEDIR
SCHEDULER: slurm
ACCOUNT: fv3-cpu
Expand Down
4 changes: 2 additions & 2 deletions workflow/hosts/orion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ BASE_CPLIC: '/work/noaa/global/glopara/data/ICSDIR/prototype_ICs'
PACKAGEROOT: '/work/noaa/global/glopara/nwpara'
COMINsyn: '/work/noaa/global/glopara/com/gfs/prod/syndat'
HOMEDIR: '/work/noaa/global/${USER}'
STMP: '/work/noaa/stmp/${USER}'
PTMP: '/work/noaa/stmp/${USER}'
STMP: '/work/noaa/stmp/${USER}/ORION'
PTMP: '/work/noaa/stmp/${USER}/ORION'
NOSCRUB: $HOMEDIR
SCHEDULER: slurm
ACCOUNT: fv3-cpu
Expand Down
Loading